2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.193 2000/04/20 14:41:16 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
25 * Pedro Roque : Fast Retransmit/Recovery.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
60 #include <linux/config.h>
62 #include <linux/sysctl.h>
64 #include <net/inet_common.h>
65 #include <linux/ipsec.h>
68 #define SYNC_INIT 0 /* let the user enable it */
73 /* These are on by default so the code paths get tested.
74 * For the final 2.2 this may be undone at our discretion. -DaveM
76 int sysctl_tcp_timestamps
= 1;
77 int sysctl_tcp_window_scaling
= 1;
78 int sysctl_tcp_sack
= 1;
80 int sysctl_tcp_syncookies
= SYNC_INIT
;
81 int sysctl_tcp_stdurg
;
82 int sysctl_tcp_rfc1337
;
83 int sysctl_tcp_tw_recycle
= 1;
84 int sysctl_tcp_abort_on_overflow
= 0;
85 int sysctl_tcp_max_orphans
= NR_FILE
;
86 int sysctl_tcp_max_tw_buckets
= NR_FILE
*2;
88 static int prune_queue(struct sock
*sk
);
91 * Adapt the MSS value used to make delayed ack decision to the
94 * The constant 536 hasn't any good meaning. In IPv4 world
95 * MTU may be smaller, though it contradicts to RFC1122, which
96 * states that MSS must be at least 536.
97 * We use the constant to do not ACK each second
98 * packet in a stream of tiny size packets.
99 * It means that super-low mtu links will be aggressively delacked.
100 * Seems, it is even good. If they have so low mtu, they are weirdly
103 * AK: BTW it may be useful to add an option to lock the rcv_mss.
104 * this way the beowulf people wouldn't need ugly patches to get the
105 * ack frequencies they want and it would be an elegant way to tune delack.
107 static __inline__
void tcp_measure_rcv_mss(struct tcp_opt
*tp
, struct sk_buff
*skb
)
109 unsigned int len
, lss
;
111 lss
= tp
->ack
.last_seg_size
;
112 tp
->ack
.last_seg_size
= 0;
114 /* skb->len may jitter because of SACKs, even if peer
115 * sends good full-sized frames.
118 if (len
>= tp
->ack
.rcv_mss
) {
119 tp
->ack
.rcv_mss
= len
;
121 /* Otherwise, we make more careful check taking into account,
122 * that SACKs block is variable.
124 * "len" is invariant segment length, including TCP header.
126 len
= skb
->tail
- skb
->h
.raw
;
127 if (len
>= TCP_MIN_RCVMSS
+ sizeof(struct tcphdr
)) {
128 /* Subtract also invariant (if peer is RFC compliant),
129 * tcp header plus fixed timestamp option length.
130 * Resulting "len" is MSS free of SACK jitter.
132 len
-= tp
->tcp_header_len
;
134 tp
->ack
.rcv_mss
= len
;
135 tp
->ack
.last_seg_size
= len
;
141 static __inline__
void tcp_enter_quickack_mode(struct tcp_opt
*tp
)
143 unsigned quickacks
= tcp_receive_window(tp
)/(2*tp
->ack
.rcv_mss
);
145 tp
->ack
.quick
= max(min(quickacks
, 127), 1);
147 if (!tp
->tstamp_ok
&& tp
->ack
.quick
>2) {
148 /* Quick ACKs are _dangerous_, if RTTM is not used.
149 * See comment in tcp_init_metrics(). We still help
150 * them to overcome the most difficult, initial
151 * phase of slow start.
157 /* Send ACKs quickly, if "quick" count is not exhausted
158 * and the session is not interactive.
/* True when ACKs should go out immediately: quick-ACK credit is left
 * and the session is not interactive (pingpong off).
 * NOTE(review): listing truncated -- the body braces (original lines
 * 162/164) are missing from this extract.
 */
161 static __inline__
int tcp_in_quickack_mode(struct tcp_opt
*tp
)
163 return (tp
->ack
.quick
&& !tp
->ack
.pingpong
);
166 /* There is something which you must keep in mind when you analyze the
167 * behavior of the tp->ato delayed ack timeout interval. When a
168 * connection starts up, we want to ack as quickly as possible. The
169 * problem is that "good" TCP's do slow start at the beginning of data
170 * transmission. The means that until we send the first few ACK's the
171 * sender will sit on his end and only queue most of his data, because
172 * he can only send snd_cwnd unacked packets at any given time. For
173 * each ACK we send, he increments snd_cwnd and transmits more of his
/* Per-data-segment receive bookkeeping: updates the measured peer MSS,
 * maintains the delayed-ACK state (quick-ACK mode, pingpong flag, the
 * ato interval) and records the receive timestamp in ack.lrcvtime.
 * NOTE(review): this extract is a corrupted numbered listing -- several
 * original lines (e.g. 177-179, 181-184, 186-187, 190-191, 194,
 * 197-198, 200, 202, 207-208, 211, 213-216, 219-220, 222-223) are
 * missing, so the exact branch structure cannot be confirmed here.
 */
176 static void tcp_event_data_recv(struct tcp_opt
*tp
, struct sk_buff
*skb
)
180 tcp_measure_rcv_mss(tp
, skb
);
185 now
= tcp_time_stamp
;
188 /* The _first_ data packet received, initialize
189 * delayed ACK engine.
192 /* Help sender leave slow start quickly. */
193 tcp_enter_quickack_mode(tp
);
195 /* Pingpong is off, session is not interactive by default */
196 tp
->ack
.pingpong
= 0;
199 tp
->ack
.ato
= TCP_ATO_MIN
;
201 int m
= now
- tp
->ack
.lrcvtime
;
203 if (m
> TCP_ATO_MAX
/2) {
204 /* Do not touch ATO, if interval is out of bounds.
205 * It will be deflated by delack timer, if our peer
206 * really sends too rarely.
209 /* Too long gap. Apparently sender failed to
210 * restart window, so that we send ACKs quickly.
212 tcp_enter_quickack_mode(tp
);
217 if (m
<= tp
->ack
.ato
)
218 tp
->ack
.ato
= (tp
->ack
.ato
>> 1) + m
;
221 tp
->ack
.lrcvtime
= now
;
224 /* Called to compute a smoothed rtt estimate. The data fed to this
225 * routine either comes from timestamps, or from segments that were
226 * known _not_ to have been retransmitted [see Karn/Partridge
227 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
228 * piece by Van Jacobson.
229 * NOTE: the next three routines used to be one big routine.
230 * To save cycles in the RFC 1323 implementation it was better to break
231 * it up into three procedures. -- erics
234 static __inline__
void tcp_rtt_estimator(struct tcp_opt
*tp
, __u32 mrtt
)
236 long m
= mrtt
; /* RTT */
238 /* The following amusing code comes from Jacobson's
239 * article in SIGCOMM '88. Note that rtt and mdev
240 * are scaled versions of rtt and mean deviation.
241 * This is designed to be as fast as possible
242 * m stands for "measurement".
244 * On a 1990 paper the rto value is changed to:
245 * RTO = rtt + 4 * mdev
250 m
-= (tp
->srtt
>> 3); /* m is now error in rtt est */
251 tp
->srtt
+= m
; /* rtt = 7/8 rtt + 1/8 new */
253 m
= -m
; /* m is now abs(error) */
254 m
-= (tp
->mdev
>> 2); /* similar update on mdev */
255 tp
->mdev
+= m
; /* mdev = 3/4 mdev + 1/4 new */
257 /* no previous measure. */
258 tp
->srtt
= m
<<3; /* take the measured time to be rtt */
259 tp
->mdev
= m
<<2; /* make sure rto = 3*rtt */
263 /* Calculate rto without backoff. This is the second half of Van Jacobson's
264 * routine referred to above.
/* Compute the retransmission timeout from the smoothed estimates:
 * rto = srtt/8 + mdev (second half of Van Jacobson's SIGCOMM '88
 * routine, per the comment block above), then inflated by a
 * cwnd-dependent term the original author already flags as suspect.
 * NOTE(review): listing truncated -- original lines 268, 272 and the
 * closing brace are missing from this extract.
 */
267 static __inline__
void tcp_set_rto(struct tcp_opt
*tp
)
269 tp
->rto
= (tp
->srtt
>> 3) + tp
->mdev
;
270 /* I am not enough educated to understand this magic.
271 * However, it smells bad. snd_cwnd>31 is common case.
273 tp
->rto
+= (tp
->rto
>> 2) + (tp
->rto
>> (tp
->snd_cwnd
-1));
277 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
278 * on packet lifetime in the internet. We need the HZ/5 lower
279 * bound to behave correctly against BSD stacks with a fixed
281 * FIXME: It's not entirely clear this lower bound is the best
282 * way to avoid the problem. Is it possible to drop the lower
283 * bound and still avoid trouble with BSD stacks? Perhaps
284 * some modification to the RTO calculation that takes delayed
285 * ack bias into account? This needs serious thought. -- erics
/* Clamp the computed rto into [TCP_RTO_MIN, TCP_RTO_MAX]; see the
 * rationale in the comment block above (BSD-interop lower bound,
 * packet-lifetime upper bound).
 * NOTE(review): listing truncated -- the braces (original lines
 * 288/293) are missing from this extract.
 */
287 static __inline__
void tcp_bound_rto(struct tcp_opt
*tp
)
289 if (tp
->rto
< TCP_RTO_MIN
)
290 tp
->rto
= TCP_RTO_MIN
;
291 else if (tp
->rto
> TCP_RTO_MAX
)
292 tp
->rto
= TCP_RTO_MAX
;
295 /* Save metrics learned by this TCP session.
296 This function is called only when TCP finishes successfully,
297 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
299 static void tcp_update_metrics(struct sock
*sk
)
301 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
302 struct dst_entry
*dst
= __sk_dst_get(sk
);
306 if (dst
&& (dst
->flags
&DST_HOST
)) {
309 if (tp
->backoff
|| !tp
->srtt
) {
310 /* This session failed to estimate rtt. Why?
311 * Probably, no packets returned in time.
314 if (!(dst
->mxlock
&(1<<RTAX_RTT
)))
319 m
= dst
->rtt
- tp
->srtt
;
321 /* If newly calculated rtt larger than stored one,
322 * store new one. Otherwise, use EWMA. Remember,
323 * rtt overestimation is always better than underestimation.
325 if (!(dst
->mxlock
&(1<<RTAX_RTT
))) {
332 if (!(dst
->mxlock
&(1<<RTAX_RTTVAR
))) {
336 /* Scale deviation to rttvar fixed point */
341 if (m
>= dst
->rttvar
)
344 dst
->rttvar
-= (dst
->rttvar
- m
)>>2;
347 if (tp
->snd_ssthresh
== 0x7FFFFFFF) {
348 /* Slow start still did not finish. */
350 !(dst
->mxlock
&(1<<RTAX_SSTHRESH
)) &&
351 tp
->snd_cwnd
> dst
->ssthresh
)
352 dst
->ssthresh
= tp
->snd_cwnd
;
353 if (!(dst
->mxlock
&(1<<RTAX_CWND
)) &&
354 tp
->snd_cwnd
> dst
->cwnd
)
355 dst
->cwnd
= tp
->snd_cwnd
;
356 } else if (tp
->snd_cwnd
>= tp
->snd_ssthresh
&& !tp
->high_seq
) {
357 /* Cong. avoidance phase, cwnd is reliable. */
358 if (!(dst
->mxlock
&(1<<RTAX_SSTHRESH
)))
359 dst
->ssthresh
= tp
->snd_cwnd
;
360 if (!(dst
->mxlock
&(1<<RTAX_CWND
)))
361 dst
->cwnd
= (dst
->cwnd
+ tp
->snd_cwnd
)>>1;
363 /* Else slow start did not finish, cwnd is non-sense,
364 ssthresh may be also invalid.
366 if (!(dst
->mxlock
&(1<<RTAX_CWND
)))
367 dst
->cwnd
= (dst
->cwnd
+ tp
->snd_ssthresh
)>>1;
369 !(dst
->mxlock
&(1<<RTAX_SSTHRESH
)) &&
370 tp
->snd_ssthresh
> dst
->ssthresh
)
371 dst
->ssthresh
= tp
->snd_ssthresh
;
376 /* Initialize metrics on socket. */
378 static void tcp_init_metrics(struct sock
*sk
)
380 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
381 struct dst_entry
*dst
= __sk_dst_get(sk
);
388 if (dst
->mxlock
&(1<<RTAX_CWND
))
389 tp
->snd_cwnd_clamp
= dst
->cwnd
;
391 tp
->snd_ssthresh
= dst
->ssthresh
;
392 if (tp
->snd_ssthresh
> tp
->snd_cwnd_clamp
)
393 tp
->snd_ssthresh
= tp
->snd_cwnd_clamp
;
399 if (!tp
->srtt
&& dst
->rtt
< (TCP_TIMEOUT_INIT
<<3))
402 /* Initial rtt is determined from SYN,SYN-ACK.
403 * The segment is small and rtt may appear much
404 * less than real one. Use per-dst memory
405 * to make it more realistic.
407 * A bit of theory. RTT is time passed after "normal" sized packet
408 * is sent until it is ACKed. In normal curcumstances sending small
409 * packets force peer to delay ACKs and calculation is correct too.
410 * The algorithm is adaptive and, provided we follow specs, it
411 * NEVER underestimate RTT. BUT! If peer tries to make some clever
412 * tricks sort of "quick acks" for time long enough to decrease RTT
413 * to low value, and then abruptly stops to do it and starts to delay
414 * ACKs, wait for troubles.
416 if (dst
->rtt
> tp
->srtt
)
418 if (dst
->rttvar
> tp
->mdev
)
419 tp
->mdev
= dst
->rttvar
;
422 if (tp
->rto
< TCP_TIMEOUT_INIT
&& !tp
->saw_tstamp
)
424 tp
->snd_cwnd
= tcp_init_cwnd(tp
);
429 /* Play conservative. If timestamps are not
430 * supported, TCP will fail to recalculate correct
431 * rtt, if initial rto is too small. FORGET ALL AND RESET!
433 if (!tp
->saw_tstamp
&& tp
->srtt
) {
435 tp
->mdev
= TCP_TIMEOUT_INIT
;
436 tp
->rto
= TCP_TIMEOUT_INIT
;
440 /* WARNING: this must not be called if tp->saw_tstamp was false. */
441 extern __inline__
void
442 tcp_replace_ts_recent(struct sock
*sk
, struct tcp_opt
*tp
, u32 seq
)
444 if (!after(seq
, tp
->rcv_wup
)) {
445 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
446 * extra check below makes sure this can only happen
447 * for pure ACK frames. -DaveM
449 * Not only, also it occurs for expired timestamps
450 * and RSTs with bad timestamp option. --ANK
453 if((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) >= 0 ||
454 xtime
.tv_sec
>= tp
->ts_recent_stamp
+ TCP_PAWS_24DAYS
) {
455 tp
->ts_recent
= tp
->rcv_tsval
;
456 tp
->ts_recent_stamp
= xtime
.tv_sec
;
461 extern __inline__
int tcp_paws_discard(struct tcp_opt
*tp
, struct sk_buff
*skb
)
463 return ((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) < 0 &&
464 xtime
.tv_sec
< tp
->ts_recent_stamp
+ TCP_PAWS_24DAYS
466 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
468 I cannot see quitely as all the idea behind PAWS
471 The problem is only in reordering duplicate ACKs.
472 Hence, we can check this rare case more carefully.
474 1. Check that it is really duplicate ACK (ack==snd_una)
475 2. Give it some small "replay" window (~RTO)
477 We do not know units of foreign ts values, but make conservative
478 assumption that they are >=1ms. It solves problem
479 noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
481 && (TCP_SKB_CB(skb
)->seq
!= TCP_SKB_CB(skb
)->end_seq
||
482 TCP_SKB_CB(skb
)->ack_seq
!= tp
->snd_una
||
484 (s32
)(tp
->ts_recent
- tp
->rcv_tsval
) > (tp
->rto
*1024)/HZ
));
/* Slow-path acceptability check of a segment [seq, end_seq) against
 * the receive window ending at end_window = rcv_wup + rcv_wnd.
 * NOTE(review): listing truncated -- original lines 489, 493, 495-497,
 * 500, 502 and 504-505 (incl. #else/#endif and part of the main
 * condition/returns) are missing, so the full predicate cannot be
 * read from this extract.
 */
488 static int __tcp_sequence(struct tcp_opt
*tp
, u32 seq
, u32 end_seq
)
490 u32 end_window
= tp
->rcv_wup
+ tp
->rcv_wnd
;
491 #ifdef TCP_FORMAL_WINDOW
492 u32 rcv_wnd
= tcp_receive_window(tp
);
494 u32 rcv_wnd
= tp
->rcv_wnd
;
498 after(end_seq
, tp
->rcv_nxt
) &&
499 before(seq
, end_window
))
501 if (seq
!= end_window
)
503 return (seq
== end_seq
);
506 /* This function checks to see if the tcp header is actually acceptable. */
/* Fast path for the common in-order case (seq == rcv_nxt); defers to
 * __tcp_sequence() otherwise.
 * NOTE(review): listing truncated -- original lines 508, 511, 513, 516
 * and 518 (incl. #else/#endif and the closing brace) are missing from
 * this extract.
 */
507 extern __inline__
int tcp_sequence(struct tcp_opt
*tp
, u32 seq
, u32 end_seq
)
509 #ifdef TCP_FORMAL_WINDOW
510 u32 rcv_wnd
= tcp_receive_window(tp
);
512 u32 rcv_wnd
= tp
->rcv_wnd
;
514 if (seq
== tp
->rcv_nxt
)
515 return (rcv_wnd
|| (end_seq
== seq
));
517 return __tcp_sequence(tp
, seq
, end_seq
);
520 /* When we get a reset we do this. */
521 static void tcp_reset(struct sock
*sk
)
523 /* We want the right error as BSD sees it (and indeed as we do). */
526 sk
->err
= ECONNREFUSED
;
534 sk
->err
= ECONNRESET
;
538 sk
->error_report(sk
);
543 /* This tags the retransmission queue when SACKs arrive. */
544 static void tcp_sacktag_write_queue(struct sock
*sk
, struct tcp_sack_block
*sp
, int nsacks
)
546 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
550 struct sk_buff
*skb
= skb_peek(&sk
->write_queue
);
551 __u32 start_seq
= ntohl(sp
->start_seq
);
552 __u32 end_seq
= ntohl(sp
->end_seq
);
555 while((skb
!= NULL
) &&
556 (skb
!= tp
->send_head
) &&
557 (skb
!= (struct sk_buff
*)&sk
->write_queue
)) {
558 /* The retransmission queue is always in order, so
559 * we can short-circuit the walk early.
561 if(after(TCP_SKB_CB(skb
)->seq
, end_seq
))
564 /* We play conservative, we don't allow SACKS to partially
565 * tag a sequence space.
568 if(!after(start_seq
, TCP_SKB_CB(skb
)->seq
) &&
569 !before(end_seq
, TCP_SKB_CB(skb
)->end_seq
)) {
570 /* If this was a retransmitted frame, account for it. */
571 if((TCP_SKB_CB(skb
)->sacked
& TCPCB_SACKED_RETRANS
) &&
574 TCP_SKB_CB(skb
)->sacked
|= TCPCB_SACKED_ACKED
;
576 /* RULE: All new SACKs will either decrease retrans_out
577 * or advance fackets_out.
579 if(fack_count
> tp
->fackets_out
)
580 tp
->fackets_out
= fack_count
;
584 sp
++; /* Move on to the next SACK block. */
588 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
589 * But, this can also be called on packets in the established flow when
590 * the fast version below fails.
592 void tcp_parse_options(struct sock
*sk
, struct tcphdr
*th
, struct tcp_opt
*tp
, int no_fancy
)
595 int length
=(th
->doff
*4)-sizeof(struct tcphdr
);
597 ptr
= (unsigned char *)(th
+ 1);
607 case TCPOPT_NOP
: /* Ref: RFC 793 section 3.1 */
612 if (opsize
< 2) /* "silly options" */
615 break; /* don't parse partial options */
618 if(opsize
==TCPOLEN_MSS
&& th
->syn
) {
619 u16 in_mss
= ntohs(*(__u16
*)ptr
);
621 if (tp
->user_mss
&& tp
->user_mss
< in_mss
)
622 in_mss
= tp
->user_mss
;
623 tp
->mss_clamp
= in_mss
;
628 if(opsize
==TCPOLEN_WINDOW
&& th
->syn
)
629 if (!no_fancy
&& sysctl_tcp_window_scaling
) {
631 tp
->snd_wscale
= *(__u8
*)ptr
;
632 if(tp
->snd_wscale
> 14) {
634 printk("tcp_parse_options: Illegal window "
635 "scaling value %d >14 received.",
641 case TCPOPT_TIMESTAMP
:
642 if(opsize
==TCPOLEN_TIMESTAMP
) {
643 if (sysctl_tcp_timestamps
&& !no_fancy
) {
646 tp
->rcv_tsval
= ntohl(*(__u32
*)ptr
);
647 tp
->rcv_tsecr
= ntohl(*(__u32
*)(ptr
+4));
651 case TCPOPT_SACK_PERM
:
652 if(opsize
==TCPOLEN_SACK_PERM
&& th
->syn
) {
653 if (sysctl_tcp_sack
&& !no_fancy
) {
661 if((opsize
>= (TCPOLEN_SACK_BASE
+ TCPOLEN_SACK_PERBLOCK
)) &&
662 sysctl_tcp_sack
&& (sk
!= NULL
) && !th
->syn
) {
663 int sack_bytes
= opsize
- TCPOLEN_SACK_BASE
;
665 if(!(sack_bytes
% TCPOLEN_SACK_PERBLOCK
)) {
666 int num_sacks
= sack_bytes
>> 3;
667 struct tcp_sack_block
*sackp
;
669 sackp
= (struct tcp_sack_block
*)ptr
;
670 tcp_sacktag_write_queue(sk
, sackp
, num_sacks
);
680 /* Fast parse options. This hopes to only see timestamps.
681 * If it is wrong it falls back on tcp_parse_options().
683 static __inline__
int tcp_fast_parse_options(struct sock
*sk
, struct tcphdr
*th
, struct tcp_opt
*tp
)
685 /* If we didn't send out any options ignore them all. */
686 if (tp
->tcp_header_len
== sizeof(struct tcphdr
))
688 if (th
->doff
== sizeof(struct tcphdr
)>>2) {
691 } else if (th
->doff
== (sizeof(struct tcphdr
)>>2)+(TCPOLEN_TSTAMP_ALIGNED
>>2)) {
692 __u32
*ptr
= (__u32
*)(th
+ 1);
693 if (*ptr
== __constant_ntohl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16)
694 | (TCPOPT_TIMESTAMP
<< 8) | TCPOLEN_TIMESTAMP
)) {
697 tp
->rcv_tsval
= ntohl(*ptr
);
699 tp
->rcv_tsecr
= ntohl(*ptr
);
703 tcp_parse_options(sk
, th
, tp
, 0);
707 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
708 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
709 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
710 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
711 #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged new data. */
/* Leave fast-retransmit state: if cwnd was inflated past the third
 * duplicate ACK, deflate it back to ssthresh.
 * NOTE(review): listing truncated -- original lines 714 and 717-719
 * (braces and, presumably, the dup_acks reset -- TODO confirm) are
 * missing from this extract.
 */
713 static __inline__
void clear_fast_retransmit(struct tcp_opt
*tp
)
715 if (tp
->dup_acks
> 3)
716 tp
->snd_cwnd
= (tp
->snd_ssthresh
);
721 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
722 * retransmit timer fires.
724 static void tcp_fast_retrans(struct sock
*sk
, u32 ack
, int not_dup
)
726 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
728 /* Note: If not_dup is set this implies we got a
729 * data carrying packet or a window update.
730 * This carries no new information about possible
731 * lost packets, so we have to ignore it for the purposes
732 * of counting duplicate acks. Ideally this does not imply we
733 * should stop our fast retransmit phase, more acks may come
734 * later without data to help us. Unfortunately this would make
735 * the code below much more complex. For now if I see such
736 * a packet I clear the fast retransmit phase.
738 if (ack
== tp
->snd_una
&& tp
->packets_out
&& (not_dup
== 0)) {
739 /* This is the standard reno style fast retransmit branch. */
741 /* 1. When the third duplicate ack is received, set ssthresh
742 * to one half the current congestion window, but no less
743 * than two segments. Retransmit the missing segment.
745 if (tp
->high_seq
== 0 || after(ack
, tp
->high_seq
)) {
747 if ((tp
->fackets_out
> 3) || (tp
->dup_acks
== 3)) {
748 __tcp_enter_cong_avoid(tp
);
749 /* ... and account for 3 ACKs, which are
750 * already received to this time.
755 tcp_retransmit_skb(sk
,
756 skb_peek(&sk
->write_queue
));
758 tcp_fack_retransmit(sk
);
759 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, tp
->rto
);
761 } else if (++tp
->dup_acks
> 3) {
762 /* 2. Each time another duplicate ACK arrives, increment
763 * cwnd by the segment size. [...] Transmit a packet...
765 * Packet transmission will be done on normal flow processing
766 * since we're not in "retransmit mode". We do not use
767 * duplicate ACKs to artificially inflate the congestion
768 * window when doing FACK.
770 if(!tp
->fackets_out
) {
773 /* Fill any further holes which may have
776 * We may want to change this to run every
777 * further multiple-of-3 dup ack increments,
778 * to be more robust against out-of-order
779 * packet delivery. -DaveM
781 tcp_fack_retransmit(sk
);
784 } else if (tp
->high_seq
!= 0) {
785 /* In this branch we deal with clearing the Floyd style
786 * block on duplicate fast retransmits, and if requested
787 * we do Hoe style secondary fast retransmits.
789 if (!before(ack
, tp
->high_seq
) || (not_dup
& FLAG_DATA
) != 0) {
790 /* Once we have acked all the packets up to high_seq
791 * we are done this fast retransmit phase.
792 * Alternatively data arrived. In this case we
793 * Have to abort the fast retransmit attempt.
794 * Note that we do want to accept a window
795 * update since this is expected with Hoe's algorithm.
797 clear_fast_retransmit(tp
);
799 /* After we have cleared up to high_seq we can
800 * clear the Floyd style block.
802 if (!before(ack
, tp
->high_seq
)) {
806 } else if (tp
->dup_acks
>= 3) {
807 if (!tp
->fackets_out
) {
808 /* Hoe Style. We didn't ack the whole
809 * window. Take this as a cue that
810 * another packet was lost and retransmit it.
811 * Don't muck with the congestion window here.
812 * Note that we have to be careful not to
813 * act if this was a window update and it
814 * didn't ack new data, since this does
815 * not indicate a packet left the system.
816 * We can test this by just checking
817 * if ack changed from snd_una, since
818 * the only way to get here without advancing
819 * from snd_una is if this was a window update.
821 if (ack
!= tp
->snd_una
&& before(ack
, tp
->high_seq
)) {
822 tcp_retransmit_skb(sk
,
823 skb_peek(&sk
->write_queue
));
824 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, tp
->rto
);
827 /* FACK style, fill any remaining holes in
830 tcp_fack_retransmit(sk
);
836 /* This is Jacobson's slow start and congestion avoidance.
837 * SIGCOMM '88, p. 328.
839 static __inline__
void tcp_cong_avoid(struct tcp_opt
*tp
)
841 if (tp
->snd_cwnd
<= tp
->snd_ssthresh
) {
842 /* In "safe" area, increase. */
843 if (tp
->snd_cwnd
< tp
->snd_cwnd_clamp
)
846 /* In dangerous area, increase slowly.
847 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
849 if (tp
->snd_cwnd_cnt
>= tp
->snd_cwnd
) {
850 if (tp
->snd_cwnd
< tp
->snd_cwnd_clamp
)
858 /* Remove acknowledged frames from the retransmission queue. */
859 static int tcp_clean_rtx_queue(struct sock
*sk
, __u32 ack
,
860 __u32
*seq
, __u32
*seq_rtt
)
862 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
864 __u32 now
= tcp_time_stamp
;
867 /* If we are retransmitting, and this ACK clears up to
868 * the retransmit head, or further, then clear our state.
870 if (tp
->retrans_head
!= NULL
&&
871 !before(ack
, TCP_SKB_CB(tp
->retrans_head
)->end_seq
))
872 tp
->retrans_head
= NULL
;
874 while((skb
=skb_peek(&sk
->write_queue
)) && (skb
!= tp
->send_head
)) {
875 struct tcp_skb_cb
*scb
= TCP_SKB_CB(skb
);
876 __u8 sacked
= scb
->sacked
;
878 /* If our packet is before the ack sequence we can
879 * discard it as it's confirmed to have arrived at
882 if (after(scb
->end_seq
, ack
))
885 /* Initial outgoing SYN's get put onto the write_queue
886 * just like anything else we transmit. It is not
887 * true data, and if we misinform our callers that
888 * this ACK acks real data, we will erroneously exit
889 * connection startup slow start one packet too
890 * quickly. This is severely frowned upon behavior.
892 if((sacked
& TCPCB_SACKED_RETRANS
) && tp
->retrans_out
)
894 if(!(scb
->flags
& TCPCB_FLAG_SYN
)) {
895 acked
|= FLAG_DATA_ACKED
;
896 if(sacked
& TCPCB_SACKED_RETRANS
)
897 acked
|= FLAG_RETRANS_DATA_ACKED
;
901 acked
|= FLAG_SYN_ACKED
;
902 /* This is pure paranoia. */
903 tp
->retrans_head
= NULL
;
907 *seq_rtt
= now
- scb
->when
;
908 __skb_unlink(skb
, skb
->list
);
914 static void tcp_ack_probe(struct sock
*sk
, __u32 ack
)
916 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
918 /* Was it a usable window open? */
920 if (tp
->send_head
!= NULL
) {
921 if (!after(TCP_SKB_CB(tp
->send_head
)->end_seq
, ack
+ tp
->snd_wnd
)) {
923 tcp_clear_xmit_timer(sk
, TCP_TIME_PROBE0
);
924 /* If packets_out==0, socket must be waked up by
925 * subsequent tcp_data_snd_check(). This function is
926 * not for random using!
928 } else if (!tp
->packets_out
) {
929 tcp_reset_xmit_timer(sk
, TCP_TIME_PROBE0
,
930 min(tp
->rto
<< tp
->backoff
, TCP_RTO_MAX
));
935 /* Should we open up the congestion window? */
/* Decide whether this ACK may open the congestion window: only when
 * new data was acked, and during non-FACK fast retransmit/recovery
 * cwnd management is left to the recovery code instead.
 * NOTE(review): listing truncated -- original lines 937, 940-941, 946,
 * 949-950, 953-957 and 959-960 (the return statements and closing
 * braces) are missing from this extract.
 */
936 static __inline__
int should_advance_cwnd(struct tcp_opt
*tp
, int flag
)
938 /* Data must have been acked. */
939 if ((flag
& FLAG_DATA_ACKED
) == 0)
942 /* Some of the data acked was retransmitted somehow? */
943 if ((flag
& FLAG_RETRANS_DATA_ACKED
) != 0) {
944 /* We advance in all cases except during
945 * non-FACK fast retransmit/recovery.
947 if (tp
->fackets_out
!= 0 ||
948 tp
->retransmits
!= 0)
951 /* Non-FACK fast retransmit does its own
952 * congestion window management, don't get
958 /* New non-retransmitted data acked, always advance. */
962 /* Read draft-ietf-tcplw-high-performance before mucking
963 * with this code. (Supersedes RFC1323)
965 static void tcp_ack_saw_tstamp(struct sock
*sk
, struct tcp_opt
*tp
,
966 u32 seq
, u32 ack
, int flag
)
970 /* RTTM Rule: A TSecr value received in a segment is used to
971 * update the averaged RTT measurement only if the segment
972 * acknowledges some new data, i.e., only if it advances the
973 * left edge of the send window.
975 * See draft-ietf-tcplw-high-performance-00, section 3.3.
976 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
978 if (!(flag
& (FLAG_DATA_ACKED
|FLAG_SYN_ACKED
)))
981 seq_rtt
= tcp_time_stamp
- tp
->rcv_tsecr
;
982 tcp_rtt_estimator(tp
, seq_rtt
);
983 if (tp
->retransmits
) {
984 if (tp
->packets_out
== 0) {
991 /* Still retransmitting, use backoff */
993 tp
->rto
= tp
->rto
<< tp
->backoff
;
1002 static __inline__
void tcp_ack_packets_out(struct sock
*sk
, struct tcp_opt
*tp
)
1004 struct sk_buff
*skb
= skb_peek(&sk
->write_queue
);
1007 /* It occured in 2.3, because of racy timers. Namely,
1008 * retransmit timer did not check packets_out and retransmitted
1009 * send_head sometimes and, hence, messed all the write_queue.
1010 * Now it is impossible, I bet. --ANK
1013 printk("Sucks! packets_out=%d, sk=%p, %d\n", tp
->packets_out
, sk
, sk
->state
);
1018 /* Some data was ACK'd, if still retransmitting (due to a
1019 * timeout), resend more of the retransmit queue. The
1020 * congestion window is handled properly by that code.
1022 if (tp
->retransmits
) {
1023 tcp_xmit_retransmit_queue(sk
);
1024 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, tp
->rto
);
1026 __u32 when
= tp
->rto
- (tcp_time_stamp
- TCP_SKB_CB(skb
)->when
);
1027 if ((__s32
)when
< 0)
1029 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, when
);
1033 /* This routine deals with incoming acks, but not outgoing ones. */
1034 static int tcp_ack(struct sock
*sk
, struct tcphdr
*th
,
1035 u32 ack_seq
, u32 ack
, int len
)
1037 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1042 if(sk
->state
== TCP_CLOSE
)
1043 return 1; /* Dead, can't ack any more so why bother */
1045 /* If the ack is newer than sent or older than previous acks
1046 * then we can probably ignore it.
1048 if (after(ack
, tp
->snd_nxt
) || before(ack
, tp
->snd_una
))
1049 goto uninteresting_ack
;
1051 /* If there is data set flag 1 */
1052 if (len
!= th
->doff
*4)
1055 /* Update our send window. */
1057 /* This is the window update code as per RFC 793
1058 * snd_wl{1,2} are used to prevent unordered
1059 * segments from shrinking the window
1061 if (before(tp
->snd_wl1
, ack_seq
) ||
1062 (tp
->snd_wl1
== ack_seq
&& !after(tp
->snd_wl2
, ack
))) {
1063 u32 nwin
= ntohs(th
->window
) << tp
->snd_wscale
;
1065 if ((tp
->snd_wl2
!= ack
) || (nwin
> tp
->snd_wnd
)) {
1066 flag
|= FLAG_WIN_UPDATE
;
1067 if (tp
->snd_wnd
!= nwin
) {
1070 /* Note, it is the only place, where
1071 * fast path is recovered for sending TCP.
1073 if (skb_queue_len(&tp
->out_of_order_queue
) == 0 &&
1074 #ifdef TCP_FORMAL_WINDOW
1075 tcp_receive_window(tp
) &&
1078 tcp_fast_path_on(tp
);
1080 if (nwin
> tp
->max_window
) {
1081 tp
->max_window
= nwin
;
1082 tcp_sync_mss(sk
, tp
->pmtu_cookie
);
1086 tp
->snd_wl1
= ack_seq
;
1091 /* BEWARE! From this place and until return from this function
1092 * snd_nxt and snd_wnd are out of sync. All the routines, called
1093 * from here must get "ack" as argument or they should not depend
1094 * on right edge of window. It is _UGLY_. It cries to be fixed. --ANK
1097 /* We passed data and got it acked, remove any soft error
1098 * log. Something worked...
1102 tp
->rcv_tstamp
= tcp_time_stamp
;
1104 /* See if we can take anything off of the retransmit queue. */
1105 flag
|= tcp_clean_rtx_queue(sk
, ack
, &seq
, &seq_rtt
);
1107 /* If this ack opens up a zero window, clear backoff. It was
1108 * being used to time the probes, and is probably far higher than
1109 * it needs to be for normal retransmission.
1111 if (tcp_timer_is_set(sk
, TCP_TIME_PROBE0
))
1112 tcp_ack_probe(sk
, ack
);
1114 /* We must do this here, before code below clears out important
1115 * state contained in tp->fackets_out and tp->retransmits. -DaveM
1117 if (should_advance_cwnd(tp
, flag
))
1120 /* If we have a timestamp, we always do rtt estimates. */
1121 if (tp
->saw_tstamp
) {
1122 tcp_ack_saw_tstamp(sk
, tp
, seq
, ack
, flag
);
1124 /* If we were retransmiting don't count rtt estimate. */
1125 if (tp
->retransmits
) {
1126 if (tp
->packets_out
== 0) {
1127 tp
->retransmits
= 0;
1128 tp
->fackets_out
= 0;
1129 tp
->retrans_out
= 0;
1132 /* We don't have a timestamp. Can only use
1133 * packets that are not retransmitted to determine
1134 * rtt estimates. Also, we must not reset the
1135 * backoff for rto until we get a non-retransmitted
1136 * packet. This allows us to deal with a situation
1137 * where the network delay has increased suddenly.
1138 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
1140 if (flag
& (FLAG_DATA_ACKED
|FLAG_SYN_ACKED
)) {
1141 if(!(flag
& FLAG_RETRANS_DATA_ACKED
)) {
1143 tcp_rtt_estimator(tp
, seq_rtt
);
1151 if (tp
->packets_out
) {
1152 if (flag
& FLAG_DATA_ACKED
)
1153 tcp_ack_packets_out(sk
, tp
);
1155 tcp_clear_xmit_timer(sk
, TCP_TIME_RETRANS
);
1158 flag
&= (FLAG_DATA
| FLAG_WIN_UPDATE
);
1159 if ((ack
== tp
->snd_una
&& tp
->packets_out
&& flag
== 0) ||
1160 (tp
->high_seq
!= 0)) {
1161 tcp_fast_retrans(sk
, ack
, flag
);
1163 /* Clear any aborted fast retransmit starts. */
1166 /* It is not a brain fart, I thought a bit now. 8)
1168 * Forward progress is indicated, if:
1169 * 1. the ack acknowledges new data.
1170 * 2. or the ack is duplicate, but it is caused by new segment
1171 * arrival. This case is filtered by:
1172 * - it contains no data, syn or fin.
1173 * - it does not update window.
1174 * 3. or new SACK. It is difficult to check, so that we ignore it.
1176 * Forward progress is also indicated by arrival new data,
1177 * which was caused by window open from our side. This case is more
1178 * difficult and it is made (alas, incorrectly) in tcp_data_queue().
1181 if (ack
!= tp
->snd_una
|| (flag
== 0 && !th
->fin
))
1182 dst_confirm(sk
->dst_cache
);
1184 if (ack
!= tp
->snd_una
)
1187 /* Remember the highest ack received. */
1192 SOCK_DEBUG(sk
, "Ack ignored %u %u\n", ack
, tp
->snd_nxt
);
1196 int tcp_paws_check(struct tcp_opt
*tp
, int rst
)
1198 if ((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) >= 0)
1200 if (xtime
.tv_sec
>= tp
->ts_recent_stamp
+ TCP_PAWS_24DAYS
)
1203 /* RST segments are not recommended to carry timestamp,
1204 and, if they do, it is recommended to ignore PAWS because
1205 "their cleanup function should take precedence over timestamps."
1206 Certainly, it is mistake. It is necessary to understand the reasons
1207 of this constraint to relax it: if peer reboots, clock may go
1208 out-of-sync and half-open connections will not be reset.
1209 Actually, the problem would be not existing if all
1210 the implementations followed draft about maintaining clock
1211 via reboots. Linux-2.2 DOES NOT!
1213 However, we can relax time bounds for RST segments to MSL.
1215 if (rst
&& xtime
.tv_sec
>= tp
->ts_recent_stamp
+ TCP_PAWS_MSL
)
/* Window membership test for [seq, end_seq) against (s_win, e_win):
 * accept on overlap, or when a zero-length segment sits exactly at
 * the right edge e_win.
 * NOTE(review): listing truncated -- original lines 1221-1223, 1225
 * and 1227-1228 (braces and any leading special cases) are missing
 * from this extract.
 */
1220 static __inline__
int tcp_in_window(u32 seq
, u32 end_seq
, u32 s_win
, u32 e_win
)
1224 if (after(end_seq
, s_win
) && before(seq
, e_win
))
1226 return (seq
== e_win
&& seq
== end_seq
);
/* New-style handling of TIME_WAIT sockets. */

/* Must be called with locally disabled BHs. */
/* tcp_timewait_kill - tear down a TIME-WAIT bucket: unlink it from the
 * established hash and from its bind bucket, freeing the bind bucket if
 * this was its last owner.
 * NOTE(review): several unlink guards/braces are not visible in this
 * extract; the visible token stream is preserved as-is.
 */
void tcp_timewait_kill(struct tcp_tw_bucket *tw)
	struct tcp_ehash_bucket *ehead;
	struct tcp_bind_hashbucket *bhead;
	struct tcp_bind_bucket *tb;

	/* Unlink from established hashes. */
	ehead = &tcp_ehash[tw->hashent];
	write_lock(&ehead->lock);
	write_unlock(&ehead->lock);
	/* Splice tw out of its doubly-linked hash chain. */
	tw->next->pprev = tw->pprev;
	*(tw->pprev) = tw->next;
	write_unlock(&ehead->lock);

	/* Disassociate with bind bucket. */
	bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
	spin_lock(&bhead->lock);
	if ((tb = tw->tb) != NULL) {
		/* Remove tw from the bucket's owner list. */
		tw->bind_next->bind_pprev = tw->bind_pprev;
		*(tw->bind_pprev) = tw->bind_next;
		if (tb->owners == NULL) {
			/* Last owner gone: unlink and free the bind bucket. */
			tb->next->pprev = tb->pprev;
			*(tb->pprev) = tb->next;
			kmem_cache_free(tcp_bucket_cachep, tb);
	spin_unlock(&bhead->lock);

#ifdef INET_REFCNT_DEBUG
	/* At this point only the caller should hold a reference. */
	if (atomic_read(&tw->refcnt) != 1) {
		printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
/* Main purpose of TIME-WAIT state is to close connection gracefully,
 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
 * (and, probably, tail of data) and one or more our ACKs are lost.
 *
 * What is TIME-WAIT timeout? It is associated with maximal packet
 * lifetime in the internet, which results in wrong conclusion, that
 * it is set to catch "old duplicate segments" wandering out of their path.
 * It is not quite correct. This timeout is calculated so that it exceeds
 * maximal retransmission timeout enough to allow to lose one (or more)
 * segments sent by peer and our ACKs. This time may be calculated from RTO.
 *
 * When TIME-WAIT socket receives RST, it means that another end
 * finally closed and we are allowed to kill TIME-WAIT too.
 *
 * Second purpose of TIME-WAIT is catching old duplicate segments.
 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
 * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 *
 * If we invented some more clever way to catch duplicates
 * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) TW bucket
 * is _not_ stateless. It means, that strictly speaking we must
 * spinlock it. I do not want! Well, probability of misbehaviour
 * is ridiculously low and, seems, we could use some mb() tricks
 * to avoid misread sequence numbers, states etc. --ANK
 */
/* tcp_timewait_state_process - handle a segment that arrived for a socket
 * sitting in a TIME-WAIT bucket (true TIME-WAIT, or the "dead" FIN-WAIT-2
 * substate kept in the same bucket).
 * @tw:  the time-wait bucket the segment was demultiplexed to
 * @skb: the arriving segment
 * @th:  its TCP header
 * @len: segment length
 * Returns a TCP_TW_* verdict telling the caller how to react.
 */
tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
			   struct tcphdr *th, unsigned len)
	int paws_reject = 0;

	/* Header can carry options and we remembered a timestamp:
	 * parse options and run the PAWS test against tw's stored state.
	 */
	if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
		tcp_parse_options(NULL, th, &tp, 0);
		if (tp.saw_tstamp) {
			tp.ts_recent = tw->ts_recent;
			tp.ts_recent_stamp = tw->ts_recent_stamp;
			paws_reject = tcp_paws_check(&tp, th->rst);

	if (tw->substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd))

		/* A SYN whose seq differs from the original connection's
		 * SYN is invalid here. */
		if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq)

		/* Segment brings nothing new: already-acked data or a
		 * zero-length (bare ack) segment. */
		if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			return TCP_TW_SUCCESS;

		/* New data or FIN. If new data arrive after half-duplex
		 * close, reset. */
		if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) {
			tcp_tw_deschedule(tw);
			tcp_timewait_kill(tw);

		/* FIN arrived, enter true time-wait state. */
		tw->substate = TCP_TIME_WAIT;
		tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tp.saw_tstamp) {
			tw->ts_recent_stamp = xtime.tv_sec;
			tw->ts_recent = tp.rcv_tsval;

		/* I am shamed, but failed to make it more elegant.
		 * Yes, it is direct reference to IP, which is impossible
		 * to generalize to IPv6. Taking into account that IPv6
		 * does not understand recycling in any case, it is not
		 * a big problem in practice. --ANK */
		if (tw->family == AF_INET &&
		    sysctl_tcp_tw_recycle && tw->ts_recent_stamp &&
		    tcp_v4_tw_remember_stamp(tw))
			tcp_tw_schedule(tw, tw->timeout);
			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

	/*
	 * Now real TIME-WAIT state.
	 *
	 * "When a connection is [...] on TIME-WAIT state [...]
	 * [a TCP] MAY accept a new SYN from the remote TCP to
	 * reopen the connection directly, if it:
	 *
	 * (1) assigns its initial sequence number for the new
	 * connection to be larger than the largest sequence
	 * number it used on the previous connection incarnation,
	 *
	 * (2) returns to TIME-WAIT state if the SYN turns out
	 * to be an old duplicate".
	 */
	    (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
	     TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
		/* In window segment, it may be only reset or bare ack. */

		/* This is TIME_WAIT assassination, in two flavors.
		 * Oh well... nobody has a sufficient solution to this
		 * protocol bug yet. */
		if (sysctl_tcp_rfc1337 == 0) {
			tcp_tw_deschedule(tw);
			tcp_timewait_kill(tw);
			return TCP_TW_SUCCESS;
		tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

		if (tp.saw_tstamp) {
			tw->ts_recent = tp.rcv_tsval;
			tw->ts_recent_stamp = xtime.tv_sec;

		return TCP_TW_SUCCESS;

	/* Out of window segment.
	 *
	 * All the segments are ACKed immediately.
	 *
	 * The only exception is new SYN. We accept it, if it is
	 * not old duplicate and we are not in danger to be killed
	 * by delayed old duplicates. The RFC check (that it has a
	 * newer sequence number) works at rates <40Mbit/sec.
	 * However, if paws works, it is reliable AND even more,
	 * we even may relax silly seq space cutoff.
	 *
	 * RED-PEN: we violate main RFC requirement, if this SYN will appear
	 * old duplicate (i.e. we receive RST in reply to SYN-ACK),
	 * we must return socket to time-wait state. It is not good,
	 * but not fatal yet.
	 */
	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
	     (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) {
		/* Pick an ISN above the old connection's snd_nxt. */
		u32 isn = tw->snd_nxt + 2;

		TCP_SKB_CB(skb)->when = isn;

	NET_INC_STATS_BH(PAWSEstabRejected);

	/* In this case we must reset the TIMEWAIT timer.
	 *
	 * If it is ACKless SYN it may be both old duplicate
	 * and new good SYN with random sequence number <rcv_nxt.
	 * Do not reschedule in the last case.
	 */
	if (paws_reject || th->ack)
		tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

	/* Send ACK. Note, we do not put the bucket,
	 * it will be released by caller.
	 */
	return TCP_TW_SUCCESS;
/* Enter the time wait state. This is called with locally disabled BH.
 * Essentially we whip up a timewait bucket, copy the
 * relevant info into it from the SK, and mess with hash chains.
 */
static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
	struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
	struct tcp_bind_hashbucket *bhead;
	struct sock **head, *sktw;

	write_lock(&ehead->lock);

	/* Step 1: Remove SK from established hash. */
	sk->next->pprev = sk->pprev;
	*sk->pprev = sk->next;
	sock_prot_dec_use(sk->prot);

	/* Step 2: Hash TW into TIMEWAIT half of established hash table. */
	head = &(ehead + tcp_ehash_size)->chain;
	sktw = (struct sock *)tw;
	if((sktw->next = *head) != NULL)
		(*head)->pprev = &sktw->next;
	/* The hash chain now references tw: take a reference for it. */
	atomic_inc(&tw->refcnt);

	write_unlock(&ehead->lock);

	/* Step 3: Put TW into bind hash. Original socket stays there too.
	   Note, that any socket with sk->num!=0 MUST be bound in binding
	   cache, even if it is closed.
	 */
	bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
	spin_lock(&bhead->lock);
	/* sk->prev carries the bound socket's bind bucket pointer. */
	tw->tb = (struct tcp_bind_bucket *)sk->prev;
	BUG_TRAP(sk->prev!=NULL);
	if ((tw->bind_next = tw->tb->owners) != NULL)
		tw->tb->owners->bind_pprev = &tw->bind_next;
	tw->tb->owners = (struct sock *)tw;
	tw->bind_pprev = &tw->tb->owners;
	spin_unlock(&bhead->lock);
/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 * @sk:    socket being closed
 * @state: TCP_TIME_WAIT or a FIN-WAIT-2 substate (stored in tw->substate)
 * @timeo: requested timeout; forced to TCP_TIMEWAIT_LEN for true TIME-WAIT
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
	struct tcp_tw_bucket *tw = NULL;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* With tw recycling, try to store our timestamp for the peer. */
	if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp)
		recycle_ok = tp->af_specific->remember_stamp(sk);

	/* Respect the global cap on TIME-WAIT buckets. */
	if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
		tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);

		/* 3.5 * rto; NOTE(review): the code consuming this value
		 * is not visible in this extract. */
		int rto = (tp->rto<<2) - (tp->rto>>1);

		/* Give us an identity. */
		tw->daddr = sk->daddr;
		tw->rcv_saddr = sk->rcv_saddr;
		tw->bound_dev_if = sk->bound_dev_if;

		tw->state = TCP_TIME_WAIT;
		tw->substate = state;
		tw->sport = sk->sport;
		tw->dport = sk->dport;
		tw->family = sk->family;
		tw->reuse = sk->reuse;
		tw->rcv_wscale = tp->rcv_wscale;
		atomic_set(&tw->refcnt, 0);

		/* Snapshot of the connection's sequence/timestamp state. */
		tw->hashent = sk->hashent;
		tw->rcv_nxt = tp->rcv_nxt;
		tw->snd_nxt = tp->snd_nxt;
		tw->rcv_wnd = tcp_receive_window(tp);
		tw->syn_seq = tp->syn_seq;
		tw->ts_recent = tp->ts_recent;
		tw->ts_recent_stamp = tp->ts_recent_stamp;
		tw->pprev_death = NULL;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		if(tw->family == PF_INET6) {
			memcpy(&tw->v6_daddr,
			       &sk->net_pinfo.af_inet6.daddr,
			       sizeof(struct in6_addr));
			memcpy(&tw->v6_rcv_saddr,
			       &sk->net_pinfo.af_inet6.rcv_saddr,
			       sizeof(struct in6_addr));

		/* Linkage updates. */
		__tcp_tw_hashdance(sk, tw);

		/* Get the TIME_WAIT timeout firing. */
			tw->timeout = TCP_TIMEWAIT_LEN;
		if (state == TCP_TIME_WAIT)
			timeo = TCP_TIMEWAIT_LEN;

		tcp_tw_schedule(tw, timeo);
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up. We've got bigger problems than
		 * non-graceful socket closings.
		 */
		if (net_ratelimit())
			printk(KERN_INFO "TCP: time wait bucket table overflow\n");

	tcp_update_metrics(sk);
/*
 * Process the FIN bit. This now behaves as it is supposed to work
 * and the FIN takes effect when it is validly part of sequence
 * space. Not before when we get holes.
 *
 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
 * TIME-WAIT)
 *
 * If we are in FINWAIT-1, a received FIN indicates simultaneous
 * close and we go into CLOSING (and later onto TIME-WAIT)
 *
 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */
static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tp->fin_seq = TCP_SKB_CB(skb)->end_seq;
	/* Schedule an ACK for the FIN. */
	tp->ack.pending = 1;

	/* No more data will be received on this socket. */
	sk->shutdown |= RCV_SHUTDOWN;

	case TCP_ESTABLISHED:
		/* Move to CLOSE_WAIT */
		tcp_set_state(sk, TCP_CLOSE_WAIT);

	case TCP_CLOSE_WAIT:
		/* Received a retransmission of the FIN, do nothing. */

		/* RFC793: Remain in the LAST-ACK state. */

		/* This case occurs when a simultaneous close
		 * happens, we must ack the received FIN and
		 * enter the CLOSING state.
		 */
		tcp_set_state(sk, TCP_CLOSING);

		/* Received a FIN -- send ACK and enter TIME_WAIT. */
		tcp_time_wait(sk, TCP_TIME_WAIT, 0);

		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
		 * cases we should never reach this piece of code.
		 */
		printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);

	/* It _is_ possible, that we have something out-of-order _after_ FIN.
	 * Probably, we should reset in this case. For now drop them.
	 */
	__skb_queue_purge(&tp->out_of_order_queue);

	sk->state_change(sk);

	/* Do not send POLL_HUP for half duplex close. */
	if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
		sk_wake_async(sk, 1, POLL_HUP);
		sk_wake_async(sk, 1, POLL_IN);
/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 */
/* tcp_sack_maybe_coalesce - after sp has been grown, merge it with any
 * other SACK block whose sequence space it now touches, and close the
 * resulting hole in the SACK table.
 */
static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
	int this_sack, num_sacks = tp->num_sacks;
	struct tcp_sack_block *swalk = &tp->selective_acks[0];

	/* If more than one SACK block, see if the recent change to SP eats into
	 * or hits the sequence space of other SACK blocks, if so coalesce.
	 */
	if(num_sacks != 1) {
		for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {

			/* First case, bottom of SP moves into top of the
			 * sequence space of SWALK.
			 */
			if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
				sp->start_seq = swalk->start_seq;

			/* Second case, top of SP moves into bottom of the
			 * sequence space of SWALK.
			 */
			if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
				sp->end_seq = swalk->end_seq;

	/* SP is the only SACK, or no coalescing cases found. */

	/* Zap SWALK, by moving every further SACK up by one slot.
	 * Decrease num_sacks.
	 */
	for(; this_sack < num_sacks-1; this_sack++, swalk++) {
		struct tcp_sack_block *next = (swalk + 1);
		swalk->start_seq = next->start_seq;
		swalk->end_seq = next->end_seq;
1727 static __inline__
void tcp_sack_swap(struct tcp_sack_block
*sack1
, struct tcp_sack_block
*sack2
)
1731 tmp
= sack1
->start_seq
;
1732 sack1
->start_seq
= sack2
->start_seq
;
1733 sack2
->start_seq
= tmp
;
1735 tmp
= sack1
->end_seq
;
1736 sack1
->end_seq
= sack2
->end_seq
;
1737 sack2
->end_seq
= tmp
;
/* tcp_sack_new_ofo_skb - update the SACK table for a newly queued
 * out-of-order skb: grow an adjacent block when possible, otherwise
 * install a fresh block at the head of the table (dropping the oldest
 * block if the table is full).
 */
static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int cur_sacks = tp->num_sacks;

	/* Optimize for the common case, new ofo frames arrive
	 * "in order". ;-) This also satisfies the requirements
	 * of RFC2018 about ordering of SACKs.
	 */
	if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
		sp->end_seq = TCP_SKB_CB(skb)->end_seq;
		tcp_sack_maybe_coalesce(tp, sp);
	} else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
		/* Re-ordered arrival, in this case, can be optimized too. */
		sp->start_seq = TCP_SKB_CB(skb)->seq;
		tcp_sack_maybe_coalesce(tp, sp);
		struct tcp_sack_block *swap = sp + 1;
		/* With timestamps on, option space only fits 3 SACK blocks. */
		int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);

		/* Oh well, we have to move things around.
		 * Try to find a SACK we can tack this onto.
		 */
		for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
			if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
			   (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
				if(swap->end_seq == TCP_SKB_CB(skb)->seq)
					swap->end_seq = TCP_SKB_CB(skb)->end_seq;
					swap->start_seq = TCP_SKB_CB(skb)->seq;
				/* Promote the touched block to the head. */
				tcp_sack_swap(sp, swap);
				tcp_sack_maybe_coalesce(tp, sp);

		/* Could not find an adjacent existing SACK, build a new one,
		 * put it at the front, and shift everyone else down. We
		 * always know there is at least one SACK present already here.
		 *
		 * If the sack array is full, forget about the last one.
		 */
		if (cur_sacks >= max_sacks) {
		/* Shift every block down one slot to make room at the head. */
		while(cur_sacks >= 1) {
			struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
			struct tcp_sack_block *prev = (this - 1);
			this->start_seq = prev->start_seq;
			this->end_seq = prev->end_seq;

		/* Build the new head SACK, and we're done. */
		sp->start_seq = TCP_SKB_CB(skb)->seq;
		sp->end_seq = TCP_SKB_CB(skb)->end_seq;
/* tcp_sack_remove_skb - skb is being delivered in sequence; trim the
 * SACK block it eats into from the front, and drop that block entirely
 * if nothing of it remains.
 */
static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int num_sacks = tp->num_sacks;

	/* This is an in order data segment _or_ an out-of-order SKB being
	 * moved to the receive queue, so we know this removed SKB will eat
	 * from the front of a SACK.
	 */
	for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
		/* Check if the start of the sack is covered by skb. */
		if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
		   before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))

	/* This should only happen if so many SACKs get built that some get
	 * pushed out before we get here, or we eat some in sequence packets
	 * which are before the first SACK block.
	 */
	if(this_sack >= num_sacks)

	/* Advance the block's start past the delivered skb. */
	sp->start_seq = TCP_SKB_CB(skb)->end_seq;
	if(!before(sp->start_seq, sp->end_seq)) {
		/* Zap this SACK, by moving forward any other SACKS. */
		for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
			struct tcp_sack_block *next = (sp + 1);
			sp->start_seq = next->start_seq;
			sp->end_seq = next->end_seq;
/* tcp_sack_extend - old_skb is being replaced by the larger new_skb in
 * the out-of-order queue; move the end of the SACK block that ended at
 * old_skb to new_skb's end.
 */
static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int num_sacks = tp->num_sacks;

	/* Find the SACK block ending exactly at old_skb's end. */
	for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
		if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)

	/* No matching block: nothing to extend. */
	if(this_sack >= num_sacks)

	sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
/* This one checks to see if we can put data from the
 * out_of_order queue into the receive_queue.
 */
static void tcp_ofo_queue(struct sock *sk)
	struct sk_buff *skb;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	while ((skb = skb_peek(&tp->out_of_order_queue))) {
		/* Still a hole before this segment: stop. */
		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))

		/* Entirely below rcv_nxt: a duplicate, discard it. */
		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
			SOCK_DEBUG(sk, "ofo packet was already received \n");
			__skb_unlink(skb, skb->list);

		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

		/* Segment fills the gap: move it to the receive queue
		 * and retire its SACK coverage. */
		tcp_sack_remove_skb(tp, skb);
		__skb_unlink(skb, skb->list);
		__skb_queue_tail(&sk->receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
			tcp_fin(skb, sk, skb->h.th);
/* tcp_data_queue - queue a data segment: in-sequence data goes to the
 * receive queue (or straight into a waiting reader's iovec), duplicates
 * force an immediate ACK, and out-of-order segments are inserted into
 * the sorted out_of_order_queue with matching SACK bookkeeping.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
	struct sk_buff *skb1;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Queue data for delivery to the user.
	 * Packets in sequence go to the receive queue.
	 * Out of sequence packets to the out_of_order_queue.
	 */
	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
		/* Ok. In sequence. */
		/* The receiving task is running this code and has read
		 * everything up to rcv_nxt: copy straight to its iovec. */
		if (tp->ucopy.task == current &&
		    tp->copied_seq == tp->rcv_nxt &&
			int chunk = min(skb->len, tp->ucopy.len);

			__set_current_state(TASK_RUNNING);

			if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) {
				sk->error_report(sk);
			tp->ucopy.len -= chunk;
			tp->copied_seq += chunk;
			/* Fully consumed and no FIN: skb need not be queued. */
			eaten = (chunk == skb->len && !skb->h.th->fin);

			skb_set_owner_r(skb, sk);
			__skb_queue_tail(&sk->receive_queue, skb);

		dst_confirm(sk->dst_cache);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		tcp_event_data_recv(tp, skb);
			tcp_fin(skb, sk, skb->h.th);

		/* This may have eaten into a SACK block. */
		if(tp->sack_ok && tp->num_sacks)
			tcp_sack_remove_skb(tp, skb);

		/* Turn on fast path. */
		if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
#ifdef TCP_FORMAL_WINDOW
		    tcp_receive_window(tp) &&
			tcp_fast_path_on(tp);

		} else if (!sk->dead)
			sk->data_ready(sk, 0);

	/* An old packet, either a retransmit or some packet got lost. */
	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
		/* A retransmit, 2nd most common case. Force an immediate ack.
		 *
		 * It is impossible, seq is checked by top level.
		 */
		NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq));
		tcp_enter_quickack_mode(tp);
		tp->ack.pending = 1;

	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
		/* Partial packet, seq < rcv_next < end_seq */
		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

	/* Ok. This is an out_of_order segment, force an ack. */
	tp->ack.pending = 1;

	/* Disable header prediction. */

	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

	skb_set_owner_r(skb, sk);

	if (skb_peek(&tp->out_of_order_queue) == NULL) {
		/* Initial out of order segment, build 1 SACK. */
		tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
		tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
		__skb_queue_head(&tp->out_of_order_queue,skb);
		/* Walk the ofo queue from the tail to find the insertion
		 * point; the queue is kept sorted by sequence number. */
		for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
			/* Already there. */
			if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
				if (skb->len >= skb1->len) {
					/* New skb is at least as large:
					 * it replaces the queued one. */
					tcp_sack_extend(tp, skb1, skb);
					__skb_append(skb1, skb);
					__skb_unlink(skb1, skb1->list);

				/* A duplicate, smaller than what is in the
				 * out-of-order queue right now, toss it.
				 */

			if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
				__skb_append(skb1, skb);
				tcp_sack_new_ofo_skb(sk, skb);

			/* See if we've hit the start. If so insert. */
			if (skb1 == skb_peek(&tp->out_of_order_queue)) {
				__skb_queue_head(&tp->out_of_order_queue,skb);
				tcp_sack_new_ofo_skb(sk, skb);
/*
 * This routine handles the data. If there is room in the buffer,
 * it will have already been moved into it. If there is no
 * room, then we will just have to discard the packet.
 */
static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Strip the TCP header, leaving only the payload. */
	skb_pull(skb, th->doff*4);
	skb_trim(skb, len - (th->doff*4));

	/* An empty segment without FIN carries nothing to deliver. */
	if (skb->len == 0 && !th->fin)

	/*
	 * If our receive queue has grown past its limits shrink it.
	 * Make sure to do this before moving rcv_nxt, otherwise
	 * data might be acked for that we don't have enough room.
	 */
	if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
		if (prune_queue(sk) < 0) {
			/* Still not enough room. That can happen when
			 * skb->true_size differs significantly from skb->len.
			 */

	tcp_data_queue(sk, skb);

	/* Sanity check: rcv_nxt must never fall behind copied_seq. */
	if (before(tp->rcv_nxt, tp->copied_seq)) {
		printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
		tp->rcv_nxt = tp->copied_seq;
/* When incoming ACK allowed to free some skb from write_queue,
 * we remember this in flag tp->sorry and wake up socket on the exit
 * from tcp input handler. Probably, handler has already eaten this space
 * sending ACK and cloned frames from tcp_write_xmit().
 */
static __inline__ void tcp_new_space(struct sock *sk)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct socket *sock;

	/* Only wake writers when a useful amount of space is available. */
	if (sock_wspace(sk) >= tcp_min_write_space(sk) &&
	    (sock = sk->socket) != NULL) {
		clear_bit(SOCK_NOSPACE, &sock->flags);

		if (sk->sleep && waitqueue_active(sk->sleep))
			wake_up_interruptible(sk->sleep);

		if (sock->fasync_list)
			sock_wake_async(sock, 2, POLL_OUT);
/* __tcp_data_snd_check - decide whether the queued skb can be sent now;
 * when it is blocked (beyond the receiver's window, or the congestion
 * window is full), arm the probe timer instead.
 */
static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Blocked by the send window or the congestion window? */
	if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
	    tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
		tcp_check_probe_timer(sk, tp);
/* tcp_data_snd_check - wrapper running the send check on the head of the
 * write queue. NOTE(review): the NULL guard on skb is not visible in
 * this extract.
 */
static __inline__ void tcp_data_snd_check(struct sock *sk)
	struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;

		__tcp_data_snd_check(sk, skb);
/*
 * Check if sending an ack is needed.
 */
static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* This also takes care of updating the window.
	 * This if statement needs to be simplified.
	 *
	 * Rules for delaying an ack:
	 * - delay time <= 0.5 HZ
	 * - we don't have a window update to send
	 * - must send at least every 2 full sized packets
	 * - must send an ACK if we have any out of order data
	 *
	 * With an extra heuristic to handle loss of packet
	 * situations and also helping the sender leave slow
	 * start in an expedient manner.
	 */

	/* More than one full frame received or... */
	if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
#ifdef TCP_MORE_COARSE_ACKS
	     /* Avoid to send immediate ACK from input path, if it
	      * does not advance window far enough. tcp_recvmsg() will do this.
	      */
	     && (!sysctl_tcp_retrans_collapse || __tcp_select_window(sk) >= tp->rcv_wnd)
	    /* We ACK each frame or... */
	    tcp_in_quickack_mode(tp) ||
	    /* We have out of order data or */
	    skb_peek(&tp->out_of_order_queue) != NULL)) {
		/* Then ack it now */
	/* Else, send delayed ack. */
	tcp_send_delayed_ack(sk);
/* tcp_ack_snd_check - run the ACK decision unless no ACK is pending
 * (a data segment we sent has already carried it).
 */
static __inline__ void tcp_ack_snd_check(struct sock *sk)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	if (tp->ack.pending == 0) {
		/* We sent a data segment already. */

	/* Out-of-order data is possible here. */
	__tcp_ack_snd_check(sk, 1);
2179 * This routine is only called when we have urgent data
2180 * signalled. Its the 'slow' part of tcp_urg. It could be
2181 * moved inline now as tcp_urg is only called from one
2182 * place. We handle URGent data wrong. We have to - as
2183 * BSD still doesn't use the correction from RFC961.
2184 * For 1003.1g we should support a new option TCP_STDURG to permit
2185 * either form (or just set the sysctl tcp_stdurg).
2188 static void tcp_check_urg(struct sock
* sk
, struct tcphdr
* th
)
2190 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2191 u32 ptr
= ntohs(th
->urg_ptr
);
2193 if (ptr
&& !sysctl_tcp_stdurg
)
2195 ptr
+= ntohl(th
->seq
);
2197 /* Ignore urgent data that we've already seen and read. */
2198 if (after(tp
->copied_seq
, ptr
))
2201 /* Do we already have a newer (or duplicate) urgent pointer? */
2202 if (tp
->urg_data
&& !after(ptr
, tp
->urg_seq
))
2205 /* Tell the world about our new urgent pointer. */
2206 if (sk
->proc
!= 0) {
2208 kill_proc(sk
->proc
, SIGURG
, 1);
2210 kill_pg(-sk
->proc
, SIGURG
, 1);
2211 sk_wake_async(sk
, 3, POLL_PRI
);
2214 /* We may be adding urgent data when the last byte read was
2215 * urgent. To do this requires some care. We cannot just ignore
2216 * tp->copied_seq since we would read the last urgent byte again
2217 * as data, nor can we alter copied_seq until this data arrives
2218 * or we break the sematics of SIOCATMARK (and thus sockatmark())
2220 if (tp
->urg_seq
== tp
->copied_seq
)
2221 tp
->copied_seq
++; /* Move the copied sequence on correctly */
2222 tp
->urg_data
= TCP_URG_NOTYET
;
2225 /* Disable header prediction. */
2229 /* This is the 'fast' part of urgent handling. */
2230 static inline void tcp_urg(struct sock
*sk
, struct tcphdr
*th
, unsigned long len
)
2232 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2234 /* Check if we get a new urgent pointer - normally not. */
2236 tcp_check_urg(sk
,th
);
2238 /* Do we wait for any urgent data? - normally not... */
2239 if (tp
->urg_data
== TCP_URG_NOTYET
) {
2240 u32 ptr
= tp
->urg_seq
- ntohl(th
->seq
) + (th
->doff
*4);
2242 /* Is the urgent pointer pointing into this packet? */
2244 tp
->urg_data
= TCP_URG_VALID
| *(ptr
+ (unsigned char *) th
);
2246 sk
->data_ready(sk
,0);
2251 /* Clean the out_of_order queue if we can, trying to get
2252 * the socket within its memory limits again.
2254 * Return less than zero if we should start dropping frames
2255 * until the socket owning process reads some of the data
2256 * to stabilize the situation.
2258 static int prune_queue(struct sock
*sk
)
2260 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
2261 struct sk_buff
*skb
;
2264 SOCK_DEBUG(sk
, "prune_queue: c=%x\n", tp
->copied_seq
);
2266 NET_INC_STATS_BH(PruneCalled
);
2268 /* First, purge the out_of_order queue. */
2269 skb
= __skb_dequeue_tail(&tp
->out_of_order_queue
);
2274 net_statistics
[smp_processor_id()*2].OfoPruned
+= skb
->len
;
2276 skb
= __skb_dequeue_tail(&tp
->out_of_order_queue
);
2277 } while(skb
!= NULL
);
2279 /* Reset SACK state. A conforming SACK implementation will
2280 * do the same at a timeout based retransmit. When a connection
2281 * is in a sad state like this, we care only about integrity
2282 * of the connection not performance.
2288 /* If we are really being abused, tell the caller to silently
2289 * drop receive data on the floor. It will get retransmitted
2290 * and hopefully then we'll have sufficient space.
2292 * We used to try to purge the in-order packets too, but that
2293 * turns out to be deadly and fraught with races. Consider:
2295 * 1) If we acked the data, we absolutely cannot drop the
2296 * packet. This data would then never be retransmitted.
2297 * 2) It is possible, with a proper sequence of events involving
2298 * delayed acks and backlog queue handling, to have the user
2299 * read the data before it gets acked. The previous code
2300 * here got this wrong, and it lead to data corruption.
2301 * 3) Too much state changes happen when the FIN arrives, so once
2302 * we've seen that we can't remove any in-order data safely.
2304 * The net result is that removing in-order receive data is too
2305 * complex for anyones sanity. So we don't do it anymore. But
2306 * if we are really having our buffer space abused we stop accepting
2309 * 8) The arguments are interesting, but I even cannot imagine
2310 * what kind of arguments could force us to drop NICE, ALREADY
2311 * RECEIVED DATA only to get one more packet? --ANK
2313 * FIXME: it should recompute SACK state and only remove enough
2314 * buffers to get into bounds again. The current scheme loses
2315 * badly sometimes on links with large RTT, especially when
2316 * the driver has high overhead per skb.
2317 * (increasing the rcvbuf is not enough because it inflates the
2318 * the window too, disabling flow control effectively) -AK
2320 * Mmm... Why not to scale it seprately then? Just replace
2321 * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale
2322 * and adjust it dynamically, when TCP window flow control
2328 if(atomic_read(&sk
->rmem_alloc
) < (sk
->rcvbuf
<< 1))
2331 NET_INC_STATS_BH(RcvPruned
);
2333 /* Massive buffer overcommit. */
2337 static int tcp_copy_to_iovec(struct sock
*sk
, struct sk_buff
*skb
, int hlen
)
2339 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2340 int chunk
= skb
->len
- hlen
;
2344 if (skb
->ip_summed
==CHECKSUM_UNNECESSARY
)
2345 err
= memcpy_toiovec(tp
->ucopy
.iov
, skb
->h
.raw
+ hlen
, chunk
);
2347 err
= copy_and_csum_toiovec(tp
->ucopy
.iov
, skb
, hlen
);
2351 tp
->ucopy
.len
-= chunk
;
2352 tp
->copied_seq
+= chunk
;
2357 if (err
== -EFAULT
) {
2359 sk
->error_report(sk
);
2367 static int __tcp_checksum_complete_user(struct sock
*sk
, struct sk_buff
*skb
)
2371 if (sk
->lock
.users
) {
2373 result
= __tcp_checksum_complete(skb
);
2376 result
= __tcp_checksum_complete(skb
);
2381 static __inline__
int
2382 tcp_checksum_complete_user(struct sock
*sk
, struct sk_buff
*skb
)
2384 return skb
->ip_summed
!= CHECKSUM_UNNECESSARY
&&
2385 __tcp_checksum_complete_user(sk
, skb
);
2389 * TCP receive function for the ESTABLISHED state.
2391 * It is split into a fast path and a slow path. The fast path is
2393 * - A zero window was announced from us - zero window probing
2394 * is only handled properly in the slow path.
2395 * [ NOTE: actually, it was made incorrectly and nobody ever noticed
2396 * this! Reason is clear: 1. Correct senders do not send
2397 * to zero window. 2. Even if a sender sends to zero window,
2398 * nothing terrible occurs.
2400 * For now I cleaned this and fast path is really always disabled,
2401 * when window is zero, but I would be more happy to remove these
2402 * checks. Code will be only cleaner and _faster_. --ANK
2404 * Later note. I've just found that slow path also accepts
2405 * out of window segments, look at tcp_sequence(). So...
2406 * it is the last argument: I repair all and comment out
2407 * repaired code by TCP_FORMAL_WINDOW.
2408 * [ I remember one rhyme from a chidren's book. (I apologize,
2409 * the trasnlation is not rhymed 8)): people in one (jewish) village
2410 * decided to build sauna, but divided to two parties.
2411 * The first one insisted that battens should not be dubbed,
2412 * another objected that foots will suffer of splinters,
2413 * the first fended that dubbed wet battens are too slippy
2414 * and people will fall and it is much more serious!
2415 * Certaiinly, all they went to rabbi.
2416 * After some thinking, he judged: "Do not be lazy!
2417 * Certainly, dub the battens! But put them by dubbed surface down."
2421 * - Out of order segments arrived.
2422 * - Urgent data is expected.
2423 * - There is no buffer space left
2424 * - Unexpected TCP flags/window values/header lengths are received
2425 * (detected by checking the TCP header against pred_flags)
2426 * - Data is sent in both directions. Fast path only supports pure senders
2427 * or pure receivers (this means either the sequence number or the ack
2428 * value must stay constant)
2429 * - Unexpected TCP option.
2431 * When these conditions are not satisfied it drops into a standard
2432 * receive procedure patterned after RFC793 to handle all cases.
2433 * The first three cases are guaranteed by proper pred_flags setting,
2434 * the rest is checked inline. Fast processing is turned on in
2435 * tcp_data_queue when everything is OK.
2437 int tcp_rcv_established(struct sock
*sk
, struct sk_buff
*skb
,
2438 struct tcphdr
*th
, unsigned len
)
2440 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2443 * Header prediction.
2444 * The code losely follows the one in the famous
2445 * "30 instruction TCP receive" Van Jacobson mail.
2447 * Van's trick is to deposit buffers into socket queue
2448 * on a device interrupt, to call tcp_recv function
2449 * on the receive process context and checksum and copy
2450 * the buffer to user space. smart...
2452 * Our current scheme is not silly either but we take the
2453 * extra cost of the net_bh soft interrupt processing...
2454 * We do checksum and copy also but from device to kernel.
2457 /* RED-PEN. Using static variables to pass function arguments
2458 * cannot be good idea...
2462 /* pred_flags is 0xS?10 << 16 + snd_wnd
2463 * if header_predition is to be made
2464 * 'S' will always be tp->tcp_header_len >> 2
2465 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
2466 * turn it off (when there are holes in the receive
2467 * space for instance)
2468 * PSH flag is ignored.
2471 if ((tcp_flag_word(th
) & ~(TCP_RESERVED_BITS
|TCP_FLAG_PSH
)) == tp
->pred_flags
&&
2472 TCP_SKB_CB(skb
)->seq
== tp
->rcv_nxt
) {
2473 int tcp_header_len
= tp
->tcp_header_len
;
2475 /* Timestamp header prediction: tcp_header_len
2476 * is automatically equal to th->doff*4 due to pred_flags
2480 /* Check timestamp */
2481 if (tcp_header_len
== sizeof(struct tcphdr
) + TCPOLEN_TSTAMP_ALIGNED
) {
2482 __u32
*ptr
= (__u32
*)(th
+ 1);
2484 /* No? Slow path! */
2485 if (*ptr
!= __constant_ntohl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16)
2486 | (TCPOPT_TIMESTAMP
<< 8) | TCPOLEN_TIMESTAMP
))
2491 tp
->rcv_tsval
= ntohl(*ptr
);
2493 tp
->rcv_tsecr
= ntohl(*ptr
);
2495 /* If PAWS failed, check it more carefully in slow path */
2496 if ((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) < 0)
2499 /* Predicted packet is in window by definition.
2500 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
2501 * Hence, check seq<=rcv_wup reduces to:
2503 if (tp
->rcv_nxt
== tp
->rcv_wup
) {
2504 tp
->ts_recent
= tp
->rcv_tsval
;
2505 tp
->ts_recent_stamp
= xtime
.tv_sec
;
2509 if (len
<= tcp_header_len
) {
2510 /* Bulk data transfer: sender */
2511 if (len
== tcp_header_len
) {
2512 /* We know that such packets are checksummed
2515 tcp_ack(sk
, th
, TCP_SKB_CB(skb
)->seq
,
2516 TCP_SKB_CB(skb
)->ack_seq
, len
);
2518 tcp_data_snd_check(sk
);
2522 } else { /* Header too small */
2523 TCP_INC_STATS_BH(TcpInErrs
);
2526 } else if (TCP_SKB_CB(skb
)->ack_seq
== tp
->snd_una
) {
2529 if (tp
->ucopy
.task
== current
&&
2530 tp
->copied_seq
== tp
->rcv_nxt
&&
2531 len
- tcp_header_len
<= tp
->ucopy
.len
&&
2535 NET_INC_STATS_BH(TCPHPHitsToUser
);
2537 __set_current_state(TASK_RUNNING
);
2539 if (tcp_copy_to_iovec(sk
, skb
, tcp_header_len
))
2542 __skb_pull(skb
,tcp_header_len
);
2544 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->end_seq
;
2546 if (tcp_checksum_complete_user(sk
, skb
))
2549 if (atomic_read(&sk
->rmem_alloc
) > sk
->rcvbuf
)
2552 NET_INC_STATS_BH(TCPHPHits
);
2554 /* Bulk data transfer: receiver */
2555 __skb_pull(skb
,tcp_header_len
);
2557 /* DO NOT notify forward progress here.
2558 * It saves dozen of CPU instructions in fast path. --ANK
2559 * And where is it signaled then ? -AK
2562 __skb_queue_tail(&sk
->receive_queue
, skb
);
2563 skb_set_owner_r(skb
, sk
);
2565 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->end_seq
;
2567 /* FIN bit check is not done since if FIN is set in
2568 * this frame, the pred_flags won't match up. -DaveM
2570 sk
->data_ready(sk
, 0);
2573 tcp_event_data_recv(tp
, skb
);
2575 #ifdef TCP_MORE_COARSE_ACKS
2577 if (tcp_in_quickack_mode(tp
)) {
2580 tcp_send_delayed_ack(sk
);
2584 __tcp_ack_snd_check(sk
, 0);
2590 /* Packet is in sequence, flags are trivial;
2591 * only ACK is strange. Jump to step 5.
2593 if (tcp_checksum_complete_user(sk
, skb
))
2599 if (tcp_checksum_complete_user(sk
, skb
))
2603 * RFC1323: H1. Apply PAWS check first.
2605 if (tcp_fast_parse_options(sk
, th
, tp
) && tp
->saw_tstamp
&&
2606 tcp_paws_discard(tp
, skb
)) {
2608 NET_INC_STATS_BH(PAWSEstabRejected
);
2612 /* Resets are accepted even if PAWS failed.
2614 ts_recent update must be made after we are sure
2615 that the packet is in window.
2620 * Standard slow path.
2623 if (!tcp_sequence(tp
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
)) {
2624 /* RFC793, page 37: "In all states except SYN-SENT, all reset
2625 * (RST) segments are validated by checking their SEQ-fields."
2626 * And page 69: "If an incoming segment is not acceptable,
2627 * an acknowledgment should be sent in reply (unless the RST bit
2628 * is set, if so drop the segment and return)".
2632 if (after(TCP_SKB_CB(skb
)->seq
, tp
->rcv_nxt
)) {
2633 SOCK_DEBUG(sk
, "seq:%d end:%d wup:%d wnd:%d\n",
2634 TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
,
2635 tp
->rcv_wup
, tp
->rcv_wnd
);
2637 tcp_enter_quickack_mode(tp
);
2639 NET_INC_STATS_BH(DelayedACKLost
);
2648 if (tp
->saw_tstamp
) {
2649 tcp_replace_ts_recent(sk
, tp
,
2650 TCP_SKB_CB(skb
)->seq
);
2653 if(th
->syn
&& TCP_SKB_CB(skb
)->seq
!= tp
->syn_seq
) {
2654 SOCK_DEBUG(sk
, "syn in established state\n");
2655 TCP_INC_STATS_BH(TcpInErrs
);
2662 tcp_ack(sk
, th
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->ack_seq
, len
);
2664 /* Process urgent data. */
2665 tcp_urg(sk
, th
, len
);
2667 /* step 7: process the segment text */
2668 tcp_data(skb
, sk
, len
);
2670 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
2671 if(sk
->state
!= TCP_CLOSE
) {
2672 tcp_data_snd_check(sk
);
2673 tcp_ack_snd_check(sk
);
2681 TCP_INC_STATS_BH(TcpInErrs
);
2689 /* This is not only more efficient than what we used to do, it eliminates
2690 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
2692 * Actually, we could lots of memory writes here. tp of listening
2693 * socket contains all necessary default parameters.
2695 struct sock
*tcp_create_openreq_child(struct sock
*sk
, struct open_request
*req
, struct sk_buff
*skb
)
2697 struct sock
*newsk
= sk_alloc(PF_INET
, GFP_ATOMIC
, 0);
2700 struct tcp_opt
*newtp
;
2701 #ifdef CONFIG_FILTER
2702 struct sk_filter
*filter
;
2705 memcpy(newsk
, sk
, sizeof(*newsk
));
2706 newsk
->state
= TCP_SYN_RECV
;
2709 newsk
->pprev
= NULL
;
2712 /* Clone the TCP header template */
2713 newsk
->dport
= req
->rmt_port
;
2715 sock_lock_init(newsk
);
2716 bh_lock_sock(newsk
);
2718 atomic_set(&newsk
->rmem_alloc
, 0);
2719 skb_queue_head_init(&newsk
->receive_queue
);
2720 atomic_set(&newsk
->wmem_alloc
, 0);
2721 skb_queue_head_init(&newsk
->write_queue
);
2722 atomic_set(&newsk
->omem_alloc
, 0);
2726 newsk
->backlog
.head
= newsk
->backlog
.tail
= NULL
;
2727 skb_queue_head_init(&newsk
->error_queue
);
2728 newsk
->write_space
= tcp_write_space
;
2729 #ifdef CONFIG_FILTER
2730 if ((filter
= newsk
->filter
) != NULL
)
2731 sk_filter_charge(newsk
, filter
);
2734 /* Now setup tcp_opt */
2735 newtp
= &(newsk
->tp_pinfo
.af_tcp
);
2736 newtp
->pred_flags
= 0;
2737 newtp
->rcv_nxt
= req
->rcv_isn
+ 1;
2738 newtp
->snd_nxt
= req
->snt_isn
+ 1;
2739 newtp
->snd_una
= req
->snt_isn
+ 1;
2740 newtp
->snd_sml
= req
->snt_isn
+ 1;
2742 tcp_delack_init(newtp
);
2743 if (skb
->len
>= 536)
2744 newtp
->ack
.last_seg_size
= skb
->len
;
2746 tcp_prequeue_init(newtp
);
2748 newtp
->snd_wl1
= req
->rcv_isn
;
2749 newtp
->snd_wl2
= req
->snt_isn
;
2751 newtp
->retransmits
= 0;
2754 newtp
->mdev
= TCP_TIMEOUT_INIT
;
2755 newtp
->rto
= TCP_TIMEOUT_INIT
;
2757 newtp
->packets_out
= 0;
2758 newtp
->fackets_out
= 0;
2759 newtp
->retrans_out
= 0;
2760 newtp
->snd_ssthresh
= 0x7fffffff;
2762 /* So many TCP implementations out there (incorrectly) count the
2763 * initial SYN frame in their delayed-ACK and congestion control
2764 * algorithms that we must have the following bandaid to talk
2765 * efficiently to them. -DaveM
2767 newtp
->snd_cwnd
= 2;
2768 newtp
->snd_cwnd_cnt
= 0;
2769 newtp
->high_seq
= 0;
2771 newtp
->dup_acks
= 0;
2772 tcp_init_xmit_timers(newsk
);
2773 skb_queue_head_init(&newtp
->out_of_order_queue
);
2774 newtp
->send_head
= newtp
->retrans_head
= NULL
;
2775 newtp
->rcv_wup
= req
->rcv_isn
+ 1;
2776 newtp
->write_seq
= req
->snt_isn
+ 1;
2777 newtp
->copied_seq
= req
->rcv_isn
+ 1;
2779 newtp
->saw_tstamp
= 0;
2781 newtp
->probes_out
= 0;
2782 newtp
->num_sacks
= 0;
2783 newtp
->syn_seq
= req
->rcv_isn
;
2784 newtp
->fin_seq
= req
->rcv_isn
;
2785 newtp
->urg_data
= 0;
2786 newtp
->listen_opt
= NULL
;
2787 newtp
->accept_queue
= newtp
->accept_queue_tail
= NULL
;
2788 /* Deinitialize syn_wait_lock to trap illegal accesses. */
2789 memset(&newtp
->syn_wait_lock
, 0, sizeof(newtp
->syn_wait_lock
));
2791 /* Back to base struct sock members. */
2793 newsk
->priority
= 0;
2794 atomic_set(&newsk
->refcnt
, 1);
2795 #ifdef INET_REFCNT_DEBUG
2796 atomic_inc(&inet_sock_nr
);
2799 if (newsk
->keepopen
)
2800 tcp_reset_keepalive_timer(newsk
, keepalive_time_when(newtp
));
2801 newsk
->socket
= NULL
;
2802 newsk
->sleep
= NULL
;
2804 newtp
->tstamp_ok
= req
->tstamp_ok
;
2805 if((newtp
->sack_ok
= req
->sack_ok
) != 0)
2806 newtp
->num_sacks
= 0;
2807 newtp
->window_clamp
= req
->window_clamp
;
2808 newtp
->rcv_wnd
= req
->rcv_wnd
;
2809 newtp
->wscale_ok
= req
->wscale_ok
;
2810 if (newtp
->wscale_ok
) {
2811 newtp
->snd_wscale
= req
->snd_wscale
;
2812 newtp
->rcv_wscale
= req
->rcv_wscale
;
2814 newtp
->snd_wscale
= newtp
->rcv_wscale
= 0;
2815 newtp
->window_clamp
= min(newtp
->window_clamp
,65535);
2817 newtp
->snd_wnd
= ntohs(skb
->h
.th
->window
) << newtp
->snd_wscale
;
2818 newtp
->max_window
= newtp
->snd_wnd
;
2820 if (newtp
->tstamp_ok
) {
2821 newtp
->ts_recent
= req
->ts_recent
;
2822 newtp
->ts_recent_stamp
= xtime
.tv_sec
;
2823 newtp
->tcp_header_len
= sizeof(struct tcphdr
) + TCPOLEN_TSTAMP_ALIGNED
;
2825 newtp
->ts_recent_stamp
= 0;
2826 newtp
->tcp_header_len
= sizeof(struct tcphdr
);
2828 newtp
->mss_clamp
= req
->mss
;
2834 * Process an incoming packet for SYN_RECV sockets represented
2835 * as an open_request.
2838 struct sock
*tcp_check_req(struct sock
*sk
,struct sk_buff
*skb
,
2839 struct open_request
*req
,
2840 struct open_request
**prev
)
2842 struct tcphdr
*th
= skb
->h
.th
;
2843 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2844 u32 flg
= tcp_flag_word(th
) & (TCP_FLAG_RST
|TCP_FLAG_SYN
|TCP_FLAG_ACK
);
2845 int paws_reject
= 0;
2850 if (th
->doff
> (sizeof(struct tcphdr
)>>2)) {
2851 tcp_parse_options(NULL
, th
, &ttp
, 0);
2853 if (ttp
.saw_tstamp
) {
2854 ttp
.ts_recent
= req
->ts_recent
;
2855 /* We do not store true stamp, but it is not required,
2856 * it can be estimated (approximately)
2857 * from another data.
2859 ttp
.ts_recent_stamp
= xtime
.tv_sec
- ((TCP_TIMEOUT_INIT
/HZ
)<<req
->retrans
);
2860 paws_reject
= tcp_paws_check(&ttp
, th
->rst
);
2864 /* Check for pure retransmited SYN. */
2865 if (TCP_SKB_CB(skb
)->seq
== req
->rcv_isn
&&
2866 flg
== TCP_FLAG_SYN
&&
2869 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
2870 * this case on figure 6 and figure 8, but formal
2871 * protocol description says NOTHING.
2872 * To be more exact, it says that we should send ACK,
2873 * because this segment (at least, if it has no data)
2876 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
2877 * describe SYN-RECV state. All the description
2878 * is wrong, we cannot believe to it and should
2879 * rely only on common sense and implementation
2882 * Enforce "SYN-ACK" according to figure 8, figure 6
2883 * of RFC793, fixed by RFC1122.
2885 req
->class->rtx_syn_ack(sk
, req
, NULL
);
2889 /* Further reproduces section "SEGMENT ARRIVES"
2890 for state SYN-RECEIVED of RFC793.
2891 It is broken, however, it does not work only
2892 when SYNs are crossed, which is impossible in our
2895 But generally, we should (RFC lies!) to accept ACK
2896 from SYNACK both here and in tcp_rcv_state_process().
2897 tcp_rcv_state_process() does not, hence, we do not too.
2899 Note that the case is absolutely generic:
2900 we cannot optimize anything here without
2901 violating protocol. All the checks must be made
2902 before attempt to create socket.
2905 /* RFC793: "first check sequence number". */
2907 if (paws_reject
|| !tcp_in_window(TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
,
2908 req
->rcv_isn
+1, req
->rcv_isn
+1+req
->rcv_wnd
)) {
2909 /* Out of window: send ACK and drop. */
2910 if (!(flg
& TCP_FLAG_RST
))
2911 req
->class->send_ack(skb
, req
);
2913 NET_INC_STATS_BH(PAWSEstabRejected
);
2917 /* In sequence, PAWS is OK. */
2919 if (ttp
.saw_tstamp
&& !after(TCP_SKB_CB(skb
)->seq
, req
->rcv_isn
+1))
2920 req
->ts_recent
= ttp
.rcv_tsval
;
2922 if (TCP_SKB_CB(skb
)->seq
== req
->rcv_isn
) {
2923 /* Truncate SYN, it is out of window starting
2924 at req->rcv_isn+1. */
2925 flg
&= ~TCP_FLAG_SYN
;
2928 /* RFC793: "second check the RST bit" and
2929 * "fourth, check the SYN bit"
2931 if (flg
& (TCP_FLAG_RST
|TCP_FLAG_SYN
))
2932 goto embryonic_reset
;
2934 /* RFC793: "fifth check the ACK field" */
2936 if (!(flg
& TCP_FLAG_ACK
))
2939 /* Invalid ACK: reset will be sent by listening socket */
2940 if (TCP_SKB_CB(skb
)->ack_seq
!= req
->snt_isn
+1)
2942 /* Also, it would be not so bad idea to check rcv_tsecr, which
2943 * is essentially ACK extension and too early or too late values
2944 * should cause reset in unsynchronized states.
2947 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
2948 if (tp
->defer_accept
&& TCP_SKB_CB(skb
)->end_seq
== req
->rcv_isn
+1) {
2953 /* OK, ACK is valid, create big socket and
2954 * feed this segment to it. It will repeat all
2955 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
2956 * ESTABLISHED STATE. If it will be dropped after
2957 * socket is created, wait for troubles.
2959 child
= tp
->af_specific
->syn_recv_sock(sk
, skb
, req
, NULL
);
2961 goto listen_overflow
;
2963 tcp_synq_unlink(tp
, req
, prev
);
2964 tcp_synq_removed(sk
, req
);
2966 tcp_acceptq_queue(sk
, req
, child
);
2970 if (!sysctl_tcp_abort_on_overflow
) {
2976 NET_INC_STATS_BH(EmbryonicRsts
);
2977 if (!(flg
& TCP_FLAG_RST
))
2978 req
->class->send_reset(skb
);
2980 tcp_synq_drop(sk
, req
, prev
);
2985 * Queue segment on the new socket if the new socket is active,
2986 * otherwise we just shortcircuit this and continue with
2990 int tcp_child_process(struct sock
*parent
, struct sock
*child
,
2991 struct sk_buff
*skb
)
2994 int state
= child
->state
;
2996 if (child
->lock
.users
== 0) {
2997 ret
= tcp_rcv_state_process(child
, skb
, skb
->h
.th
, skb
->len
);
2999 /* Wakeup parent, send SIGIO */
3000 if (state
== TCP_SYN_RECV
&& child
->state
!= state
)
3001 parent
->data_ready(parent
, 0);
3003 /* Alas, it is possible again, because we do lookup
3004 * in main socket hash table and lock on listening
3005 * socket does not protect us more.
3007 sk_add_backlog(child
, skb
);
3010 bh_unlock_sock(child
);
3014 static int tcp_rcv_synsent_state_process(struct sock
*sk
, struct sk_buff
*skb
,
3015 struct tcphdr
*th
, unsigned len
)
3017 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
3019 tcp_parse_options(sk
, th
, tp
, 0);
3023 * "If the state is SYN-SENT then
3024 * first check the ACK bit
3025 * If the ACK bit is set
3026 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
3027 * a reset (unless the RST bit is set, if so drop
3028 * the segment and return)"
3030 * I cite this place to emphasize one essential
3031 * detail, this check is different of one
3032 * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
3033 * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
3034 * because we have no previous data sent before SYN.
3037 * We do not send data with SYN, so that RFC-correct
3040 if (TCP_SKB_CB(skb
)->ack_seq
!= tp
->snd_nxt
)
3043 /* Check not from any RFC, but it is evident consequence
3044 * of combining PAWS and usual SYN-SENT logic: ACK _is_
3045 * checked in SYN-SENT unlike another states, hence
3046 * echoed tstamp must be checked too.
3048 if (tp
->saw_tstamp
) {
3049 if (tp
->rcv_tsecr
== 0) {
3050 /* Workaround for bug in linux-2.1 and early
3051 * 2.2 kernels. Let's pretend that we did not
3052 * see such timestamp to avoid bogus rtt value,
3053 * calculated by tcp_ack().
3057 /* But do not forget to store peer's timestamp! */
3059 tp
->ts_recent
= tp
->rcv_tsval
;
3060 tp
->ts_recent_stamp
= xtime
.tv_sec
;
3062 } else if ((__s32
)(tp
->rcv_tsecr
- tcp_time_stamp
) > 0 ||
3063 (__s32
)(tp
->rcv_tsecr
- tp
->syn_stamp
) < 0) {
3064 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG
"TCP: synsent reject.\n"));
3065 NET_INC_STATS_BH(PAWSActiveRejected
);
3070 /* Now ACK is acceptable.
3072 * "If the RST bit is set
3073 * If the ACK was acceptable then signal the user "error:
3074 * connection reset", drop the segment, enter CLOSED state,
3075 * delete TCB, and return."
3084 * "fifth, if neither of the SYN or RST bits is set then
3085 * drop the segment and return."
3094 * "If the SYN bit is on ...
3095 * are acceptable then ...
3096 * (our SYN has been ACKed), change the connection
3097 * state to ESTABLISHED..."
3099 * Do you see? SYN-less ACKs in SYN-SENT state are
3100 * completely ignored.
3102 * The bug causing stalled SYN-SENT sockets
3103 * was here: tcp_ack advanced snd_una and canceled
3104 * retransmit timer, so that bare ACK received
3105 * in SYN-SENT state (even with invalid ack==ISS,
3106 * because tcp_ack check is too weak for SYN-SENT)
3107 * causes moving socket to invalid semi-SYN-SENT,
3108 * semi-ESTABLISHED state and connection hangs.
3111 * Bare ACK is valid, however.
3112 * Actually, RFC793 requires to send such ACK
3113 * in reply to any out of window packet.
3114 * It is wrong, but Linux also send such
3115 * useless ACKs sometimes.
3119 tp
->snd_wl1
= TCP_SKB_CB(skb
)->seq
;
3120 tcp_ack(sk
,th
, TCP_SKB_CB(skb
)->seq
,
3121 TCP_SKB_CB(skb
)->ack_seq
, len
);
3123 /* Ok.. it's good. Set up sequence numbers and
3124 * move to established.
3126 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->seq
+1;
3127 tp
->rcv_wup
= TCP_SKB_CB(skb
)->seq
+1;
3129 /* RFC1323: The window in SYN & SYN/ACK segments is
3132 tp
->snd_wnd
= ntohs(th
->window
);
3133 tp
->snd_wl1
= TCP_SKB_CB(skb
)->seq
;
3134 tp
->snd_wl2
= TCP_SKB_CB(skb
)->ack_seq
;
3135 tp
->fin_seq
= TCP_SKB_CB(skb
)->seq
;
3137 tcp_set_state(sk
, TCP_ESTABLISHED
);
3139 if (tp
->wscale_ok
== 0) {
3140 tp
->snd_wscale
= tp
->rcv_wscale
= 0;
3141 tp
->window_clamp
= min(tp
->window_clamp
,65535);
3144 if (tp
->tstamp_ok
) {
3145 tp
->tcp_header_len
=
3146 sizeof(struct tcphdr
) + TCPOLEN_TSTAMP_ALIGNED
;
3148 tp
->tcp_header_len
= sizeof(struct tcphdr
);
3149 if (tp
->saw_tstamp
) {
3150 tp
->ts_recent
= tp
->rcv_tsval
;
3151 tp
->ts_recent_stamp
= xtime
.tv_sec
;
3153 tcp_sync_mss(sk
, tp
->pmtu_cookie
);
3154 tcp_initialize_rcv_mss(sk
);
3155 tcp_init_metrics(sk
);
3156 tcp_init_buffer_space(sk
);
3159 tcp_reset_keepalive_timer(sk
, keepalive_time_when(tp
));
3161 tp
->copied_seq
= tp
->rcv_nxt
;
3162 __tcp_fast_path_on(tp
, tp
->snd_wnd
);
3165 sk
->state_change(sk
);
3166 sk_wake_async(sk
, 0, POLL_OUT
);
3169 if (tp
->write_pending
) {
3170 /* Save one ACK. Data will be ready after
3171 * several ticks, if write_pending is set.
3173 * It may be deleted, but with this feature tcpdumps
3174 * look so _wonderfully_ clever, that I was not able
3175 * to stand against the temptation 8) --ANK
3177 tp
->ack
.pending
= 1;
3178 tp
->ack
.lrcvtime
= tcp_time_stamp
;
3179 tcp_enter_quickack_mode(tp
);
3180 tp
->ack
.ato
= TCP_ATO_MIN
;
3181 tcp_reset_xmit_timer(sk
, TCP_TIME_DACK
, TCP_DELACK_MIN
);
3189 /* No ACK in the segment */
3193 * "If the RST bit is set
3195 * Otherwise (no ACK) drop the segment and return."
3202 if (tp
->ts_recent_stamp
&& tp
->saw_tstamp
&& tcp_paws_check(tp
, 0))
3206 /* We see SYN without ACK. It is attempt of
3207 * simultaneous connect with crossed SYNs.
3209 * The previous version of the code
3210 * checked for "connecting to self"
3211 * here. that check is done now in
3214 * RED-PEN: BTW, it does not. 8)
3216 tcp_set_state(sk
, TCP_SYN_RECV
);
3217 if (tp
->saw_tstamp
) {
3218 tp
->ts_recent
= tp
->rcv_tsval
;
3219 tp
->ts_recent_stamp
= xtime
.tv_sec
;
3222 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->seq
+ 1;
3223 tp
->rcv_wup
= TCP_SKB_CB(skb
)->seq
+ 1;
3225 /* RFC1323: The window in SYN & SYN/ACK segments is
3228 tp
->snd_wnd
= ntohs(th
->window
);
3229 tp
->snd_wl1
= TCP_SKB_CB(skb
)->seq
;
3230 tp
->max_window
= tp
->snd_wnd
;
3232 tcp_sync_mss(sk
, tp
->pmtu_cookie
);
3233 tcp_initialize_rcv_mss(sk
);
3235 tcp_send_synack(sk
);
3237 /* Note, we could accept data and URG from this segment.
3238 * There are no obstacles to make this.
3240 * However, if we ignore data in ACKless segments sometimes,
3241 * we have no reasons to accept it sometimes.
3242 * Also, seems the code doing it in step6 of tcp_rcv_state_process
3243 * is not flawless. So, discard packet for sanity.
3244 * Uncomment this return to process the data.
3249 /* "fifth, if neither of the SYN or RST bits is set then
3250 * drop the segment and return."
3260 * This function implements the receiving procedure of RFC 793 for
3261 * all states except ESTABLISHED and TIME_WAIT.
3262 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
3263 * address independent.
3266 int tcp_rcv_state_process(struct sock
*sk
, struct sk_buff
*skb
,
3267 struct tcphdr
*th
, unsigned len
)
3269 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
3274 switch (sk
->state
) {
3276 /* When state == CLOSED, hash lookup always fails.
3278 * But, there is a back door, the backlog queue.
3279 * If we have a sequence of packets in the backlog
3280 * during __release_sock() which have a sequence such
3282 * packet X causes entry to TCP_CLOSE state
3284 * packet X + N has FIN bit set
3286 * We report a (luckily) harmless error in this case.
3287 * The issue is that backlog queue processing bypasses
3288 * any hash lookups (we know which socket packets are for).
3289 * The correct behavior here is what 2.0.x did, since
3290 * a TCP_CLOSE socket does not exist. Drop the frame
3291 * and send a RST back to the other end.
3294 /* 1. The socket may be moved to TIME-WAIT state.
3295 2. While this socket was locked, another socket
3296 with the same identity could be created.
3299 CONCLUSION: discard and only discard!
3301 Alternative would be relookup and recurse into tcp_v?_rcv
3302 (not *_do_rcv) to work with timewait and listen states
3312 if(tp
->af_specific
->conn_request(sk
, skb
) < 0)
3315 /* Now we have several options: In theory there is
3316 * nothing else in the frame. KA9Q has an option to
3317 * send data with the syn, BSD accepts data with the
3318 * syn up to the [to be] advertised window and
3319 * Solaris 2.1 gives you a protocol error. For now
3320 * we just ignore it, that fits the spec precisely
3321 * and avoids incompatibilities. It would be nice in
3322 * future to drop through and process the data.
3324 * Now that TTCP is starting to be used we ought to
3326 * But, this leaves one open to an easy denial of
3327 * service attack, and SYN cookies can't defend
3328 * against this problem. So, we drop the data
3329 * in the interest of security over speed.
3336 queued
= tcp_rcv_synsent_state_process(sk
, skb
, th
, len
);
3343 /* Parse the tcp_options present on this header.
3344 * By this point we really only expect timestamps.
3345 * Note that this really has to be here and not later for PAWS
3346 * (RFC1323) to work.
3348 if (tcp_fast_parse_options(sk
, th
, tp
) && tp
->saw_tstamp
&&
3349 tcp_paws_discard(tp
, skb
)) {
3354 /* Reset is accepted even if it did not pass PAWS. */
3357 /* The silly FIN test here is necessary to see an advancing ACK in
3358 * retransmitted FIN frames properly. Consider the following sequence:
3360 * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
3361 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
3362 * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
3363 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
3365 * At this point the connection will deadlock with host1 believing
3366 * that his FIN is never ACK'd, and thus it will retransmit its FIN
3367 * forever. The following fix is from Taral (taral@taral.net).
3369 * RED-PEN. Seems, the above is not true.
3370 * If at least one end is RFC compliant, it will send ACK to
3371 * out of window FIN and, hence, move peer to TIME-WAIT.
3372 * I comment out this line. --ANK
3374 * RED-PEN. DANGER! tcp_sequence check rejects also SYN-ACKs
3375 * received in SYN-RECV. The problem is that description of
3376 * segment processing in SYN-RECV state in RFC793 is WRONG.
3377 * Correct check would accept ACK from this SYN-ACK, see
3378 * figures 6 and 8 (fixed by RFC1122). Compare this
3379 * to problem with FIN, they smell similarly. --ANK
3382 /* step 1: check sequence number */
3383 if (!tcp_sequence(tp
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
)
3385 && !(th
->fin
&& TCP_SKB_CB(skb
)->end_seq
== tp
->rcv_nxt
)
3389 NET_INC_STATS_BH(DelayedACKLost
);
3390 tcp_enter_quickack_mode(tp
);
3396 /* step 2: check RST bit */
3402 if (tp
->saw_tstamp
) {
3403 tcp_replace_ts_recent(sk
, tp
,
3404 TCP_SKB_CB(skb
)->seq
);
3407 /* step 3: check security and precedence [ignored] */
3411 * Check for a SYN, and ensure it matches the SYN we were
3412 * first sent. We have to handle the rather unusual (but valid)
3413 * sequence that KA9Q derived products may generate of
3418 * SYN|ACK Data + More Data
3419 * .. we must ACK not RST...
3421 * We keep syn_seq as the sequence space occupied by the
3425 if (th
->syn
&& TCP_SKB_CB(skb
)->seq
!= tp
->syn_seq
) {
3430 /* step 5: check the ACK field */
3432 int acceptable
= tcp_ack(sk
, th
, TCP_SKB_CB(skb
)->seq
,
3433 TCP_SKB_CB(skb
)->ack_seq
, len
);
3438 tcp_set_state(sk
, TCP_ESTABLISHED
);
3439 tp
->copied_seq
= tp
->rcv_nxt
;
3441 /* Note, that this wakeup is only for marginal
3442 * crossed SYN case. Passively open sockets
3443 * are not waked up, because sk->sleep == NULL
3444 * and sk->socket == NULL.
3447 sk
->state_change(sk
);
3448 sk_wake_async(sk
,0,POLL_OUT
);
3451 tp
->snd_una
= TCP_SKB_CB(skb
)->ack_seq
;
3452 tp
->snd_wnd
= ntohs(th
->window
) << tp
->snd_wscale
;
3453 tp
->snd_wl1
= TCP_SKB_CB(skb
)->seq
;
3454 tp
->snd_wl2
= TCP_SKB_CB(skb
)->ack_seq
;
3456 /* tcp_ack considers this ACK as duplicate
3457 * and does not calculate rtt.
3458 * Fix it at least with timestamps.
3460 if (tp
->saw_tstamp
&& !tp
->srtt
)
3461 tcp_ack_saw_tstamp(sk
, tp
, 0, 0, FLAG_SYN_ACKED
);
3463 tcp_init_metrics(sk
);
3464 tcp_fast_path_on(tp
);
3466 SOCK_DEBUG(sk
, "bad ack\n");
3472 if (tp
->snd_una
== tp
->write_seq
) {
3473 tcp_set_state(sk
, TCP_FIN_WAIT2
);
3474 sk
->shutdown
|= SEND_SHUTDOWN
;
3475 dst_confirm(sk
->dst_cache
);
3478 /* Wake up lingering close() */
3479 sk
->state_change(sk
);
3483 if (tp
->linger2
< 0 ||
3484 (TCP_SKB_CB(skb
)->end_seq
!= TCP_SKB_CB(skb
)->seq
&&
3485 after(TCP_SKB_CB(skb
)->end_seq
- th
->fin
, tp
->rcv_nxt
))) {
3490 tmo
= tcp_fin_time(tp
);
3491 if (tmo
> TCP_TIMEWAIT_LEN
) {
3492 tcp_reset_keepalive_timer(sk
, tmo
- TCP_TIMEWAIT_LEN
);
3493 } else if (th
->fin
|| sk
->lock
.users
) {
3494 /* Bad case. We could lose such FIN otherwise.
3495 * It is not a big problem, but it looks confusing
3496 * and not so rare event. We still can lose it now,
3497 * if it spins in bh_lock_sock(), but it is really
3500 tcp_reset_keepalive_timer(sk
, tmo
);
3502 tcp_time_wait(sk
, TCP_FIN_WAIT2
, tmo
);
3510 if (tp
->snd_una
== tp
->write_seq
) {
3511 tcp_time_wait(sk
, TCP_TIME_WAIT
, 0);
3517 if (tp
->snd_una
== tp
->write_seq
) {
3518 tcp_update_metrics(sk
);
3528 /* step 6: check the URG bit */
3529 tcp_urg(sk
, th
, len
);
3531 /* step 7: process the segment text */
3532 switch (sk
->state
) {
3533 case TCP_CLOSE_WAIT
:
3535 if (!before(TCP_SKB_CB(skb
)->seq
, tp
->fin_seq
))
3539 /* RFC 793 says to queue data in these states,
3540 * RFC 1122 says we MUST send a reset.
3541 * BSD 4.4 also does reset.
3543 if (sk
->shutdown
& RCV_SHUTDOWN
) {
3544 if (TCP_SKB_CB(skb
)->end_seq
!= TCP_SKB_CB(skb
)->seq
&&
3545 after(TCP_SKB_CB(skb
)->end_seq
- th
->fin
, tp
->rcv_nxt
)) {
3551 case TCP_ESTABLISHED
:
3552 tcp_data(skb
, sk
, len
);
3557 /* tcp_data could move socket to TIME-WAIT */
3558 if (sk
->state
!= TCP_CLOSE
) {
3559 tcp_data_snd_check(sk
);
3560 tcp_ack_snd_check(sk
);