[davej-history.git] / net / ipv4 / tcp_input.c (blob 1ebcf7f48de7d4f82d7398fd9ac0e6fb50b94531, pre-2.3.4)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.165 1999/05/14 23:10:08 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
60 #include <linux/config.h>
61 #include <linux/mm.h>
62 #include <linux/sysctl.h>
63 #include <net/tcp.h>
64 #include <linux/ipsec.h>
66 #ifdef CONFIG_SYSCTL
67 #define SYNC_INIT 0 /* let the user enable it */
68 #else
69 #define SYNC_INIT 1
70 #endif
72 extern int sysctl_tcp_fin_timeout;
74 /* These are on by default so the code paths get tested.
75 * For the final 2.2 this may be undone at our discretion. -DaveM
77 int sysctl_tcp_timestamps = 1;
78 int sysctl_tcp_window_scaling = 1;
79 int sysctl_tcp_sack = 1;
81 int sysctl_tcp_syncookies = SYNC_INIT;
82 int sysctl_tcp_stdurg;
83 int sysctl_tcp_rfc1337;
85 static int prune_queue(struct sock *sk);
87 /* There is something which you must keep in mind when you analyze the
88 * behavior of the tp->ato delayed ack timeout interval. When a
89 * connection starts up, we want to ack as quickly as possible. The
90 * problem is that "good" TCP's do slow start at the beginning of data
91 * transmission. This means that until we send the first few ACK's the
92 * sender will sit on his end and only queue most of his data, because
93 * he can only send snd_cwnd unacked packets at any given time. For
94 * each ACK we send, he increments snd_cwnd and transmits more of his
95 * queue. -DaveM
97 static void tcp_delack_estimator(struct tcp_opt *tp)
99 if(tp->ato == 0) {
100 tp->lrcvtime = tcp_time_stamp;
102 /* Help sender leave slow start quickly,
103 * and also makes sure we do not take this
104 * branch ever again for this connection.
106 tp->ato = 1;
107 tcp_enter_quickack_mode(tp);
108 } else {
109 int m = tcp_time_stamp - tp->lrcvtime;
111 tp->lrcvtime = tcp_time_stamp;
112 if(m <= 0)
113 m = 1;
114 if(m > tp->rto)
115 tp->ato = tp->rto;
116 else {
117 /* This funny shift makes sure we
118 * clear the "quick ack mode" bit.
120 tp->ato = ((tp->ato << 1) >> 2) + m;
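/* Editor's note, not part of the original file: a minimal stand-alone
 * sketch of the ato update above, assuming the "quick ack mode" flag is
 * kept in bit 31 of ato (as the 0x80000000/0x7fffffff masks in
 * tcp_remember_ack() below suggest).
 */
#if 0
static unsigned int ato_update_sketch(unsigned int ato, unsigned int m)
{
	/* (ato << 1) shifts the flag bit out, the >> 2 then halves what is
	 * left, so the new interval is roughly ato/2 + m, an exponentially
	 * weighted average of the observed inter-arrival time m.
	 */
	return ((ato << 1) >> 2) + m;
}
#endif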
126 * Remember to send an ACK later.
128 static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
129 struct sk_buff *skb)
131 tp->delayed_acks++;
133 /* Tiny-grams with PSH set artificially deflate our
134 * ato measurement, but with a lower bound.
136 if(th->psh && (skb->len < (tp->mss_cache >> 1))) {
137 /* Preserve the quickack state. */
138 if((tp->ato & 0x7fffffff) > HZ/50)
139 tp->ato = ((tp->ato & 0x80000000) |
140 (HZ/50));
144 /* Called to compute a smoothed rtt estimate. The data fed to this
145 * routine either comes from timestamps, or from segments that were
146 * known _not_ to have been retransmitted [see Karn/Partridge
147 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
148 * piece by Van Jacobson.
149 * NOTE: the next three routines used to be one big routine.
150 * To save cycles in the RFC 1323 implementation it was better to break
151 * it up into three procedures. -- erics
154 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
156 long m = mrtt; /* RTT */
158 /* The following amusing code comes from Jacobson's
159 * article in SIGCOMM '88. Note that rtt and mdev
160 * are scaled versions of rtt and mean deviation.
161 * This is designed to be as fast as possible
162 * m stands for "measurement".
164 * In a 1990 paper the rto value is changed to:
165 * RTO = rtt + 4 * mdev
167 if(m == 0)
168 m = 1;
169 if (tp->srtt != 0) {
170 m -= (tp->srtt >> 3); /* m is now error in rtt est */
171 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
172 if (m < 0)
173 m = -m; /* m is now abs(error) */
174 m -= (tp->mdev >> 2); /* similar update on mdev */
175 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
176 } else {
177 /* no previous measure. */
178 tp->srtt = m<<3; /* take the measured time to be rtt */
179 tp->mdev = m<<2; /* make sure rto = 3*rtt */
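/* Editor's note, not part of the original file: the same update written as
 * a stand-alone sketch, to make the fixed-point scaling explicit.  srtt is
 * stored as 8*RTT and mdev as 4*mdev, which is why a first measurement m
 * is recorded as m<<3 and m<<2.
 */
#if 0
static void rtt_estimator_sketch(long *srtt, long *mdev, long m)
{
	if (*srtt != 0) {
		m -= (*srtt >> 3);	/* error vs. the scaled average      */
		*srtt += m;		/* unscaled: rtt = 7/8 rtt + 1/8 new */
		if (m < 0)
			m = -m;
		m -= (*mdev >> 2);
		*mdev += m;		/* unscaled: mdev = 3/4 mdev + 1/4 err */
	} else {
		*srtt = m << 3;		/* scaled by 8 */
		*mdev = m << 2;		/* scaled by 4 */
	}
}
#endif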
183 /* Calculate rto without backoff. This is the second half of Van Jacobson's
184 * routine referred to above.
187 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
189 tp->rto = (tp->srtt >> 3) + tp->mdev;
190 tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
194 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
195 * on packet lifetime in the internet. We need the HZ/5 lower
196 * bound to behave correctly against BSD stacks with a fixed
197 * delayed ack.
198 * FIXME: It's not entirely clear this lower bound is the best
199 * way to avoid the problem. Is it possible to drop the lower
200 * bound and still avoid trouble with BSD stacks? Perhaps
201 * some modification to the RTO calculation that takes delayed
202 * ack bias into account? This needs serious thought. -- erics
204 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
206 if (tp->rto > 120*HZ)
207 tp->rto = 120*HZ;
208 if (tp->rto < HZ/5)
209 tp->rto = HZ/5;
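/* Editor's note, not part of the original file: how the three routines
 * above combine, under the same scaling (srtt = 8*RTT, mdev = 4*mdev).
 * For example, with HZ = 100, srtt = 240 and mdev = 100 the base value is
 * srtt/8 + mdev = 30 + 100 = 130 ticks; tcp_set_rto() then adds rto/4 and
 * rto >> (snd_cwnd - 1), and tcp_bound_rto() clamps the result to the
 * range [HZ/5, 120*HZ].
 */
#if 0
static unsigned long rto_sketch(unsigned long srtt, unsigned long mdev,
				unsigned long snd_cwnd)
{
	unsigned long rto = (srtt >> 3) + mdev;

	rto += (rto >> 2) + (rto >> (snd_cwnd - 1));
	if (rto > 120 * HZ)
		rto = 120 * HZ;
	if (rto < HZ / 5)
		rto = HZ / 5;
	return rto;
}
#endif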
212 /* WARNING: this must not be called if tp->saw_timestamp was false. */
213 extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
214 __u32 start_seq, __u32 end_seq)
216 /* From draft-ietf-tcplw-high-performance: the correct
217 * test is last_ack_sent <= end_seq.
218 * (RFC1323 stated last_ack_sent < end_seq.)
220 * HOWEVER: The current check contradicts the draft statements.
221 * It has been done for good reasons.
222 * The implemented check improves security and eliminates
223 * unnecessary RTT overestimation.
224 * 1998/06/27 Andrey V. Savochkin <saw@msu.ru>
226 if (!before(end_seq, tp->last_ack_sent - sk->rcvbuf) &&
227 !after(start_seq, tp->rcv_wup + tp->rcv_wnd)) {
228 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
229 * extra check below makes sure this can only happen
230 * for pure ACK frames. -DaveM
232 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
233 tp->ts_recent = tp->rcv_tsval;
234 tp->ts_recent_stamp = tcp_time_stamp;
239 #define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
241 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
243 /* ts_recent must be younger than 24 days */
244 return (((s32)(tcp_time_stamp - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
245 (((s32)(tp->rcv_tsval - tp->ts_recent) < 0) &&
246 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
247 (len != (th->doff * 4))));
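/* Editor's note, not part of the original file: the (s32) casts above are
 * what make the timestamp comparisons safe across 32-bit wrap-around, in
 * the usual serial-number-arithmetic way.
 */
#if 0
static int ts_before_sketch(__u32 a, __u32 b)
{
	/* Negative difference (taken as signed) means a is "before" b,
	 * valid while the two values are less than 2^31 apart; e.g.
	 * 0x00000001 counts as after 0xffffffff.
	 */
	return (s32)(a - b) < 0;
}
#endif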
251 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
253 u32 end_window = tp->rcv_wup + tp->rcv_wnd;
255 if (tp->rcv_wnd &&
256 after(end_seq, tp->rcv_nxt) &&
257 before(seq, end_window))
258 return 1;
259 if (seq != end_window)
260 return 0;
261 return (seq == end_seq);
264 /* This function checks to see if the tcp header is actually acceptable. */
265 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
267 if (seq == tp->rcv_nxt)
268 return (tp->rcv_wnd || (end_seq == seq));
270 return __tcp_sequence(tp, seq, end_seq);
273 /* When we get a reset we do this. */
274 static void tcp_reset(struct sock *sk)
276 sk->zapped = 1;
278 /* We want the right error as BSD sees it (and indeed as we do). */
279 switch (sk->state) {
280 case TCP_SYN_SENT:
281 sk->err = ECONNREFUSED;
282 break;
283 case TCP_CLOSE_WAIT:
284 sk->err = EPIPE;
285 break;
286 default:
287 sk->err = ECONNRESET;
289 tcp_set_state(sk, TCP_CLOSE);
290 sk->shutdown = SHUTDOWN_MASK;
291 if (!sk->dead)
292 sk->state_change(sk);
295 /* This tags the retransmission queue when SACKs arrive. */
296 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
298 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
299 int i = nsacks;
301 while(i--) {
302 struct sk_buff *skb = skb_peek(&sk->write_queue);
303 __u32 start_seq = ntohl(sp->start_seq);
304 __u32 end_seq = ntohl(sp->end_seq);
305 int fack_count = 0;
307 while((skb != NULL) &&
308 (skb != tp->send_head) &&
309 (skb != (struct sk_buff *)&sk->write_queue)) {
310 /* The retransmission queue is always in order, so
311 * we can short-circuit the walk early.
313 if(after(TCP_SKB_CB(skb)->seq, end_seq))
314 break;
316 /* We play conservative, we don't allow SACKS to partially
317 * tag a sequence space.
319 fack_count++;
320 if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
321 !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
322 /* If this was a retransmitted frame, account for it. */
323 if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
324 tp->retrans_out)
325 tp->retrans_out--;
326 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
328 /* RULE: All new SACKs will either decrease retrans_out
329 * or advance fackets_out.
331 if(fack_count > tp->fackets_out)
332 tp->fackets_out = fack_count;
334 skb = skb->next;
336 sp++; /* Move on to the next SACK block. */
340 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
341 * But, this can also be called on packets in the established flow when
342 * the fast version below fails.
344 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
346 unsigned char *ptr;
347 int length=(th->doff*4)-sizeof(struct tcphdr);
348 int saw_mss = 0;
350 ptr = (unsigned char *)(th + 1);
351 tp->saw_tstamp = 0;
353 while(length>0) {
354 int opcode=*ptr++;
355 int opsize;
357 switch (opcode) {
358 case TCPOPT_EOL:
359 return;
360 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
361 length--;
362 continue;
363 default:
364 opsize=*ptr++;
365 if (opsize < 2) /* "silly options" */
366 return;
367 if (opsize > length)
368 break; /* don't parse partial options */
369 switch(opcode) {
370 case TCPOPT_MSS:
371 if(opsize==TCPOLEN_MSS && th->syn) {
372 u16 in_mss = ntohs(*(__u16 *)ptr);
373 if (in_mss == 0)
374 in_mss = 536;
375 if (tp->mss_clamp > in_mss)
376 tp->mss_clamp = in_mss;
377 saw_mss = 1;
379 break;
380 case TCPOPT_WINDOW:
381 if(opsize==TCPOLEN_WINDOW && th->syn)
382 if (!no_fancy && sysctl_tcp_window_scaling) {
383 tp->wscale_ok = 1;
384 tp->snd_wscale = *(__u8 *)ptr;
385 if(tp->snd_wscale > 14) {
386 if(net_ratelimit())
387 printk("tcp_parse_options: Illegal window "
388 "scaling value %d >14 received.",
389 tp->snd_wscale);
390 tp->snd_wscale = 14;
393 break;
394 case TCPOPT_TIMESTAMP:
395 if(opsize==TCPOLEN_TIMESTAMP) {
396 if (sysctl_tcp_timestamps && !no_fancy) {
397 tp->tstamp_ok = 1;
398 tp->saw_tstamp = 1;
399 tp->rcv_tsval = ntohl(*(__u32 *)ptr);
400 tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
403 break;
404 case TCPOPT_SACK_PERM:
405 if(opsize==TCPOLEN_SACK_PERM && th->syn) {
406 if (sysctl_tcp_sack && !no_fancy) {
407 tp->sack_ok = 1;
408 tp->num_sacks = 0;
411 break;
413 case TCPOPT_SACK:
414 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
415 sysctl_tcp_sack && (sk != NULL) && !th->syn) {
416 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
418 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
419 int num_sacks = sack_bytes >> 3;
420 struct tcp_sack_block *sackp;
422 sackp = (struct tcp_sack_block *)ptr;
423 tcp_sacktag_write_queue(sk, sackp, num_sacks);
427 ptr+=opsize-2;
428 length-=opsize;
431 if(th->syn && saw_mss == 0)
432 tp->mss_clamp = 536;
435 /* Fast parse options. This hopes to only see timestamps.
436 * If it is wrong it falls back on tcp_parse_options().
438 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
440 /* If we didn't send out any options ignore them all. */
441 if (tp->tcp_header_len == sizeof(struct tcphdr))
442 return 0;
443 if (th->doff == sizeof(struct tcphdr)>>2) {
444 tp->saw_tstamp = 0;
445 return 0;
446 } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
447 __u32 *ptr = (__u32 *)(th + 1);
448 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
449 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
450 tp->saw_tstamp = 1;
451 tp->rcv_tsval = ntohl(*++ptr);
452 tp->rcv_tsecr = ntohl(*++ptr);
453 return 1;
456 tcp_parse_options(sk, th, tp, 0);
457 return 1;
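/* Editor's note, not part of the original file: what the aligned-timestamp
 * comparison above matches.  With the usual option values (NOP = 1,
 * TIMESTAMP = 8, TCPOLEN_TIMESTAMP = 10) the option area of such a segment
 * begins with the bytes 01 01 08 0a on the wire, followed by TSval and
 * TSecr:
 *
 *	+--------+--------+--------+--------+
 *	|  NOP   |  NOP   | kind=8 | len=10 |
 *	+--------+--------+--------+--------+
 *	|               TSval               |
 *	+--------+--------+--------+--------+
 *	|               TSecr               |
 *	+--------+--------+--------+--------+
 */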
460 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
461 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
462 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
463 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
465 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
467 if (tp->dup_acks > 3)
468 tp->snd_cwnd = (tp->snd_ssthresh);
470 tp->dup_acks = 0;
473 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
474 * retransmit timer fires.
476 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
478 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
480 /* Note: If not_dup is set this implies we got a
481 * data carrying packet or a window update.
482 * This carries no new information about possible
483 * lost packets, so we have to ignore it for the purposes
484 * of counting duplicate acks. Ideally this does not imply we
485 * should stop our fast retransmit phase, more acks may come
486 * later without data to help us. Unfortunately this would make
487 * the code below much more complex. For now if I see such
488 * a packet I clear the fast retransmit phase.
490 if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
491 /* This is the standard reno style fast retransmit branch. */
493 /* 1. When the third duplicate ack is received, set ssthresh
494 * to one half the current congestion window, but no less
495 * than two segments. Retransmit the missing segment.
497 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
498 tp->dup_acks++;
499 if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
500 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
501 tp->snd_cwnd = (tp->snd_ssthresh + 3);
502 tp->high_seq = tp->snd_nxt;
503 if(!tp->fackets_out)
504 tcp_retransmit_skb(sk,
505 skb_peek(&sk->write_queue));
506 else
507 tcp_fack_retransmit(sk);
508 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
510 } else if (++tp->dup_acks > 3) {
511 /* 2. Each time another duplicate ACK arrives, increment
512 * cwnd by the segment size. [...] Transmit a packet...
514 * Packet transmission will be done on normal flow processing
515 * since we're not in "retransmit mode". We do not use
516 * duplicate ACKs to artificially inflate the congestion
517 * window when doing FACK.
519 if(!tp->fackets_out) {
520 tp->snd_cwnd++;
521 } else {
522 /* Fill any further holes which may have
523 * appeared.
525 * We may want to change this to run every
526 * further multiple-of-3 dup ack increments,
527 * to be more robust against out-of-order
528 * packet delivery. -DaveM
530 tcp_fack_retransmit(sk);
533 } else if (tp->high_seq != 0) {
534 /* In this branch we deal with clearing the Floyd style
535 * block on duplicate fast retransmits, and if requested
536 * we do Hoe style secondary fast retransmits.
538 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
539 /* Once we have acked all the packets up to high_seq
540 * we are done with this fast retransmit phase.
541 * Alternatively data arrived. In this case we
542 * have to abort the fast retransmit attempt.
543 * Note that we do want to accept a window
544 * update since this is expected with Hoe's algorithm.
546 clear_fast_retransmit(tp);
548 /* After we have cleared up to high_seq we can
549 * clear the Floyd style block.
551 if (!before(ack, tp->high_seq)) {
552 tp->high_seq = 0;
553 tp->fackets_out = 0;
555 } else if (tp->dup_acks >= 3) {
556 if (!tp->fackets_out) {
557 /* Hoe Style. We didn't ack the whole
558 * window. Take this as a cue that
559 * another packet was lost and retransmit it.
560 * Don't muck with the congestion window here.
561 * Note that we have to be careful not to
562 * act if this was a window update and it
563 * didn't ack new data, since this does
564 * not indicate a packet left the system.
565 * We can test this by just checking
566 * if ack changed from snd_una, since
567 * the only way to get here without advancing
568 * from snd_una is if this was a window update.
570 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
571 tcp_retransmit_skb(sk,
572 skb_peek(&sk->write_queue));
573 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
575 } else {
576 /* FACK style, fill any remaining holes in
577 * receiver's queue.
579 tcp_fack_retransmit(sk);
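/* Editor's note, not part of the original file: the plain Reno shape of the
 * above, ignoring the FACK and Hoe variations and taking the "one half the
 * current congestion window" comment at face value.  With snd_cwnd = 10 at
 * the third duplicate ACK, ssthresh becomes 5, snd_cwnd becomes 5 + 3 = 8
 * and the head of the write queue is retransmitted; each further duplicate
 * ACK inflates snd_cwnd to 9, 10, ...; once an ACK covers high_seq,
 * clear_fast_retransmit() deflates snd_cwnd back to ssthresh = 5.
 */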
585 /* This is Jacobson's slow start and congestion avoidance.
586 * SIGCOMM '88, p. 328.
588 static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
590 if (tp->snd_cwnd <= tp->snd_ssthresh) {
591 /* In "safe" area, increase. */
592 tp->snd_cwnd++;
593 } else {
594 /* In dangerous area, increase slowly.
595 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
597 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
598 tp->snd_cwnd++;
599 tp->snd_cwnd_cnt=0;
600 } else
601 tp->snd_cwnd_cnt++;
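/* Editor's note, not part of the original file: a stand-alone sketch of the
 * counter trick above.  Incrementing snd_cwnd only once per snd_cwnd ACKs
 * approximates "cwnd += 1/cwnd per ACK", i.e. about one extra segment per
 * round trip while in congestion avoidance.
 */
#if 0
static void cong_avoid_sketch(unsigned long *cwnd, unsigned long *cwnd_cnt,
			      unsigned long ssthresh)
{
	if (*cwnd <= ssthresh) {
		(*cwnd)++;			/* slow start: grow per ACK */
	} else if (*cwnd_cnt >= *cwnd) {
		(*cwnd)++;			/* about once per RTT */
		*cwnd_cnt = 0;
	} else {
		(*cwnd_cnt)++;
	}
}
#endif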
605 /* Remove acknowledged frames from the retransmission queue. */
606 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
607 __u32 *seq, __u32 *seq_rtt)
609 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
610 struct sk_buff *skb;
611 __u32 now = tcp_time_stamp;
612 int acked = 0;
614 /* If we are retransmitting, and this ACK clears up to
615 * the retransmit head, or further, then clear our state.
617 if (tp->retrans_head != NULL &&
618 !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
619 tp->retrans_head = NULL;
621 while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
622 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
623 __u8 sacked = scb->sacked;
625 /* If our packet is before the ack sequence we can
626 * discard it as it's confirmed to have arrived at
627 * the other end.
629 if (after(scb->end_seq, ack))
630 break;
632 /* Initial outgoing SYN's get put onto the write_queue
633 * just like anything else we transmit. It is not
634 * true data, and if we misinform our callers that
635 * this ACK acks real data, we will erroneously exit
636 * connection startup slow start one packet too
637 * quickly. This is severely frowned upon behavior.
639 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
640 tp->retrans_out--;
641 if(!(scb->flags & TCPCB_FLAG_SYN)) {
642 acked |= FLAG_DATA_ACKED;
643 if(sacked & TCPCB_SACKED_RETRANS)
644 acked |= FLAG_RETRANS_DATA_ACKED;
645 if(tp->fackets_out)
646 tp->fackets_out--;
647 } else {
648 /* This is pure paranoia. */
649 tp->retrans_head = NULL;
651 tp->packets_out--;
652 *seq = scb->seq;
653 *seq_rtt = now - scb->when;
654 __skb_unlink(skb, skb->list);
655 kfree_skb(skb);
657 return acked;
660 static void tcp_ack_probe(struct sock *sk, __u32 ack)
662 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
664 /* Our probe was answered. */
665 tp->probes_out = 0;
667 /* Was it a usable window open? */
669 /* should always be non-null */
670 if (tp->send_head != NULL &&
671 !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
672 tp->backoff = 0;
673 tp->pending = 0;
674 tcp_clear_xmit_timer(sk, TIME_PROBE0);
675 } else {
676 tcp_reset_xmit_timer(sk, TIME_PROBE0,
677 min(tp->rto << tp->backoff, 120*HZ));
681 /* Should we open up the congestion window? */
682 static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
684 /* Data must have been acked. */
685 if ((flag & FLAG_DATA_ACKED) == 0)
686 return 0;
688 /* Some of the data acked was retransmitted somehow? */
689 if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
690 /* We advance in all cases except during
691 * non-FACK fast retransmit/recovery.
693 if (tp->fackets_out != 0 ||
694 tp->retransmits != 0)
695 return 1;
697 /* Non-FACK fast retransmit does its own
698 * congestion window management, don't get
699 * in the way.
701 return 0;
704 /* New non-retransmitted data acked, always advance. */
705 return 1;
708 /* Read draft-ietf-tcplw-high-performance before mucking
709 * with this code. (Supersedes RFC1323)
711 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
712 u32 seq, u32 ack, int flag)
714 __u32 seq_rtt;
716 /* RTTM Rule: A TSecr value received in a segment is used to
717 * update the averaged RTT measurement only if the segment
718 * acknowledges some new data, i.e., only if it advances the
719 * left edge of the send window.
721 * See draft-ietf-tcplw-high-performance-00, section 3.3.
722 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
724 if (!(flag & FLAG_DATA_ACKED))
725 return;
727 seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
728 tcp_rtt_estimator(tp, seq_rtt);
729 if (tp->retransmits) {
730 if (tp->packets_out == 0) {
731 tp->retransmits = 0;
732 tp->fackets_out = 0;
733 tp->retrans_out = 0;
734 tp->backoff = 0;
735 tcp_set_rto(tp);
736 } else {
737 /* Still retransmitting, use backoff */
738 tcp_set_rto(tp);
739 tp->rto = tp->rto << tp->backoff;
741 } else {
742 tcp_set_rto(tp);
745 tcp_bound_rto(tp);
748 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
750 struct sk_buff *skb = skb_peek(&sk->write_queue);
752 /* Some data was ACK'd, if still retransmitting (due to a
753 * timeout), resend more of the retransmit queue. The
754 * congestion window is handled properly by that code.
756 if (tp->retransmits) {
757 tcp_xmit_retransmit_queue(sk);
758 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
759 } else {
760 __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
761 if ((__s32)when < 0)
762 when = 1;
763 tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
767 /* This routine deals with incoming acks, but not outgoing ones. */
768 static int tcp_ack(struct sock *sk, struct tcphdr *th,
769 u32 ack_seq, u32 ack, int len)
771 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
772 int flag = 0;
773 u32 seq = 0;
774 u32 seq_rtt = 0;
776 if(sk->zapped)
777 return(1); /* Dead, can't ack any more so why bother */
779 if (tp->pending == TIME_KEEPOPEN)
780 tp->probes_out = 0;
782 tp->rcv_tstamp = tcp_time_stamp;
784 /* If the ack is newer than sent or older than previous acks
785 * then we can probably ignore it.
787 if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
788 goto uninteresting_ack;
790 /* If there is data set flag 1 */
791 if (len != th->doff*4) {
792 flag |= FLAG_DATA;
793 tcp_delack_estimator(tp);
796 /* Update our send window. */
798 /* This is the window update code as per RFC 793
799 * snd_wl{1,2} are used to prevent unordered
800 * segments from shrinking the window
802 if (before(tp->snd_wl1, ack_seq) ||
803 (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
804 u32 nwin = ntohs(th->window) << tp->snd_wscale;
806 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
807 flag |= FLAG_WIN_UPDATE;
808 tp->snd_wnd = nwin;
810 tp->snd_wl1 = ack_seq;
811 tp->snd_wl2 = ack;
813 if (nwin > tp->max_window)
814 tp->max_window = nwin;
818 /* We passed data and got it acked, remove any soft error
819 * log. Something worked...
821 sk->err_soft = 0;
823 /* If this ack opens up a zero window, clear backoff. It was
824 * being used to time the probes, and is probably far higher than
825 * it needs to be for normal retransmission.
827 if (tp->pending == TIME_PROBE0)
828 tcp_ack_probe(sk, ack);
830 /* See if we can take anything off of the retransmit queue. */
831 flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
833 /* We must do this here, before code below clears out important
834 * state contained in tp->fackets_out and tp->retransmits. -DaveM
836 if (should_advance_cwnd(tp, flag))
837 tcp_cong_avoid(tp);
839 /* If we have a timestamp, we always do rtt estimates. */
840 if (tp->saw_tstamp) {
841 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
842 } else {
843 /* If we were retransmitting don't count rtt estimate. */
844 if (tp->retransmits) {
845 if (tp->packets_out == 0) {
846 tp->retransmits = 0;
847 tp->fackets_out = 0;
848 tp->retrans_out = 0;
850 } else {
851 /* We don't have a timestamp. Can only use
852 * packets that are not retransmitted to determine
853 * rtt estimates. Also, we must not reset the
854 * backoff for rto until we get a non-retransmitted
855 * packet. This allows us to deal with a situation
856 * where the network delay has increased suddenly.
857 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
859 if (flag & FLAG_DATA_ACKED) {
860 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
861 tp->backoff = 0;
862 tcp_rtt_estimator(tp, seq_rtt);
863 tcp_set_rto(tp);
864 tcp_bound_rto(tp);
870 if (tp->packets_out) {
871 if (flag & FLAG_DATA_ACKED)
872 tcp_ack_packets_out(sk, tp);
873 } else {
874 tcp_clear_xmit_timer(sk, TIME_RETRANS);
877 flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
878 if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
879 (tp->high_seq != 0)) {
880 tcp_fast_retrans(sk, ack, flag);
881 } else {
882 /* Clear any aborted fast retransmit starts. */
883 tp->dup_acks = 0;
885 /* It is not a brain fart, I thought a bit now. 8)
887 * Forward progress is indicated, if:
888 * 1. the ack acknowledges new data.
889 * 2. or the ack is duplicate, but it is caused by new segment
890 * arrival. This case is filtered by:
891 * - it contains no data, syn or fin.
892 * - it does not update window.
893 * 3. or new SACK. It is difficult to check, so that we ignore it.
895 * Forward progress is also indicated by arrival of new data,
896 * which was caused by window open from our side. This case is more
897 * difficult and it is made (alas, incorrectly) in tcp_data_queue().
898 * --ANK (990513)
900 if (ack != tp->snd_una || (flag == 0 && !th->fin))
901 dst_confirm(sk->dst_cache);
903 /* Remember the highest ack received. */
904 tp->snd_una = ack;
905 return 1;
907 uninteresting_ack:
908 SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
909 return 0;
912 /* New-style handling of TIME_WAIT sockets. */
913 extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
914 extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
915 extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
917 /* Must be called only from BH context. */
918 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
920 SOCKHASH_LOCK_WRITE_BH();
922 /* Unlink from various places. */
923 if(tw->bind_next)
924 tw->bind_next->bind_pprev = tw->bind_pprev;
925 *(tw->bind_pprev) = tw->bind_next;
926 if(tw->tb->owners == NULL)
927 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
929 if(tw->next)
930 tw->next->pprev = tw->pprev;
931 *tw->pprev = tw->next;
933 /* We decremented the prot->inuse count when we entered TIME_WAIT
934 * and the sock from which this came was destroyed.
936 tw->sklist_next->sklist_prev = tw->sklist_prev;
937 tw->sklist_prev->sklist_next = tw->sklist_next;
939 SOCKHASH_UNLOCK_WRITE_BH();
941 /* Ok, now free it up. */
942 kmem_cache_free(tcp_timewait_cachep, tw);
945 /* We come here as a special case from the AF specific TCP input processing,
946 * and the SKB has no owner. Essentially handling this is very simple,
947 * we just keep silently eating rx'd packets until none show up for the
948 * entire timeout period. The only special cases are for BSD TIME_WAIT
949 * reconnects and SYN/RST bits being set in the TCP header.
951 int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
952 struct tcphdr *th, unsigned len)
954 /* RFC 1122:
955 * "When a connection is [...] on TIME-WAIT state [...]
956 * [a TCP] MAY accept a new SYN from the remote TCP to
957 * reopen the connection directly, if it:
959 * (1) assigns its initial sequence number for the new
960 * connection to be larger than the largest sequence
961 * number it used on the previous connection incarnation,
962 * and
964 * (2) returns to TIME-WAIT state if the SYN turns out
965 * to be an old duplicate".
967 if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
968 struct sock *sk;
969 struct tcp_func *af_specific = tw->af_specific;
970 __u32 isn;
971 int ret;
973 isn = tw->rcv_nxt + 128000;
974 if(isn == 0)
975 isn++;
976 tcp_tw_deschedule(tw);
977 tcp_timewait_kill(tw);
978 sk = af_specific->get_sock(skb, th);
979 if(sk == NULL ||
980 !ipsec_sk_policy(sk,skb))
981 return 0;
983 bh_lock_sock(sk);
985 /* Default is to discard the frame. */
986 ret = 0;
988 if(sk->lock.users)
989 goto out_unlock;
991 skb_set_owner_r(skb, sk);
992 af_specific = sk->tp_pinfo.af_tcp.af_specific;
994 if(af_specific->conn_request(sk, skb, isn) < 0)
995 ret = 1; /* Toss a reset back. */
996 out_unlock:
997 bh_unlock_sock(sk);
998 return ret;
1001 /* Check RST or SYN */
1002 if(th->rst || th->syn) {
1003 /* This is TIME_WAIT assassination, in two flavors.
1004 * Oh well... nobody has a sufficient solution to this
1005 * protocol bug yet.
1007 if(sysctl_tcp_rfc1337 == 0) {
1008 tcp_tw_deschedule(tw);
1009 tcp_timewait_kill(tw);
1011 if(!th->rst)
1012 return 1; /* toss a reset back */
1013 } else {
1014 /* In this case we must reset the TIMEWAIT timer. */
1015 if(th->ack)
1016 tcp_tw_reschedule(tw);
1018 return 0; /* Discard the frame. */
1021 /* Enter the time wait state. This is always called from BH
1022 * context. Essentially we whip up a timewait bucket, copy the
1023 * relevant info into it from the SK, and mess with hash chains
1024 * and list linkage.
1026 static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
1028 struct sock **head, *sktw;
1030 /* Step 1: Remove SK from established hash. */
1031 if(sk->next)
1032 sk->next->pprev = sk->pprev;
1033 *sk->pprev = sk->next;
1034 sk->pprev = NULL;
1035 tcp_reg_zap(sk);
1037 /* Step 2: Put TW into bind hash where SK was. */
1038 tw->tb = (struct tcp_bind_bucket *)sk->prev;
1039 if((tw->bind_next = sk->bind_next) != NULL)
1040 sk->bind_next->bind_pprev = &tw->bind_next;
1041 tw->bind_pprev = sk->bind_pprev;
1042 *sk->bind_pprev = (struct sock *)tw;
1044 /* Step 3: Same for the protocol sklist. */
1045 (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
1046 (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
1047 sk->sklist_next = NULL;
1048 sk->prot->inuse--;
1050 /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
1051 head = &tcp_ehash[sk->hashent + (tcp_ehash_size >> 1)];
1052 sktw = (struct sock *)tw;
1053 if((sktw->next = *head) != NULL)
1054 (*head)->pprev = &sktw->next;
1055 *head = sktw;
1056 sktw->pprev = head;
1059 void tcp_time_wait(struct sock *sk)
1061 struct tcp_tw_bucket *tw;
1063 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
1064 if(tw != NULL) {
1065 /* Give us an identity. */
1066 tw->daddr = sk->daddr;
1067 tw->rcv_saddr = sk->rcv_saddr;
1068 tw->bound_dev_if= sk->bound_dev_if;
1069 tw->num = sk->num;
1070 tw->state = TCP_TIME_WAIT;
1071 tw->sport = sk->sport;
1072 tw->dport = sk->dport;
1073 tw->family = sk->family;
1074 tw->reuse = sk->reuse;
1075 tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
1076 tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
1078 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1079 if(tw->family == PF_INET6) {
1080 memcpy(&tw->v6_daddr,
1081 &sk->net_pinfo.af_inet6.daddr,
1082 sizeof(struct in6_addr));
1083 memcpy(&tw->v6_rcv_saddr,
1084 &sk->net_pinfo.af_inet6.rcv_saddr,
1085 sizeof(struct in6_addr));
1087 #endif
1088 /* Linkage updates. */
1089 SOCKHASH_LOCK_WRITE();
1090 tcp_tw_hashdance(sk, tw);
1091 SOCKHASH_UNLOCK_WRITE();
1093 /* Get the TIME_WAIT timeout firing. */
1094 tcp_tw_schedule(tw);
1096 /* CLOSE the SK. */
1097 if(sk->state == TCP_ESTABLISHED)
1098 tcp_statistics.TcpCurrEstab--;
1099 sk->state = TCP_CLOSE;
1100 net_reset_timer(sk, TIME_DONE,
1101 min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
1102 } else {
1103 /* Sorry, we're out of memory, just CLOSE this
1104 * socket up. We've got bigger problems than
1105 * non-graceful socket closings.
1107 tcp_set_state(sk, TCP_CLOSE);
1110 /* Prevent rcvmsg/sndmsg calls, and wake people up. */
1111 sk->shutdown = SHUTDOWN_MASK;
1112 if(!sk->dead)
1113 sk->state_change(sk);
1117 * Process the FIN bit. This now behaves as it is supposed to work
1118 * and the FIN takes effect when it is validly part of sequence
1119 * space. Not before when we get holes.
1121 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1122 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1123 * TIME-WAIT)
1125 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1126 * close and we go into CLOSING (and later onto TIME-WAIT)
1128 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1131 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1133 sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1135 tcp_send_ack(sk);
1137 if (!sk->dead) {
1138 sk->state_change(sk);
1139 sock_wake_async(sk->socket, 1);
1142 switch(sk->state) {
1143 case TCP_SYN_RECV:
1144 case TCP_ESTABLISHED:
1145 /* Move to CLOSE_WAIT */
1146 tcp_set_state(sk, TCP_CLOSE_WAIT);
1147 if (th->rst)
1148 sk->shutdown = SHUTDOWN_MASK;
1149 break;
1151 case TCP_CLOSE_WAIT:
1152 case TCP_CLOSING:
1153 /* Received a retransmission of the FIN, do
1154 * nothing.
1156 break;
1157 case TCP_LAST_ACK:
1158 /* RFC793: Remain in the LAST-ACK state. */
1159 break;
1161 case TCP_FIN_WAIT1:
1162 /* This case occurs when a simultaneous close
1163 * happens, we must ack the received FIN and
1164 * enter the CLOSING state.
1166 * This causes a WRITE timeout, which will either
1167 * move on to TIME_WAIT when we timeout, or resend
1168 * the FIN properly (maybe we get rid of that annoying
1169 * FIN lost hang). The TIME_WRITE code is already
1170 * correct for handling this timeout.
1172 tcp_set_state(sk, TCP_CLOSING);
1173 break;
1174 case TCP_FIN_WAIT2:
1175 /* Received a FIN -- send ACK and enter TIME_WAIT. */
1176 tcp_time_wait(sk);
1177 break;
1178 default:
1179 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1180 * cases we should never reach this piece of code.
1182 printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1183 break;
1187 /* These routines update the SACK block as out-of-order packets arrive or
1188 * in-order packets close up the sequence space.
1190 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1192 int this_sack, num_sacks = tp->num_sacks;
1193 struct tcp_sack_block *swalk = &tp->selective_acks[0];
1195 /* If more than one SACK block, see if the recent change to SP eats into
1196 * or hits the sequence space of other SACK blocks, if so coalesce.
1198 if(num_sacks != 1) {
1199 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1200 if(swalk == sp)
1201 continue;
1203 /* First case, bottom of SP moves into top of the
1204 * sequence space of SWALK.
1206 if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1207 sp->start_seq = swalk->start_seq;
1208 goto coalesce;
1210 /* Second case, top of SP moves into bottom of the
1211 * sequence space of SWALK.
1213 if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1214 sp->end_seq = swalk->end_seq;
1215 goto coalesce;
1219 /* SP is the only SACK, or no coalescing cases found. */
1220 return;
1222 coalesce:
1223 /* Zap SWALK, by moving every further SACK up by one slot.
1224 * Decrease num_sacks.
1226 for(; this_sack < num_sacks-1; this_sack++, swalk++) {
1227 struct tcp_sack_block *next = (swalk + 1);
1228 swalk->start_seq = next->start_seq;
1229 swalk->end_seq = next->end_seq;
1231 tp->num_sacks--;
1234 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1236 __u32 tmp;
1238 tmp = sack1->start_seq;
1239 sack1->start_seq = sack2->start_seq;
1240 sack2->start_seq = tmp;
1242 tmp = sack1->end_seq;
1243 sack1->end_seq = sack2->end_seq;
1244 sack2->end_seq = tmp;
1247 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1249 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1250 struct tcp_sack_block *sp = &tp->selective_acks[0];
1251 int cur_sacks = tp->num_sacks;
1253 if (!cur_sacks)
1254 goto new_sack;
1256 /* Optimize for the common case, new ofo frames arrive
1257 * "in order". ;-) This also satisfies the requirements
1258 * of RFC2018 about ordering of SACKs.
1260 if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1261 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1262 tcp_sack_maybe_coalesce(tp, sp);
1263 } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1264 /* Re-ordered arrival, in this case, can be optimized
1265 * as well.
1267 sp->start_seq = TCP_SKB_CB(skb)->seq;
1268 tcp_sack_maybe_coalesce(tp, sp);
1269 } else {
1270 struct tcp_sack_block *swap = sp + 1;
1271 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1273 /* Oh well, we have to move things around.
1274 * Try to find a SACK we can tack this onto.
1277 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1278 if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1279 (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1280 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1281 swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1282 else
1283 swap->start_seq = TCP_SKB_CB(skb)->seq;
1284 tcp_sack_swap(sp, swap);
1285 tcp_sack_maybe_coalesce(tp, sp);
1286 return;
1290 /* Could not find an adjacent existing SACK, build a new one,
1291 * put it at the front, and shift everyone else down. We
1292 * always know there is at least one SACK present already here.
1294 * If the sack array is full, forget about the last one.
1296 if (cur_sacks >= max_sacks) {
1297 cur_sacks--;
1298 tp->num_sacks--;
1300 while(cur_sacks >= 1) {
1301 struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1302 struct tcp_sack_block *prev = (this - 1);
1303 this->start_seq = prev->start_seq;
1304 this->end_seq = prev->end_seq;
1305 cur_sacks--;
1308 new_sack:
1309 /* Build the new head SACK, and we're done. */
1310 sp->start_seq = TCP_SKB_CB(skb)->seq;
1311 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1312 tp->num_sacks++;
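/* Editor's note, not part of the original file: a worked example of the
 * SACK block handling above.  Suppose rcv_nxt is 1000 and segments arrive
 * out of order:
 *
 *	seg 2000-2500	-> selective_acks[0] = {2000, 2500}
 *	seg 2500-3000	-> head block grows to {2000, 3000}
 *	seg 4000-4500	-> new head {4000, 4500}, old block shifts to slot 1
 *	seg 3000-4000	-> head grows to {3000, 4500}, then coalesces with
 *			   {2000, 3000} into a single block {2000, 4500}
 *
 * At most 4 blocks are kept (3 when timestamps are in use); the oldest one
 * is dropped when the array is full.
 */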
1316 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1318 struct tcp_sack_block *sp = &tp->selective_acks[0];
1319 int num_sacks = tp->num_sacks;
1320 int this_sack;
1322 /* This is an in order data segment _or_ an out-of-order SKB being
1323 * moved to the receive queue, so we know this removed SKB will eat
1324 * from the front of a SACK.
1326 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1327 /* Check if the start of the sack is covered by skb. */
1328 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1329 before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1330 break;
1333 /* This should only happen if so many SACKs get built that some get
1334 * pushed out before we get here, or we eat some in sequence packets
1335 * which are before the first SACK block.
1337 if(this_sack >= num_sacks)
1338 return;
1340 sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1341 if(!before(sp->start_seq, sp->end_seq)) {
1342 /* Zap this SACK, by moving forward any other SACKS. */
1343 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1344 struct tcp_sack_block *next = (sp + 1);
1345 sp->start_seq = next->start_seq;
1346 sp->end_seq = next->end_seq;
1348 tp->num_sacks--;
1352 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1354 struct tcp_sack_block *sp = &tp->selective_acks[0];
1355 int num_sacks = tp->num_sacks;
1356 int this_sack;
1358 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1359 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1360 break;
1362 if(this_sack >= num_sacks)
1363 return;
1364 sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1367 /* This one checks to see if we can put data from the
1368 * out_of_order queue into the receive_queue.
1370 static void tcp_ofo_queue(struct sock *sk)
1372 struct sk_buff *skb;
1373 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1375 while ((skb = skb_peek(&tp->out_of_order_queue))) {
1376 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1377 break;
1379 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1380 SOCK_DEBUG(sk, "ofo packet was already received \n");
1381 __skb_unlink(skb, skb->list);
1382 kfree_skb(skb);
1383 continue;
1385 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1386 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1387 TCP_SKB_CB(skb)->end_seq);
1389 if(tp->sack_ok)
1390 tcp_sack_remove_skb(tp, skb);
1391 __skb_unlink(skb, skb->list);
1392 __skb_queue_tail(&sk->receive_queue, skb);
1393 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1394 if(skb->h.th->fin)
1395 tcp_fin(skb, sk, skb->h.th);
1399 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1401 struct sk_buff *skb1;
1402 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1404 /* Queue data for delivery to the user.
1405 * Packets in sequence go to the receive queue.
1406 * Out of sequence packets to the out_of_order_queue.
1408 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1409 /* Ok. In sequence. */
1410 queue_and_out:
1411 dst_confirm(sk->dst_cache);
1412 __skb_queue_tail(&sk->receive_queue, skb);
1413 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1414 if(skb->h.th->fin) {
1415 tcp_fin(skb, sk, skb->h.th);
1416 } else {
1417 tcp_remember_ack(tp, skb->h.th, skb);
1419 /* This may have eaten into a SACK block. */
1420 if(tp->sack_ok && tp->num_sacks)
1421 tcp_sack_remove_skb(tp, skb);
1422 tcp_ofo_queue(sk);
1424 /* Turn on fast path. */
1425 if (skb_queue_len(&tp->out_of_order_queue) == 0)
1426 tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1427 (0x10 << 16) |
1428 tp->snd_wnd);
1429 return;
1432 /* An old packet, either a retransmit or some packet got lost. */
1433 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1434 /* A retransmit, 2nd most common case. Force an immediate ack. */
1435 SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1436 tcp_enter_quickack_mode(tp);
1437 kfree_skb(skb);
1438 return;
1441 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1442 /* Partial packet, seq < rcv_next < end_seq */
1443 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1444 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1445 TCP_SKB_CB(skb)->end_seq);
1447 goto queue_and_out;
1450 /* Ok. This is an out_of_order segment, force an ack. */
1451 tp->delayed_acks++;
1452 tcp_enter_quickack_mode(tp);
1454 /* Disable header prediction. */
1455 tp->pred_flags = 0;
1457 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1458 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1460 if (skb_peek(&tp->out_of_order_queue) == NULL) {
1461 /* Initial out of order segment, build 1 SACK. */
1462 if(tp->sack_ok) {
1463 tp->num_sacks = 1;
1464 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1465 tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1467 __skb_queue_head(&tp->out_of_order_queue,skb);
1468 } else {
1469 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1470 /* Already there. */
1471 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1472 if (skb->len >= skb1->len) {
1473 if(tp->sack_ok)
1474 tcp_sack_extend(tp, skb1, skb);
1475 __skb_append(skb1, skb);
1476 __skb_unlink(skb1, skb1->list);
1477 kfree_skb(skb1);
1478 } else {
1479 /* A duplicate, smaller than what is in the
1480 * out-of-order queue right now, toss it.
1482 kfree_skb(skb);
1484 break;
1487 if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1488 __skb_append(skb1, skb);
1489 if(tp->sack_ok)
1490 tcp_sack_new_ofo_skb(sk, skb);
1491 break;
1494 /* See if we've hit the start. If so insert. */
1495 if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1496 __skb_queue_head(&tp->out_of_order_queue,skb);
1497 if(tp->sack_ok)
1498 tcp_sack_new_ofo_skb(sk, skb);
1499 break;
1507 * This routine handles the data. If there is room in the buffer,
1508 * it will already have been moved into it. If there is no
1509 * room, then we will just have to discard the packet.
1512 static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1514 struct tcphdr *th;
1515 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1517 th = skb->h.th;
1518 skb_pull(skb, th->doff*4);
1519 skb_trim(skb, len - (th->doff*4));
1521 if (skb->len == 0 && !th->fin)
1522 return(0);
1525 * If our receive queue has grown past its limits shrink it.
1526 * Make sure to do this before moving snd_nxt, otherwise
1527 * data might be acked that we don't have enough room for.
1529 if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
1530 if (prune_queue(sk) < 0) {
1531 /* Still not enough room. That can happen when
1532 * skb->truesize differs significantly from skb->len.
1534 return 0;
1538 tcp_data_queue(sk, skb);
1540 if (before(tp->rcv_nxt, tp->copied_seq)) {
1541 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1542 tp->rcv_nxt = tp->copied_seq;
1545 /* Above, tcp_data_queue() increments delayed_acks appropriately.
1546 * Now tell the user we may have some data.
1548 if (!sk->dead) {
1549 SOCK_DEBUG(sk, "Data wakeup.\n");
1550 sk->data_ready(sk,0);
1552 return(1);
1555 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1557 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1559 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1560 tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1561 /* Put more data onto the wire. */
1562 tcp_write_xmit(sk);
1563 } else if (tp->packets_out == 0 && !tp->pending) {
1564 /* Start probing the receivers window. */
1565 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1569 static __inline__ void tcp_data_snd_check(struct sock *sk)
1571 struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1573 if (skb != NULL)
1574 __tcp_data_snd_check(sk, skb);
1578 * Adapt the MSS value used to make delayed ack decision to the
1579 * real world.
1581 static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1583 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1584 unsigned int len = skb->len, lss;
1586 if (len > tp->rcv_mss)
1587 tp->rcv_mss = len;
1588 lss = tp->last_seg_size;
1589 tp->last_seg_size = 0;
1590 if (len >= 536) {
1591 if (len == lss)
1592 tp->rcv_mss = len;
1593 tp->last_seg_size = len;
1598 * Check if sending an ack is needed.
1600 static __inline__ void __tcp_ack_snd_check(struct sock *sk)
1602 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1604 /* This also takes care of updating the window.
1605 * This if statement needs to be simplified.
1607 * Rules for delaying an ack:
1608 * - delay time <= 0.5 HZ
1609 * - we don't have a window update to send
1610 * - must send at least every 2 full sized packets
1611 * - must send an ACK if we have any out of order data
1613 * With an extra heuristic to handle loss of packet
1614 * situations and also helping the sender leave slow
1615 * start in an expedient manner.
1618 /* Two full frames received or... */
1619 if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1620 /* We will update the window "significantly" or... */
1621 tcp_raise_window(sk) ||
1622 /* We entered "quick ACK" mode or... */
1623 tcp_in_quickack_mode(tp) ||
1624 /* We have out of order data */
1625 (skb_peek(&tp->out_of_order_queue) != NULL)) {
1626 /* Then ack it now */
1627 tcp_send_ack(sk);
1628 } else {
1629 /* Else, send delayed ack. */
1630 tcp_send_delayed_ack(tp, HZ/2);
1634 static __inline__ void tcp_ack_snd_check(struct sock *sk)
1636 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1637 if (tp->delayed_acks == 0) {
1638 /* We sent a data segment already. */
1639 return;
1641 __tcp_ack_snd_check(sk);
1646 * This routine is only called when we have urgent data
1647 * signalled. It's the 'slow' part of tcp_urg. It could be
1648 * moved inline now as tcp_urg is only called from one
1649 * place. We handle URGent data wrong. We have to - as
1650 * BSD still doesn't use the correction from RFC961.
1651 * For 1003.1g we should support a new option TCP_STDURG to permit
1652 * either form (or just set the sysctl tcp_stdurg).
1655 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1657 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1658 u32 ptr = ntohs(th->urg_ptr);
1660 if (ptr && !sysctl_tcp_stdurg)
1661 ptr--;
1662 ptr += ntohl(th->seq);
1664 /* Ignore urgent data that we've already seen and read. */
1665 if (after(tp->copied_seq, ptr))
1666 return;
1668 /* Do we already have a newer (or duplicate) urgent pointer? */
1669 if (tp->urg_data && !after(ptr, tp->urg_seq))
1670 return;
1672 /* Tell the world about our new urgent pointer. */
1673 if (sk->proc != 0) {
1674 if (sk->proc > 0)
1675 kill_proc(sk->proc, SIGURG, 1);
1676 else
1677 kill_pg(-sk->proc, SIGURG, 1);
1680 /* We may be adding urgent data when the last byte read was
1681 * urgent. To do this requires some care. We cannot just ignore
1682 * tp->copied_seq since we would read the last urgent byte again
1683 * as data, nor can we alter copied_seq until this data arrives
1684 * or we break the semantics of SIOCATMARK (and thus sockatmark())
1686 if (tp->urg_seq == tp->copied_seq)
1687 tp->copied_seq++; /* Move the copied sequence on correctly */
1688 tp->urg_data = URG_NOTYET;
1689 tp->urg_seq = ptr;
1691 /* Disable header prediction. */
1692 tp->pred_flags = 0;
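/* Editor's note, not part of the original file: a worked example of the
 * off-by-one handling above.  For a segment with seq = 1000 and
 * urg_ptr = 4:
 *
 *	sysctl_tcp_stdurg = 0 (BSD style, the default): the pointer names
 *	the byte following the urgent byte, so urg_seq = 1000 + 4 - 1 = 1003.
 *	sysctl_tcp_stdurg = 1 (RFC 1122 style): the pointer names the urgent
 *	byte itself, so urg_seq = 1000 + 4 = 1004.
 */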
1695 /* This is the 'fast' part of urgent handling. */
1696 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1698 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1700 /* Check if we get a new urgent pointer - normally not. */
1701 if (th->urg)
1702 tcp_check_urg(sk,th);
1704 /* Do we wait for any urgent data? - normally not... */
1705 if (tp->urg_data == URG_NOTYET) {
1706 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
1708 /* Is the urgent pointer pointing into this packet? */
1709 if (ptr < len) {
1710 tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1711 if (!sk->dead)
1712 sk->data_ready(sk,0);
1717 /* Clean the out_of_order queue if we can, trying to get
1718 * the socket within its memory limits again.
1720 * Return less than zero if we should start dropping frames
1721 * until the socket owning process reads some of the data
1722 * to stabilize the situation.
1724 static int prune_queue(struct sock *sk)
1726 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1727 struct sk_buff * skb;
1729 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
1731 net_statistics.PruneCalled++;
1733 /* First, purge the out_of_order queue. */
1734 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
1735 if(skb != NULL) {
1736 /* Free it all. */
1737 do { net_statistics.OfoPruned += skb->len;
1738 kfree_skb(skb);
1739 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
1740 } while(skb != NULL);
1742 /* Reset SACK state. A conforming SACK implementation will
1743 * do the same at a timeout based retransmit. When a connection
1744 * is in a sad state like this, we care only about integrity
1745 * of the connection, not performance.
1747 if(tp->sack_ok)
1748 tp->num_sacks = 0;
1751 /* If we are really being abused, tell the caller to silently
1752 * drop receive data on the floor. It will get retransmitted
1753 * and hopefully then we'll have sufficient space.
1755 * We used to try to purge the in-order packets too, but that
1756 * turns out to be deadly and fraught with races. Consider:
1758 * 1) If we acked the data, we absolutely cannot drop the
1759 * packet. This data would then never be retransmitted.
1760 * 2) It is possible, with a proper sequence of events involving
1761 * delayed acks and backlog queue handling, to have the user
1762 * read the data before it gets acked. The previous code
1763 * here got this wrong, and it led to data corruption.
1764 * 3) Too much state changes happen when the FIN arrives, so once
1765 * we've seen that we can't remove any in-order data safely.
1767 * The net result is that removing in-order receive data is too
1768 * complex for anyone's sanity. So we don't do it anymore. But
1769 * if we are really having our buffer space abused we stop accepting
1770 * new receive data.
1772 if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
1773 return 0;
1775 /* Massive buffer overcommit. */
1776 return -1;
1780 * TCP receive function for the ESTABLISHED state.
1782 * It is split into a fast path and a slow path. The fast path is
1783 * disabled when:
1784 * - A zero window was announced from us - zero window probing
1785 * is only handled properly in the slow path.
1786 * - Out of order segments arrived.
1787 * - Urgent data is expected.
1788 * - There is no buffer space left
1789 * - Unexpected TCP flags/window values/header lengths are received
1790 * (detected by checking the TCP header against pred_flags)
1791 * - Data is sent in both directions. Fast path only supports pure senders
1792 * or pure receivers (this means either the sequence number or the ack
1793 * value must stay constant)
1795 * When these conditions are not satisfied it drops into a standard
1796 * receive procedure patterned after RFC793 to handle all cases.
1797 * The first three cases are guaranteed by proper pred_flags setting,
1798 * the rest is checked inline. Fast processing is turned on in
1799 * tcp_data_queue when everything is OK.
1801 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
1802 struct tcphdr *th, unsigned len)
1804 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1805 int queued;
1806 u32 flg;
1809 * Header prediction.
1810 * The code follows the one in the famous
1811 * "30 instruction TCP receive" Van Jacobson mail.
1813 * Van's trick is to deposit buffers into socket queue
1814 * on a device interrupt, to call tcp_recv function
1815 * on the receive process context and checksum and copy
1816 * the buffer to user space. smart...
1818 * Our current scheme is not silly either but we take the
1819 * extra cost of the net_bh soft interrupt processing...
1820 * We do checksum and copy also but from device to kernel.
1824 * RFC1323: H1. Apply PAWS check first.
1826 if (tcp_fast_parse_options(sk, th, tp)) {
1827 if (tp->saw_tstamp) {
1828 if (tcp_paws_discard(tp, th, len)) {
1829 tcp_statistics.TcpInErrs++;
1830 if (!th->rst) {
1831 tcp_send_ack(sk);
1832 goto discard;
1835 tcp_replace_ts_recent(sk, tp,
1836 TCP_SKB_CB(skb)->seq,
1837 TCP_SKB_CB(skb)->end_seq);
1841 flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16);
1843 /* pred_flags is 0xS?10 << 16 + snd_wnd
1844 * if header_predition is to be made
1845 * 'S' will always be tp->tcp_header_len >> 2
1846 * '?' will be 0 else it will be !0
1847 * (when there are holes in the receive
1848 * space for instance)
1849 * PSH flag is ignored.
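 *
 * Editor's note, not part of the original file: for a connection that
 * negotiated timestamps, tcp_header_len is 32 bytes, so 'S' is 8 and,
 * with only the ACK bit (0x10) set in the flags byte, pred_flags works
 * out to htonl((0x8010 << 16) | tp->snd_wnd).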
1852 if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1853 if (len <= th->doff*4) {
1854 /* Bulk data transfer: sender */
1855 if (len == th->doff*4) {
1856 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
1857 TCP_SKB_CB(skb)->ack_seq, len);
1858 kfree_skb(skb);
1859 tcp_data_snd_check(sk);
1860 return 0;
1861 } else { /* Header too small */
1862 tcp_statistics.TcpInErrs++;
1863 goto discard;
1864 }
1865 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
1866 atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
1867 /* Bulk data transfer: receiver */
1868 __skb_pull(skb,th->doff*4);
1870 tcp_measure_rcv_mss(sk, skb);
1872 /* DO NOT notify forward progress here.
1873 * It saves dozens of CPU instructions in the fast path. --ANK
1874 */
1875 __skb_queue_tail(&sk->receive_queue, skb);
1876 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1878 /* FIN bit check is not done since if FIN is set in
1879 * this frame, the pred_flags won't match up. -DaveM
1880 */
1881 sk->data_ready(sk, 0);
1882 tcp_delack_estimator(tp);
1884 tcp_remember_ack(tp, th, skb);
1886 __tcp_ack_snd_check(sk);
1887 return 0;
1888 }
1889 }
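/* Fast path summary: the first branch handled a segment with no payload
 * (len <= header length), i.e. we are the bulk sender and this was a pure
 * ACK; the second handled in-order data that acknowledges nothing new
 * (ack_seq == snd_una), i.e. we are the bulk receiver and the skb went
 * straight onto sk->receive_queue.
 */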
1891 /*
1892 * Standard slow path.
1893 */
1895 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
1896 /* RFC793, page 37: "In all states except SYN-SENT, all reset
1897 * (RST) segments are validated by checking their SEQ-fields."
1898 * And page 69: "If an incoming segment is not acceptable,
1899 * an acknowledgment should be sent in reply (unless the RST bit
1900 * is set, if so drop the segment and return)".
1901 */
1902 if (th->rst)
1903 goto discard;
1904 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1905 SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
1906 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
1907 tp->rcv_wup, tp->rcv_wnd);
1908 }
1909 tcp_send_ack(sk);
1910 goto discard;
1911 }
1913 if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
1914 SOCK_DEBUG(sk, "syn in established state\n");
1915 tcp_statistics.TcpInErrs++;
1916 tcp_reset(sk);
1917 return 1;
1918 }
1920 if(th->rst) {
1921 tcp_reset(sk);
1922 goto discard;
1923 }
1925 if(th->ack)
1926 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
1928 /* Process urgent data. */
1929 tcp_urg(sk, th, len);
1931 /* step 7: process the segment text */
1932 queued = tcp_data(skb, sk, len);
1934 /* This must be after tcp_data() does the skb_pull() to
1935 * remove the header size from skb->len.
1937 * Dave!!! Phrase above (and all about rcv_mss) has
1938 * nothing to do with reality. rcv_mss must measure TOTAL
1939 * size, including sacks, IP options etc. Hence, measure_rcv_mss
1940 * must occur before pulling etc., otherwise it will flap
1941 * like hell. Even putting it before tcp_data is wrong,
1942 * it should use skb->tail - skb->nh.raw instead.
1943 * --ANK (980805)
1945 * BTW I broke it. Now all TCP options are handled equally
1946 * in mss_clamp calculations (i.e. ignored, rfc1122),
1947 * and mss_cache does include all of them (i.e. tstamps)
1948 * except for sacks, to calculate effective mss faster.
1949 * --ANK (980805)
1950 */
1951 tcp_measure_rcv_mss(sk, skb);
1953 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
1954 if(sk->state != TCP_CLOSE) {
1955 tcp_data_snd_check(sk);
1956 tcp_ack_snd_check(sk);
1957 }
1959 if (!queued) {
1960 discard:
1961 kfree_skb(skb);
1962 }
1964 return 0;
1965 }
1967 /*
1968 * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented
1969 * as an open_request.
1970 */
1972 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
1973 struct open_request *req)
1974 {
1975 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1976 u32 flg;
1978 /* assumption: the socket is not in use.
1979 * as we checked the user count on tcp_rcv and we're
1980 * running from a soft interrupt.
1981 */
1983 /* Check for syn retransmission */
1984 flg = *(((u32 *)skb->h.th) + 3);
1986 flg &= __constant_htonl(0x00170000);
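/* The mask keeps only FIN, SYN, RST and ACK from the flags byte of this
 * header word (0x17 == FIN|SYN|RST|ACK); PSH and URG are deliberately
 * ignored by the comparison below.
 */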
1987 /* Only SYN set? */
1988 if (flg == __constant_htonl(0x00020000)) {
1989 if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
1990 /* retransmitted syn.
1991 */
1992 req->class->rtx_syn_ack(sk, req);
1993 return NULL;
1994 } else {
1995 return sk; /* Pass new SYN to the listen socket. */
1996 }
1997 }
1999 /* We know it's an ACK here */
2000 if (req->sk) {
2001 /* socket already created but not
2002 * yet accepted()...
2003 */
2004 sk = req->sk;
2005 } else {
2006 /* In theory the packet could be for a cookie, but
2007 * TIME_WAIT should guard us against this.
2008 * XXX: Nevertheless check for cookies?
2009 * This sequence number check is done again later,
2010 * but we do it here to prevent syn flood attackers
2011 * from creating big SYN_RECV sockets.
2012 */
2013 if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
2014 !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
2015 req->rcv_isn+1+req->rcv_wnd)) {
2016 req->class->send_reset(skb);
2017 return NULL;
2018 }
2020 sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
2021 tcp_dec_slow_timer(TCP_SLT_SYNACK);
2022 if (sk == NULL)
2023 return NULL;
2025 req->expires = 0UL;
2026 req->sk = sk;
2027 }
2028 skb_orphan(skb);
2029 skb_set_owner_r(skb, sk);
2030 return sk;
2031 }
2033 /*
2034 * This function implements the receiving procedure of RFC 793 for
2035 * all states except ESTABLISHED and TIME_WAIT.
2036 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
2037 * address independent.
2038 */
2040 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
2041 struct tcphdr *th, unsigned len)
2042 {
2043 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2044 int queued = 0;
2046 switch (sk->state) {
2047 case TCP_CLOSE:
2048 /* When state == CLOSED, hash lookup always fails.
2050 * But, there is a back door, the backlog queue.
2051 * If we have a sequence of packets in the backlog
2052 * during __release_sock() which have a sequence such
2053 * that:
2054 * packet X causes entry to TCP_CLOSE state
2055 * ...
2056 * packet X + N has FIN bit set
2058 * We report a (luckily) harmless error in this case.
2059 * The issue is that backlog queue processing bypasses
2060 * any hash lookups (we know which socket packets are for).
2061 * The correct behavior here is what 2.0.x did, since
2062 * a TCP_CLOSE socket does not exist. Drop the frame
2063 * and send a RST back to the other end.
2064 */
2065 return 1;
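/* A non-zero return value makes the caller (the IPv4/IPv6 receive path)
 * answer with a RST, which is exactly what the comment above asks for.
 */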
2067 case TCP_LISTEN:
2068 /* These use the socket TOS..
2069 * might want to be the received TOS
2070 */
2071 if(th->ack)
2072 return 1;
2074 if(th->syn) {
2075 if(tp->af_specific->conn_request(sk, skb, 0) < 0)
2076 return 1;
2078 /* Now we have several options: In theory there is
2079 * nothing else in the frame. KA9Q has an option to
2080 * send data with the syn, BSD accepts data with the
2081 * syn up to the [to be] advertised window and
2082 * Solaris 2.1 gives you a protocol error. For now
2083 * we just ignore it, that fits the spec precisely
2084 * and avoids incompatibilities. It would be nice in
2085 * future to drop through and process the data.
2087 * Now that TTCP is starting to be used we ought to
2088 * queue this data.
2089 * But, this leaves one open to an easy denial of
2090 * service attack, and SYN cookies can't defend
2091 * against this problem. So, we drop the data
2092 * in the interest of security over speed.
2093 */
2094 goto discard;
2095 }
2097 goto discard;
2098 break;
2100 case TCP_SYN_SENT:
2101 /* SYN sent means we have to look for a suitable ack and
2102 * either reset for bad matches or go to connected.
2103 * The SYN_SENT case is unusual and should
2104 * not be in line code. [AC]
2105 */
2106 if(th->ack) {
2107 /* rfc793:
2108 * "If the state is SYN-SENT then
2109 * first check the ACK bit
2110 * If the ACK bit is set
2111 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
2112 * a reset (unless the RST bit is set, if so drop
2113 * the segment and return)"
2115 * I cite this place to emphasize one essential
2116 * detail, this check is different from the one
2117 * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
2118 * SEG.ACK == SND.UNA == ISS is invalid in SYN-SENT,
2119 * because we have no previous data sent before SYN.
2120 * --ANK(990513)
2122 * We do not send data with SYN, so that RFC-correct
2123 * test reduces to:
2124 */
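/* The SYN consumes exactly one sequence number and nothing else has been
 * sent, so SND.NXT == ISS+1 here and the only acceptable ack is
 * tp->snd_nxt.
 */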
2125 if (sk->zapped ||
2126 TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
2127 return 1;
2129 /* Now ACK is acceptable.
2131 * "If the RST bit is set
2132 * If the ACK was acceptable then signal the user "error:
2133 * connection reset", drop the segment, enter CLOSED state,
2134 * delete TCB, and return."
2135 */
2137 if (th->rst) {
2138 tcp_reset(sk);
2139 goto discard;
2140 }
2142 /* rfc793:
2143 * "fifth, if neither of the SYN or RST bits is set then
2144 * drop the segment and return."
2146 * See note below!
2147 * --ANK(990513)
2148 */
2150 if (!th->syn)
2151 goto discard;
2153 /* rfc793:
2154 * "If the SYN bit is on ...
2155 * are acceptable then ...
2156 * (our SYN has been ACKed), change the connection
2157 * state to ESTABLISHED..."
2159 * Do you see? SYN-less ACKs in SYN-SENT state are
2160 * completely ignored.
2162 * The bug causing stalled SYN-SENT sockets
2163 * was here: tcp_ack advanced snd_una and canceled
2164 * retransmit timer, so that bare ACK received
2165 * in SYN-SENT state (even with invalid ack==ISS,
2166 * because tcp_ack check is too weak for SYN-SENT)
2167 * causes moving socket to invalid semi-SYN-SENT,
2168 * semi-ESTABLISHED state and connection hangs.
2170 * There exist buggy stacks, which really send
2171 * such ACKs: e.g. 202.226.91.94 (okigate.oki.co.jp)
2172 * Actually, if this host did not try to get something
2173 * from ftp.inr.ac.ru I'd never find this bug 8)
2175 * --ANK (990514)
2176 */
2178 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2179 tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
2180 TCP_SKB_CB(skb)->ack_seq, len);
2182 /* Ok.. it's good. Set up sequence numbers and
2183 * move to established.
2184 */
2185 tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2186 tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2188 /* RFC1323: The window in SYN & SYN/ACK segments is
2189 * never scaled.
2190 */
2191 tp->snd_wnd = htons(th->window);
2192 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2193 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2194 tp->fin_seq = TCP_SKB_CB(skb)->seq;
2196 tcp_set_state(sk, TCP_ESTABLISHED);
2197 tcp_parse_options(sk, th, tp, 0);
2199 if (tp->wscale_ok == 0) {
2200 tp->snd_wscale = tp->rcv_wscale = 0;
2201 tp->window_clamp = min(tp->window_clamp,65535);
2202 }
2204 if (tp->tstamp_ok) {
2205 tp->tcp_header_len =
2206 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2207 } else
2208 tp->tcp_header_len = sizeof(struct tcphdr);
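/* The timestamp option adds TCPOLEN_TSTAMP_ALIGNED (12) bytes to every
 * segment, growing the header from 20 to 32 bytes.
 */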
2209 if (tp->saw_tstamp) {
2210 tp->ts_recent = tp->rcv_tsval;
2211 tp->ts_recent_stamp = tcp_time_stamp;
2212 }
2214 /* Can't be earlier, doff would be wrong. */
2215 tcp_send_ack(sk);
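/* "Earlier" presumably means before tcp_header_len above is final: an ACK
 * built before the option setup would carry a data offset that does not
 * account for the timestamp option.
 */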
2217 sk->dport = th->source;
2218 tp->copied_seq = tp->rcv_nxt;
2220 if(!sk->dead) {
2221 sk->state_change(sk);
2222 sock_wake_async(sk->socket, 0);
2223 }
2224 } else {
2225 if(th->syn && !th->rst) {
2226 /* The previous version of the code
2227 * checked for "connecting to self"
2228 * here. that check is done now in
2229 * tcp_connect.
2230 */
2231 tcp_set_state(sk, TCP_SYN_RECV);
2232 tcp_parse_options(sk, th, tp, 0);
2233 if (tp->saw_tstamp) {
2234 tp->ts_recent = tp->rcv_tsval;
2235 tp->ts_recent_stamp = tcp_time_stamp;
2236 }
2238 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2239 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2241 /* RFC1323: The window in SYN & SYN/ACK segments is
2242 * never scaled.
2243 */
2244 tp->snd_wnd = htons(th->window);
2245 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2247 tcp_send_synack(sk);
2248 } else
2249 break;
2250 }
2252 /* tp->tcp_header_len and tp->mss_clamp
2253 probably changed, synchronize mss.
2254 */
2255 tcp_sync_mss(sk, tp->pmtu_cookie);
2256 tp->rcv_mss = tp->mss_cache;
2258 if (sk->state == TCP_SYN_RECV)
2259 goto discard;
2261 goto step6;
2262 }
2264 /* Parse the tcp_options present on this header.
2265 * By this point we really only expect timestamps.
2266 * Note that this really has to be here and not later for PAWS
2267 * (RFC1323) to work.
2268 */
2269 if (tcp_fast_parse_options(sk, th, tp)) {
2270 /* NOTE: assumes saw_tstamp is never set if we didn't
2271 * negotiate the option. tcp_fast_parse_options() must
2272 * guarantee this.
2273 */
2274 if (tp->saw_tstamp) {
2275 if (tcp_paws_discard(tp, th, len)) {
2276 tcp_statistics.TcpInErrs++;
2277 if (!th->rst) {
2278 tcp_send_ack(sk);
2279 goto discard;
2280 }
2281 }
2282 tcp_replace_ts_recent(sk, tp,
2283 TCP_SKB_CB(skb)->seq,
2284 TCP_SKB_CB(skb)->end_seq);
2285 }
2286 }
2288 /* The silly FIN test here is necessary to see an advancing ACK in
2289 * retransmitted FIN frames properly. Consider the following sequence:
2291 * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
2292 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
2293 * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
2294 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
2296 * At this point the connection will deadlock with host1 believing
2297 * that its FIN is never ACK'd, and thus it will retransmit its FIN
2298 * forever. The following fix is from Taral (taral@taral.net).
2299 */
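/* The fix is the extra "th->fin && end_seq == rcv_nxt" clause below: a
 * retransmitted FIN ending exactly at rcv_nxt is let through even though
 * tcp_sequence() rejects it, so its advancing ACK still gets processed.
 */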
2301 /* step 1: check sequence number */
2302 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) &&
2303 !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) {
2304 if (!th->rst) {
2305 tcp_send_ack(sk);
2306 goto discard;
2307 }
2308 }
2310 /* step 2: check RST bit */
2311 if(th->rst) {
2312 tcp_reset(sk);
2313 goto discard;
2314 }
2316 /* step 3: check security and precedence [ignored] */
2318 /* step 4:
2320 * Check for a SYN, and ensure it matches the SYN we were
2321 * first sent. We have to handle the rather unusual (but valid)
2322 * sequence that KA9Q derived products may generate of
2324 * SYN
2325 * SYN|ACK Data
2326 * ACK (lost)
2327 * SYN|ACK Data + More Data
2328 * .. we must ACK not RST...
2330 * We keep syn_seq as the sequence space occupied by the
2331 * original syn.
2332 */
2334 if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2335 tcp_reset(sk);
2336 return 1;
2337 }
2339 /* step 5: check the ACK field */
2340 if (th->ack) {
2341 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2342 TCP_SKB_CB(skb)->ack_seq, len);
2344 switch(sk->state) {
2345 case TCP_SYN_RECV:
2346 if (acceptable) {
2347 tcp_set_state(sk, TCP_ESTABLISHED);
2348 sk->dport = th->source;
2349 tp->copied_seq = tp->rcv_nxt;
2351 if(!sk->dead)
2352 sk->state_change(sk);
2354 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
2355 tp->snd_wnd = htons(th->window) << tp->snd_wscale;
2356 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2357 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2359 } else {
2360 SOCK_DEBUG(sk, "bad ack\n");
2361 return 1;
2362 }
2363 break;
2365 case TCP_FIN_WAIT1:
2366 if (tp->snd_una == tp->write_seq) {
2367 sk->shutdown |= SEND_SHUTDOWN;
2368 tcp_set_state(sk, TCP_FIN_WAIT2);
2369 if (!sk->dead)
2370 sk->state_change(sk);
2371 else
2372 tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
2373 }
2374 break;
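/* Our FIN is fully acknowledged once snd_una catches up with write_seq.
 * For a dead (no user attached) socket a TIME_CLOSE timer is armed
 * instead of a wakeup, apparently so an orphaned socket cannot sit in
 * FIN_WAIT2 forever (see sysctl_tcp_fin_timeout).
 */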
2376 case TCP_CLOSING:
2377 if (tp->snd_una == tp->write_seq) {
2378 tcp_time_wait(sk);
2379 goto discard;
2380 }
2381 break;
2383 case TCP_LAST_ACK:
2384 if (tp->snd_una == tp->write_seq) {
2385 sk->shutdown = SHUTDOWN_MASK;
2386 tcp_set_state(sk,TCP_CLOSE);
2387 if (!sk->dead)
2388 sk->state_change(sk);
2389 goto discard;
2390 }
2391 break;
2392 }
2393 } else
2394 goto discard;
2396 step6:
2397 /* step 6: check the URG bit */
2398 tcp_urg(sk, th, len);
2400 /* step 7: process the segment text */
2401 switch (sk->state) {
2402 case TCP_CLOSE_WAIT:
2403 case TCP_CLOSING:
2404 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
2405 break;
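/* Intentional fall-through: in CLOSE_WAIT/CLOSING, data lying before
 * fin_seq is not dropped here but continues into the FIN_WAIT checks and
 * finally into tcp_data() below.
 */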
2407 case TCP_FIN_WAIT1:
2408 case TCP_FIN_WAIT2:
2409 /* RFC 793 says to queue data in these states,
2410 * RFC 1122 says we MUST send a reset.
2411 * BSD 4.4 also does reset.
2412 */
2413 if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
2414 if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
2415 tcp_reset(sk);
2416 return 1;
2417 }
2418 }
2420 case TCP_ESTABLISHED:
2421 queued = tcp_data(skb, sk, len);
2423 /* This must be after tcp_data() does the skb_pull() to
2424 * remove the header size from skb->len.
2425 */
2426 tcp_measure_rcv_mss(sk, skb);
2427 break;
2428 }
2430 tcp_data_snd_check(sk);
2431 tcp_ack_snd_check(sk);
2433 if (!queued) {
2434 discard:
2435 kfree_skb(skb);
2436 }
2437 return 0;