Linux 2.1.127: net/ipv4/tcp_input.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.136 1998/11/07 14:36:18 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data for which there is not
51 * enough room. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
60 #include <linux/config.h>
61 #include <linux/mm.h>
62 #include <linux/sysctl.h>
63 #include <net/tcp.h>
64 #include <linux/ipsec.h>
66 #ifdef CONFIG_SYSCTL
67 #define SYNC_INIT 0 /* let the user enable it */
68 #else
69 #define SYNC_INIT 1
70 #endif
72 extern int sysctl_tcp_fin_timeout;
74 /* These are on by default so the code paths get tested.
75 * For the final 2.2 this may be undone at our discretion. -DaveM
77 int sysctl_tcp_timestamps = 1;
78 int sysctl_tcp_window_scaling = 1;
79 int sysctl_tcp_sack = 1;
81 int sysctl_tcp_syncookies = SYNC_INIT;
82 int sysctl_tcp_stdurg;
83 int sysctl_tcp_rfc1337;
85 static int prune_queue(struct sock *sk);
87 /* There is something which you must keep in mind when you analyze the
88 * behavior of the tp->ato delayed ack timeout interval. When a
89 * connection starts up, we want to ack as quickly as possible. The
90 * problem is that "good" TCP's do slow start at the beginning of data
91 * transmission. This means that until we send the first few ACK's the
92 * sender will sit on his end and only queue most of his data, because
93 * he can only send snd_cwnd unacked packets at any given time. For
94 * each ACK we send, he increments snd_cwnd and transmits more of his
95 * queue. -DaveM
97 static void tcp_delack_estimator(struct tcp_opt *tp)
99 if(tp->ato == 0) {
100 tp->lrcvtime = jiffies;
102 /* Help the sender leave slow start quickly;
103 * this sets our initial ato value.
105 tcp_enter_quickack_mode(tp);
106 } else {
107 int m = jiffies - tp->lrcvtime;
109 tp->lrcvtime = jiffies;
110 if(m <= 0)
111 m = 1;
112 if(m > tp->rto)
113 tp->ato = tp->rto;
114 else
115 tp->ato = (tp->ato >> 1) + m;
117 /* We are not in "quick ack" mode. */
118 if(tp->ato <= (HZ/100))
119 tp->ato = ((HZ/100)*2);
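/* For illustration, assuming HZ=100: a previous ato of 40 jiffies and
 * an inter-arrival gap m of 10 jiffies gives ato = (40 >> 1) + 10 = 30,
 * while the final check above keeps ato from dropping below 2 jiffies
 * (20ms) when the measured gaps become very small.
 */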
124 * Remember to send an ACK later.
126 static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
127 struct sk_buff *skb)
129 tp->delayed_acks++;
130 /* Tiny-grams with PSH set make us ACK quickly. */
131 if(th->psh && (skb->len < (tp->mss_cache >> 1)))
132 tp->ato = HZ/50;
135 /* Called to compute a smoothed rtt estimate. The data fed to this
136 * routine either comes from timestamps, or from segments that were
137 * known _not_ to have been retransmitted [see Karn/Partridge
138 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
139 * piece by Van Jacobson.
140 * NOTE: the next three routines used to be one big routine.
141 * To save cycles in the RFC 1323 implementation it was better to break
142 * it up into three procedures. -- erics
145 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
147 long m = mrtt; /* RTT */
149 /* The following amusing code comes from Jacobson's
150 * article in SIGCOMM '88. Note that rtt and mdev
151 * are scaled versions of rtt and mean deviation.
152 * This is designed to be as fast as possible
153 * m stands for "measurement".
155 * In a 1990 paper the rto value is changed to:
156 * RTO = rtt + 4 * mdev
158 if(m == 0)
159 m = 1;
160 if (tp->srtt != 0) {
161 m -= (tp->srtt >> 3); /* m is now error in rtt est */
162 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
163 if (m < 0)
164 m = -m; /* m is now abs(error) */
165 m -= (tp->mdev >> 2); /* similar update on mdev */
166 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
167 } else {
168 /* no previous measure. */
169 tp->srtt = m<<3; /* take the measured time to be rtt */
170 tp->mdev = m<<2; /* make sure rto = 3*rtt */
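/* In unscaled terms the above is the usual pair of filters:
 *	err = m - srtt;  srtt += err/8;  mdev += (|err| - mdev)/4;
 * with srtt kept as 8*srtt and mdev as 4*mdev so the divides become
 * shifts. Illustrative numbers: a first sample of 40 jiffies stores
 * srtt = 320 and mdev = 160; a second sample of 56 jiffies then moves
 * srtt to 336 (i.e. 42 in unscaled jiffies).
 */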
174 /* Calculate rto without backoff. This is the second half of Van Jacobson's
175 * routine referred to above.
178 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
180 tp->rto = (tp->srtt >> 3) + tp->mdev;
181 tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
185 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
186 * on packet lifetime in the internet. We need the HZ/5 lower
187 * bound to behave correctly against BSD stacks with a fixed
188 * delayed ack.
189 * FIXME: It's not entirely clear this lower bound is the best
190 * way to avoid the problem. Is it possible to drop the lower
191 * bound and still avoid trouble with BSD stacks? Perhaps
192 * some modification to the RTO calculation that takes delayed
193 * ack bias into account? This needs serious thought. -- erics
195 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
197 if (tp->rto > 120*HZ)
198 tp->rto = 120*HZ;
199 if (tp->rto < HZ/5)
200 tp->rto = HZ/5;
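/* With HZ=100 this clamps rto to the range [20, 12000] jiffies, i.e.
 * between 200ms and the 120 second bound on packet lifetime mentioned
 * above.
 */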
203 /* WARNING: this must not be called if tp->saw_timestamp was false. */
204 extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
205 __u32 start_seq, __u32 end_seq)
207 /* From draft-ietf-tcplw-high-performance: the correct
208 * test is last_ack_sent <= end_seq.
209 * (RFC1323 stated last_ack_sent < end_seq.)
211 * HOWEVER: The current check contradicts the draft statements.
212 * It has been done for good reasons.
213 * The implemented check improves security and eliminates
214 * unnecessary RTT overestimation.
215 * 1998/06/27 Andrey V. Savochkin <saw@msu.ru>
217 if (!before(end_seq, tp->last_ack_sent - sk->rcvbuf) &&
218 !after(start_seq, tp->rcv_wup + tp->rcv_wnd)) {
219 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
220 * extra check below makes sure this can only happen
221 * for pure ACK frames. -DaveM
223 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
224 tp->ts_recent = tp->rcv_tsval;
225 tp->ts_recent_stamp = jiffies;
230 #define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
232 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
234 /* ts_recent must be younger than 24 days */
235 return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
236 (((s32)(tp->rcv_tsval-tp->ts_recent) < 0) &&
237 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
238 (len != (th->doff * 4))));
242 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
244 u32 end_window = tp->rcv_wup + tp->rcv_wnd;
246 if (tp->rcv_wnd &&
247 after(end_seq, tp->rcv_nxt) &&
248 before(seq, end_window))
249 return 1;
250 if (seq != end_window)
251 return 0;
252 return (seq == end_seq);
255 /* This function checks to see if the tcp header is actually acceptable. */
256 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
258 if (seq == tp->rcv_nxt)
259 return (tp->rcv_wnd || (end_seq == seq));
261 return __tcp_sequence(tp, seq, end_seq);
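/* Taken together, the two routines above are a compact form of the
 * RFC793 (page 69) acceptability test: a segment starting exactly at
 * rcv_nxt is acceptable whenever the window is open (or it carries no
 * data); otherwise it must end beyond rcv_nxt and start before
 * rcv_wup + rcv_wnd, with the edge cases handled in __tcp_sequence().
 */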
264 /* When we get a reset we do this. */
265 static void tcp_reset(struct sock *sk, struct sk_buff *skb)
267 sk->zapped = 1;
269 /* We want the right error as BSD sees it (and indeed as we do). */
270 switch (sk->state) {
271 case TCP_SYN_SENT:
272 sk->err = ECONNREFUSED;
273 break;
274 case TCP_CLOSE_WAIT:
275 sk->err = EPIPE;
276 break;
277 default:
278 sk->err = ECONNRESET;
280 tcp_set_state(sk,TCP_CLOSE);
281 sk->shutdown = SHUTDOWN_MASK;
282 if (!sk->dead)
283 sk->state_change(sk);
286 /* This tags the retransmission queue when SACKs arrive. */
287 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
289 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
290 int i = nsacks;
292 while(i--) {
293 struct sk_buff *skb = skb_peek(&sk->write_queue);
294 __u32 start_seq = ntohl(sp->start_seq);
295 __u32 end_seq = ntohl(sp->end_seq);
296 int fack_count = 0;
298 while((skb != NULL) &&
299 (skb != tp->send_head) &&
300 (skb != (struct sk_buff *)&sk->write_queue)) {
301 /* The retransmission queue is always in order, so
302 * we can short-circuit the walk early.
304 if(!before(start_seq, TCP_SKB_CB(skb)->end_seq))
305 break;
307 /* We play it conservative; we don't allow SACKs to partially
308 * tag a sequence space.
310 fack_count++;
311 if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
312 !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
313 /* If this was a retransmitted frame, account for it. */
314 if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
315 tp->retrans_out--;
316 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
318 /* RULE: All new SACKs will either decrease retrans_out
319 * or advance fackets_out.
321 if(fack_count > tp->fackets_out)
322 tp->fackets_out = fack_count;
324 skb = skb->next;
326 sp++; /* Move on to the next SACK block. */
330 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
331 * But, this can also be called on packets in the established flow when
332 * the fast version below fails.
334 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
336 unsigned char *ptr;
337 int length=(th->doff*4)-sizeof(struct tcphdr);
339 ptr = (unsigned char *)(th + 1);
340 tp->saw_tstamp = 0;
342 while(length>0) {
343 int opcode=*ptr++;
344 int opsize;
346 switch (opcode) {
347 case TCPOPT_EOL:
348 return;
349 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
350 length--;
351 continue;
352 default:
353 opsize=*ptr++;
354 if (opsize < 2) /* "silly options" */
355 return;
356 if (opsize > length)
357 break; /* don't parse partial options */
358 switch(opcode) {
359 case TCPOPT_MSS:
360 if(opsize==TCPOLEN_MSS && th->syn) {
361 u16 in_mss = ntohs(*(__u16 *)ptr);
362 if (in_mss == 0)
363 in_mss = 536;
364 if (tp->mss_clamp > in_mss)
365 tp->mss_clamp = in_mss;
367 break;
368 case TCPOPT_WINDOW:
369 if(opsize==TCPOLEN_WINDOW && th->syn)
370 if (!no_fancy && sysctl_tcp_window_scaling) {
371 tp->wscale_ok = 1;
372 tp->snd_wscale = *(__u8 *)ptr;
373 if(tp->snd_wscale > 14) {
374 if(net_ratelimit())
375 printk("tcp_parse_options: Illegal window "
376 "scaling value %d >14 received.",
377 tp->snd_wscale);
378 tp->snd_wscale = 14;
381 break;
382 case TCPOPT_TIMESTAMP:
383 if(opsize==TCPOLEN_TIMESTAMP) {
384 if (sysctl_tcp_timestamps && !no_fancy) {
385 tp->tstamp_ok = 1;
386 tp->saw_tstamp = 1;
387 tp->rcv_tsval = ntohl(*(__u32 *)ptr);
388 tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
391 break;
392 case TCPOPT_SACK_PERM:
393 if(opsize==TCPOLEN_SACK_PERM && th->syn) {
394 if (sysctl_tcp_sack && !no_fancy) {
395 tp->sack_ok = 1;
396 tp->num_sacks = 0;
399 break;
401 case TCPOPT_SACK:
402 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
403 sysctl_tcp_sack && (sk != NULL) && !th->syn) {
404 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
406 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
407 int num_sacks = sack_bytes >> 3;
408 struct tcp_sack_block *sackp;
410 sackp = (struct tcp_sack_block *)ptr;
411 tcp_sacktag_write_queue(sk, sackp, num_sacks);
415 ptr+=opsize-2;
416 length-=opsize;
421 /* Fast parse options. This hopes to only see timestamps.
422 * If it is wrong it falls back on tcp_parse_options().
424 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
426 /* If we didn't send out any options ignore them all. */
427 if (tp->tcp_header_len == sizeof(struct tcphdr))
428 return 0;
429 if (th->doff == sizeof(struct tcphdr)>>2) {
430 tp->saw_tstamp = 0;
431 return 0;
432 } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
433 __u32 *ptr = (__u32 *)(th + 1);
434 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
435 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
436 tp->saw_tstamp = 1;
437 tp->rcv_tsval = ntohl(*++ptr);
438 tp->rcv_tsecr = ntohl(*++ptr);
439 return 1;
442 tcp_parse_options(sk, th, tp, 0);
443 return 1;
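/* The constant tested above corresponds to the option bytes
 * 01 01 08 0a (NOP, NOP, kind 8, length 10), the timestamp layout
 * recommended in RFC1323 appendix A, sitting immediately after the
 * basic 20 byte header.
 */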
446 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
447 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
448 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
449 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
451 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
453 if (tp->dup_acks > 3)
454 tp->snd_cwnd = (tp->snd_ssthresh);
456 tp->dup_acks = 0;
459 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
460 * retransmit timer fires.
462 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
464 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
466 /* Note: If not_dup is set this implies we got a
467 * data carrying packet or a window update.
468 * This carries no new information about possible
469 * lost packets, so we have to ignore it for the purposes
470 * of counting duplicate acks. Ideally this does not imply we
471 * should stop our fast retransmit phase; more acks may come
472 * later without data to help us. Unfortunately this would make
473 * the code below much more complex. For now if I see such
474 * a packet I clear the fast retransmit phase.
476 if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
477 /* This is the standard reno style fast retransmit branch. */
479 /* 1. When the third duplicate ack is received, set ssthresh
480 * to one half the current congestion window, but no less
481 * than two segments. Retransmit the missing segment.
483 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
484 tp->dup_acks++;
485 if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
486 tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
487 tp->snd_cwnd = (tp->snd_ssthresh + 3);
488 tp->high_seq = tp->snd_nxt;
489 if(!tp->fackets_out)
490 tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
491 else
492 tcp_fack_retransmit(sk);
493 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
497 /* 2. Each time another duplicate ACK arrives, increment
498 * cwnd by the segment size. [...] Transmit a packet...
500 * Packet transmission will be done on normal flow processing
501 * since we're not in "retransmit mode". We do not use duplicate
502 * ACKs to artificially inflate the congestion window when
503 * doing FACK.
505 if (tp->dup_acks > 3) {
506 if(!tp->fackets_out) {
507 tp->snd_cwnd++;
508 } else {
509 /* Fill any further holes which may have appeared.
510 * We may want to change this to run every further
511 * multiple-of-3 dup ack increments, to be more robust
512 * against out-of-order packet delivery. -DaveM
514 tcp_fack_retransmit(sk);
517 } else if (tp->high_seq != 0) {
518 /* In this branch we deal with clearing the Floyd style
519 * block on duplicate fast retransmits, and if requested
520 * we do Hoe style secondary fast retransmits.
522 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
523 /* Once we have acked all the packets up to high_seq
524 * we are done with this fast retransmit phase.
525 * Alternatively data arrived. In this case we
526 * have to abort the fast retransmit attempt.
527 * Note that we do want to accept a window
528 * update since this is expected with Hoe's algorithm.
530 clear_fast_retransmit(tp);
532 /* After we have cleared up to high_seq we can
533 * clear the Floyd style block.
535 if (!before(ack, tp->high_seq)) {
536 tp->high_seq = 0;
537 tp->fackets_out = 0;
539 } else if (tp->dup_acks >= 3) {
540 if (!tp->fackets_out) {
541 /* Hoe Style. We didn't ack the whole
542 * window. Take this as a cue that
543 * another packet was lost and retransmit it.
544 * Don't muck with the congestion window here.
545 * Note that we have to be careful not to
546 * act if this was a window update and it
547 * didn't ack new data, since this does
548 * not indicate a packet left the system.
549 * We can test this by just checking
550 * if ack changed from snd_una, since
551 * the only way to get here without advancing
552 * from snd_una is if this was a window update.
554 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
555 tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
556 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
558 } else {
559 /* FACK style, fill any remaining holes in
560 * receiver's queue.
562 tcp_fack_retransmit(sk);
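/* Illustrative Reno numbers for the code above: if snd_cwnd is 16 when
 * the third duplicate ACK arrives, snd_ssthresh becomes 8 and snd_cwnd
 * is set to 8 + 3 = 11; each further duplicate ACK (when not doing
 * FACK) inflates snd_cwnd by one segment, and clear_fast_retransmit()
 * later deflates it back to snd_ssthresh.
 */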
568 /* This is Jacobson's slow start and congestion avoidance.
569 * SIGCOMM '88, p. 328.
571 static void tcp_cong_avoid(struct tcp_opt *tp)
573 if (tp->snd_cwnd <= tp->snd_ssthresh) {
574 /* In "safe" area, increase. */
575 tp->snd_cwnd++;
576 } else {
577 /* In dangerous area, increase slowly.
578 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
580 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
581 tp->snd_cwnd++;
582 tp->snd_cwnd_cnt=0;
583 } else
584 tp->snd_cwnd_cnt++;
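/* In the congestion avoidance branch this is roughly one extra segment
 * per window of ACKs: snd_cwnd_cnt must count up to snd_cwnd before
 * snd_cwnd grows by one, approximating cwnd += 1/cwnd per ACK.
 */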
588 /* Remove acknowledged frames from the retransmission queue. */
589 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
590 __u32 *seq, __u32 *seq_rtt)
592 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
593 struct sk_buff *skb;
594 unsigned long now = jiffies;
595 int acked = 0;
597 while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
598 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
599 __u8 sacked = scb->sacked;
601 /* If our packet is before the ack sequence we can
602 * discard it as it's confirmed to have arrived at
603 * the other end.
605 if (after(scb->end_seq, ack))
606 break;
608 /* Initial outgoing SYN's get put onto the write_queue
609 * just like anything else we transmit. It is not
610 * true data, and if we misinform our callers that
611 * this ACK acks real data, we will erroneously exit
612 * connection startup slow start one packet too
613 * quickly. This is severely frowned upon behavior.
615 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
616 tp->retrans_out--;
617 if(!(scb->flags & TCPCB_FLAG_SYN)) {
618 acked |= FLAG_DATA_ACKED;
619 if(sacked & TCPCB_SACKED_RETRANS)
620 acked |= FLAG_RETRANS_DATA_ACKED;
621 if(tp->fackets_out)
622 tp->fackets_out--;
623 } else {
624 tp->retrans_head = NULL;
626 tp->packets_out--;
627 *seq = scb->seq;
628 *seq_rtt = now - scb->when;
629 __skb_unlink(skb, skb->list);
630 kfree_skb(skb);
633 if (acked)
634 tp->retrans_head = NULL;
635 return acked;
638 static void tcp_ack_probe(struct sock *sk, __u32 ack)
640 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
642 /* Our probe was answered. */
643 tp->probes_out = 0;
645 /* Did the ACK open a usable window? */
647 /* should always be non-null */
648 if (tp->send_head != NULL &&
649 !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
650 tp->backoff = 0;
651 tp->pending = 0;
652 tcp_clear_xmit_timer(sk, TIME_PROBE0);
653 } else {
654 tcp_reset_xmit_timer(sk, TIME_PROBE0,
655 min(tp->rto << tp->backoff, 120*HZ));
659 /* Read draft-ietf-tcplw-high-performance before mucking
660 * with this code. (Supersedes RFC1323)
662 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
663 u32 seq, u32 ack, int flag)
665 __u32 seq_rtt;
667 /* RTTM Rule: A TSecr value received in a segment is used to
668 * update the averaged RTT measurement only if the segment
669 * acknowledges some new data, i.e., only if it advances the
670 * left edge of the send window.
672 * See draft-ietf-tcplw-high-performance-00, section 3.3.
673 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
675 if (!(flag & FLAG_DATA_ACKED))
676 return;
678 seq_rtt = jiffies-tp->rcv_tsecr;
679 tcp_rtt_estimator(tp, seq_rtt);
680 if (tp->retransmits) {
681 if (tp->packets_out == 0) {
682 tp->retransmits = 0;
683 tp->fackets_out = 0;
684 tp->retrans_out = 0;
685 tp->backoff = 0;
686 tcp_set_rto(tp);
687 } else {
688 /* Still retransmitting, use backoff */
689 tcp_set_rto(tp);
690 tp->rto = tp->rto << tp->backoff;
692 } else {
693 tcp_set_rto(tp);
694 tcp_cong_avoid(tp);
696 /* NOTE: safe here so long as cong_ctl doesn't use rto */
697 tcp_bound_rto(tp);
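/* Note that the seq_rtt computation above assumes tsval was stamped
 * with jiffies on transmit, so the echoed rcv_tsecr is the send time
 * of the newly acknowledged data and jiffies - rcv_tsecr is an RTT
 * sample measured in jiffies.
 */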
700 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
702 struct sk_buff *skb = skb_peek(&sk->write_queue);
703 long when = tp->rto - (jiffies - TCP_SKB_CB(skb)->when);
705 /* Some data was ACK'd, if still retransmitting (due to a
706 * timeout), resend more of the retransmit queue. The
707 * congestion window is handled properly by that code.
709 if (tp->retransmits) {
710 tp->retrans_head = NULL;
711 tcp_xmit_retransmit_queue(sk);
712 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
713 } else {
714 tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
718 /* This routine deals with incoming acks, but not outgoing ones. */
719 static int tcp_ack(struct sock *sk, struct tcphdr *th,
720 u32 ack_seq, u32 ack, int len)
722 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
723 int flag = 0;
724 u32 seq = 0;
725 u32 seq_rtt = 0;
727 if(sk->zapped)
728 return(1); /* Dead, can't ack any more so why bother */
730 if (tp->pending == TIME_KEEPOPEN)
731 tp->probes_out = 0;
733 tp->rcv_tstamp = jiffies;
735 /* If the ack is newer than sent or older than previous acks
736 * then we can probably ignore it.
738 if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
739 goto uninteresting_ack;
741 dst_confirm(sk->dst_cache);
743 /* If the segment carries data, set FLAG_DATA. */
744 if (len != th->doff*4) {
745 flag |= FLAG_DATA;
746 tcp_delack_estimator(tp);
749 /* Update our send window. */
751 /* This is the window update code as per RFC 793
752 * snd_wl{1,2} are used to prevent unordered
753 * segments from shrinking the window
755 if (before(tp->snd_wl1, ack_seq) ||
756 (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
757 u32 nwin = ntohs(th->window) << tp->snd_wscale;
759 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
760 flag |= FLAG_WIN_UPDATE;
761 tp->snd_wnd = nwin;
763 tp->snd_wl1 = ack_seq;
764 tp->snd_wl2 = ack;
766 if (nwin > tp->max_window)
767 tp->max_window = nwin;
771 /* We passed data and got it acked, remove any soft error
772 * log. Something worked...
774 sk->err_soft = 0;
776 /* If this ack opens up a zero window, clear backoff. It was
777 * being used to time the probes, and is probably far higher than
778 * it needs to be for normal retransmission.
780 if (tp->pending == TIME_PROBE0)
781 tcp_ack_probe(sk, ack);
783 /* See if we can take anything off of the retransmit queue. */
784 flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
786 /* If we have a timestamp, we always do rtt estimates. */
787 if (tp->saw_tstamp) {
788 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
789 } else {
790 /* If we were retransmitting, don't count the rtt estimate. */
791 if (tp->retransmits) {
792 if (tp->packets_out == 0) {
793 tp->retransmits = 0;
794 tp->fackets_out = 0;
795 tp->retrans_out = 0;
797 } else {
798 /* We don't have a timestamp. Can only use
799 * packets that are not retransmitted to determine
800 * rtt estimates. Also, we must not reset the
801 * backoff for rto until we get a non-retransmitted
802 * packet. This allows us to deal with a situation
803 * where the network delay has increased suddenly.
804 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
806 if (flag & FLAG_DATA_ACKED) {
807 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
808 tp->backoff = 0;
809 tcp_rtt_estimator(tp, seq_rtt);
810 tcp_set_rto(tp);
811 tcp_bound_rto(tp);
813 tcp_cong_avoid(tp);
818 if (tp->packets_out) {
819 if (flag & FLAG_DATA_ACKED)
820 tcp_ack_packets_out(sk, tp);
821 } else {
822 tcp_clear_xmit_timer(sk, TIME_RETRANS);
825 flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
826 if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
827 (tp->high_seq != 0)) {
828 tcp_fast_retrans(sk, ack, flag);
829 } else {
830 /* Clear any aborted fast retransmit starts. */
831 tp->dup_acks = 0;
833 /* Remember the highest ack received. */
834 tp->snd_una = ack;
835 return 1;
837 uninteresting_ack:
838 SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
839 return 0;
842 /* New-style handling of TIME_WAIT sockets. */
843 extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
844 extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
845 extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
847 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
849 /* Unlink from various places. */
850 if(tw->bind_next)
851 tw->bind_next->bind_pprev = tw->bind_pprev;
852 *(tw->bind_pprev) = tw->bind_next;
853 if(tw->tb->owners == NULL)
854 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
856 if(tw->next)
857 tw->next->pprev = tw->pprev;
858 *tw->pprev = tw->next;
860 /* We decremented the prot->inuse count when we entered TIME_WAIT
861 * and the sock from which this came was destroyed.
863 tw->sklist_next->sklist_prev = tw->sklist_prev;
864 tw->sklist_prev->sklist_next = tw->sklist_next;
866 /* Ok, now free it up. */
867 kmem_cache_free(tcp_timewait_cachep, tw);
870 /* We come here as a special case from the AF specific TCP input processing,
871 * and the SKB has no owner. Essentially handling this is very simple,
872 * we just keep silently eating rx'd packets until none show up for the
873 * entire timeout period. The only special cases are for BSD TIME_WAIT
874 * reconnects and SYN/RST bits being set in the TCP header.
876 int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
877 struct tcphdr *th, unsigned len)
879 /* RFC 1122:
880 * "When a connection is [...] on TIME-WAIT state [...]
881 * [a TCP] MAY accept a new SYN from the remote TCP to
882 * reopen the connection directly, if it:
884 * (1) assigns its initial sequence number for the new
885 * connection to be larger than the largest sequence
886 * number it used on the previous connection incarnation,
887 * and
889 * (2) returns to TIME-WAIT state if the SYN turns out
890 * to be an old duplicate".
892 if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
893 struct sock *sk;
894 struct tcp_func *af_specific = tw->af_specific;
895 __u32 isn;
897 isn = tw->rcv_nxt + 128000;
898 if(isn == 0)
899 isn++;
900 tcp_tw_deschedule(tw);
901 tcp_timewait_kill(tw);
902 sk = af_specific->get_sock(skb, th);
903 if(sk == NULL || !ipsec_sk_policy(sk,skb))
904 return 0;
905 skb_set_owner_r(skb, sk);
906 af_specific = sk->tp_pinfo.af_tcp.af_specific;
907 if(af_specific->conn_request(sk, skb, isn) < 0)
908 return 1; /* Toss a reset back. */
909 return 0; /* Discard the frame. */
912 /* Check RST or SYN */
913 if(th->rst || th->syn) {
914 /* This is TIME_WAIT assassination, in two flavors.
915 * Oh well... nobody has a sufficient solution to this
916 * protocol bug yet.
918 if(sysctl_tcp_rfc1337 == 0) {
919 tcp_tw_deschedule(tw);
920 tcp_timewait_kill(tw);
922 if(!th->rst)
923 return 1; /* toss a reset back */
924 } else {
925 /* In this case we must reset the TIMEWAIT timer. */
926 if(th->ack)
927 tcp_tw_reschedule(tw);
929 return 0; /* Discard the frame. */
932 /* Enter the time wait state. This is always called from BH
933 * context. Essentially we whip up a timewait bucket, copy the
934 * relevant info into it from the SK, and mess with hash chains
935 * and list linkage.
937 static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
939 struct sock **head, *sktw;
941 /* Step 1: Remove SK from established hash. */
942 if(sk->next)
943 sk->next->pprev = sk->pprev;
944 *sk->pprev = sk->next;
945 sk->pprev = NULL;
946 tcp_reg_zap(sk);
948 /* Step 2: Put TW into bind hash where SK was. */
949 tw->tb = (struct tcp_bind_bucket *)sk->prev;
950 if((tw->bind_next = sk->bind_next) != NULL)
951 sk->bind_next->bind_pprev = &tw->bind_next;
952 tw->bind_pprev = sk->bind_pprev;
953 *sk->bind_pprev = (struct sock *)tw;
955 /* Step 3: Same for the protocol sklist. */
956 (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
957 (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
958 sk->sklist_next = NULL;
959 sk->prot->inuse--;
961 /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
962 head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)];
963 sktw = (struct sock *)tw;
964 if((sktw->next = *head) != NULL)
965 (*head)->pprev = &sktw->next;
966 *head = sktw;
967 sktw->pprev = head;
970 void tcp_time_wait(struct sock *sk)
972 struct tcp_tw_bucket *tw;
974 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
975 if(tw != NULL) {
976 /* Give us an identity. */
977 tw->daddr = sk->daddr;
978 tw->rcv_saddr = sk->rcv_saddr;
979 tw->bound_dev_if= sk->bound_dev_if;
980 tw->num = sk->num;
981 tw->state = TCP_TIME_WAIT;
982 tw->sport = sk->sport;
983 tw->dport = sk->dport;
984 tw->family = sk->family;
985 tw->reuse = sk->reuse;
986 tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
987 tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
989 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
990 if(tw->family == PF_INET6) {
991 memcpy(&tw->v6_daddr,
992 &sk->net_pinfo.af_inet6.daddr,
993 sizeof(struct in6_addr));
994 memcpy(&tw->v6_rcv_saddr,
995 &sk->net_pinfo.af_inet6.rcv_saddr,
996 sizeof(struct in6_addr));
998 #endif
999 /* Linkage updates. */
1000 tcp_tw_hashdance(sk, tw);
1002 /* Get the TIME_WAIT timeout firing. */
1003 tcp_tw_schedule(tw);
1005 /* CLOSE the SK. */
1006 if(sk->state == TCP_ESTABLISHED)
1007 tcp_statistics.TcpCurrEstab--;
1008 sk->state = TCP_CLOSE;
1009 net_reset_timer(sk, TIME_DONE,
1010 min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
1011 } else {
1012 /* Sorry, we're out of memory, just CLOSE this
1013 * socket up. We've got bigger problems than
1014 * non-graceful socket closings.
1016 tcp_set_state(sk, TCP_CLOSE);
1019 /* Prevent rcvmsg/sndmsg calls, and wake people up. */
1020 sk->shutdown = SHUTDOWN_MASK;
1021 if(!sk->dead)
1022 sk->state_change(sk);
1026 * Process the FIN bit. This now behaves as it is supposed to work
1027 * and the FIN takes effect when it is validly part of sequence
1028 * space. Not before when we get holes.
1030 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1031 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1032 * TIME-WAIT)
1034 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1035 * close and we go into CLOSING (and later onto TIME-WAIT)
1037 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1040 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1042 sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1044 tcp_send_ack(sk);
1046 if (!sk->dead) {
1047 sk->state_change(sk);
1048 sock_wake_async(sk->socket, 1);
1051 switch(sk->state) {
1052 case TCP_SYN_RECV:
1053 case TCP_ESTABLISHED:
1054 /* Move to CLOSE_WAIT */
1055 tcp_set_state(sk, TCP_CLOSE_WAIT);
1056 if (th->rst)
1057 sk->shutdown = SHUTDOWN_MASK;
1058 break;
1060 case TCP_CLOSE_WAIT:
1061 case TCP_CLOSING:
1062 /* Received a retransmission of the FIN, do
1063 * nothing.
1065 break;
1066 case TCP_LAST_ACK:
1067 /* RFC793: Remain in the LAST-ACK state. */
1068 break;
1070 case TCP_FIN_WAIT1:
1071 /* This case occurs when a simultaneous close
1072 * happens, we must ack the received FIN and
1073 * enter the CLOSING state.
1075 * This causes a WRITE timeout, which will either
1076 * move on to TIME_WAIT when we timeout, or resend
1077 * the FIN properly (maybe we get rid of that annoying
1078 * FIN lost hang). The TIME_WRITE code is already
1079 * correct for handling this timeout.
1081 tcp_set_state(sk, TCP_CLOSING);
1082 break;
1083 case TCP_FIN_WAIT2:
1084 /* Received a FIN -- send ACK and enter TIME_WAIT. */
1085 tcp_time_wait(sk);
1086 break;
1087 default:
1088 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1089 * cases we should never reach this piece of code.
1091 printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1092 break;
1096 /* These routines update the SACK block as out-of-order packets arrive or
1097 * in-order packets close up the sequence space.
1099 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1101 int this_sack, num_sacks = tp->num_sacks;
1102 struct tcp_sack_block *swalk = &tp->selective_acks[0];
1104 /* If more than one SACK block, see if the recent change to SP eats into
1105 * or hits the sequence space of other SACK blocks; if so, coalesce.
1107 if(num_sacks != 1) {
1108 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1109 if(swalk == sp)
1110 continue;
1112 /* First case, bottom of SP moves into top of the
1113 * sequence space of SWALK.
1115 if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1116 sp->start_seq = swalk->start_seq;
1117 goto coalesce;
1119 /* Second case, top of SP moves into bottom of the
1120 * sequence space of SWALK.
1122 if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1123 sp->end_seq = swalk->end_seq;
1124 goto coalesce;
1128 /* SP is the only SACK, or no coalescing cases found. */
1129 return;
1131 coalesce:
1132 /* Zap SWALK, by moving every further SACK up by one slot.
1133 * Decrease num_sacks.
1135 for(this_sack += 1; this_sack < num_sacks-1; this_sack++, swalk++) {
1136 struct tcp_sack_block *next = (swalk + 1);
1137 swalk->start_seq = next->start_seq;
1138 swalk->end_seq = next->end_seq;
1140 tp->num_sacks--;
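/* Example of the coalescing above (illustrative sequence numbers): if
 * SP was just extended to cover 150-300 and SWALK covers 100-200, then
 * SP's start falls inside SWALK, SP grows to 100-300, and SWALK is
 * removed by the slot-shifting loop.
 */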
1143 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1145 __u32 tmp;
1147 tmp = sack1->start_seq;
1148 sack1->start_seq = sack2->start_seq;
1149 sack2->start_seq = tmp;
1151 tmp = sack1->end_seq;
1152 sack1->end_seq = sack2->end_seq;
1153 sack2->end_seq = tmp;
1156 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1158 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1159 struct tcp_sack_block *sp = &tp->selective_acks[0];
1160 int cur_sacks = tp->num_sacks;
1162 if (!cur_sacks)
1163 goto new_sack;
1165 /* Optimize for the common case, new ofo frames arrive
1166 * "in order". ;-) This also satisfies the requirements
1167 * of RFC2018 about ordering of SACKs.
1169 if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1170 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1171 tcp_sack_maybe_coalesce(tp, sp);
1172 } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1173 /* Re-ordered arrival, in this case, can be optimized
1174 * as well.
1176 sp->start_seq = TCP_SKB_CB(skb)->seq;
1177 tcp_sack_maybe_coalesce(tp, sp);
1178 } else {
1179 struct tcp_sack_block *swap = sp + 1;
1180 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1182 /* Oh well, we have to move things around.
1183 * Try to find a SACK we can tack this onto.
1186 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1187 if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1188 (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1189 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1190 swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1191 else
1192 swap->start_seq = TCP_SKB_CB(skb)->seq;
1193 tcp_sack_swap(sp, swap);
1194 tcp_sack_maybe_coalesce(tp, sp);
1195 return;
1199 /* Could not find an adjacent existing SACK, build a new one,
1200 * put it at the front, and shift everyone else down. We
1201 * always know there is at least one SACK present already here.
1203 * If the sack array is full, forget about the last one.
1205 if (cur_sacks >= max_sacks) {
1206 cur_sacks--;
1207 tp->num_sacks--;
1209 while(cur_sacks >= 1) {
1210 struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1211 struct tcp_sack_block *prev = (this - 1);
1212 this->start_seq = prev->start_seq;
1213 this->end_seq = prev->end_seq;
1214 cur_sacks--;
1217 new_sack:
1218 /* Build the new head SACK, and we're done. */
1219 sp->start_seq = TCP_SKB_CB(skb)->seq;
1220 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1221 tp->num_sacks++;
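/* max_sacks above is 3 when timestamps are in use, presumably because
 * a SACK option takes 2 + 8*n bytes and must share the 40 byte option
 * space with the 12 byte timestamp block: three blocks (26 bytes) fit
 * in the remaining 28 bytes, a fourth would not.
 */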
1225 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1227 struct tcp_sack_block *sp = &tp->selective_acks[0];
1228 int num_sacks = tp->num_sacks;
1229 int this_sack;
1231 /* This is an in order data segment _or_ an out-of-order SKB being
1232 * moved to the receive queue, so we know this removed SKB will eat
1233 * from the front of a SACK.
1235 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1236 /* Check if the start of the sack is covered by skb. */
1237 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1238 before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1239 break;
1242 /* This should only happen if so many SACKs get built that some get
1243 * pushed out before we get here, or we eat some in sequence packets
1244 * which are before the first SACK block.
1246 if(this_sack >= num_sacks)
1247 return;
1249 sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1250 if(!before(sp->start_seq, sp->end_seq)) {
1251 /* Zap this SACK, by moving forward any other SACKS. */
1252 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1253 struct tcp_sack_block *next = (sp + 1);
1254 sp->start_seq = next->start_seq;
1255 sp->end_seq = next->end_seq;
1257 tp->num_sacks--;
1261 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1263 struct tcp_sack_block *sp = &tp->selective_acks[0];
1264 int num_sacks = tp->num_sacks;
1265 int this_sack;
1267 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1268 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1269 break;
1271 if(this_sack >= num_sacks)
1272 return;
1273 sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1276 /* This one checks to see if we can put data from the
1277 * out_of_order queue into the receive_queue.
1279 static void tcp_ofo_queue(struct sock *sk)
1281 struct sk_buff *skb;
1282 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1284 while ((skb = skb_peek(&tp->out_of_order_queue))) {
1285 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1286 break;
1288 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1289 SOCK_DEBUG(sk, "ofo packet was already received \n");
1290 __skb_unlink(skb, skb->list);
1291 kfree_skb(skb);
1292 continue;
1294 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1295 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1296 TCP_SKB_CB(skb)->end_seq);
1298 if(tp->sack_ok)
1299 tcp_sack_remove_skb(tp, skb);
1300 __skb_unlink(skb, skb->list);
1301 __skb_queue_tail(&sk->receive_queue, skb);
1302 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1303 if(skb->h.th->fin)
1304 tcp_fin(skb, sk, skb->h.th);
1308 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1310 struct sk_buff *skb1;
1311 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1313 /* Queue data for delivery to the user.
1314 * Packets in sequence go to the receive queue.
1315 * Out of sequence packets to out_of_order_queue.
1317 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1318 /* Ok. In sequence. */
1319 queue_and_out:
1320 dst_confirm(sk->dst_cache);
1321 __skb_queue_tail(&sk->receive_queue, skb);
1322 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1323 if(skb->h.th->fin) {
1324 tcp_fin(skb, sk, skb->h.th);
1325 } else {
1326 tcp_remember_ack(tp, skb->h.th, skb);
1328 /* This may have eaten into a SACK block. */
1329 if(tp->sack_ok && tp->num_sacks)
1330 tcp_sack_remove_skb(tp, skb);
1331 tcp_ofo_queue(sk);
1333 /* Turn on fast path. */
1334 if (skb_queue_len(&tp->out_of_order_queue) == 0)
1335 tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1336 (0x10 << 16) |
1337 tp->snd_wnd);
1338 return;
1341 /* An old packet, either a retransmit or some packet got lost. */
1342 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1343 /* A retransmit, 2nd most common case. Force an immediate ack. */
1344 SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1345 tcp_enter_quickack_mode(tp);
1346 kfree_skb(skb);
1347 return;
1350 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1351 /* Partial packet, seq < rcv_next < end_seq */
1352 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1353 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1354 TCP_SKB_CB(skb)->end_seq);
1356 goto queue_and_out;
1359 /* Ok. This is an out_of_order segment, force an ack. */
1360 tp->delayed_acks++;
1361 tcp_enter_quickack_mode(tp);
1363 /* Disable header prediction. */
1364 tp->pred_flags = 0;
1366 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1367 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1369 if (skb_peek(&tp->out_of_order_queue) == NULL) {
1370 /* Initial out of order segment, build 1 SACK. */
1371 if(tp->sack_ok) {
1372 tp->num_sacks = 1;
1373 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1374 tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1376 __skb_queue_head(&tp->out_of_order_queue,skb);
1377 } else {
1378 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1379 /* Already there. */
1380 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1381 if (skb->len >= skb1->len) {
1382 if(tp->sack_ok)
1383 tcp_sack_extend(tp, skb1, skb);
1384 __skb_append(skb1, skb);
1385 __skb_unlink(skb1, skb1->list);
1386 kfree_skb(skb1);
1387 } else {
1388 /* A duplicate, smaller than what is in the
1389 * out-of-order queue right now, toss it.
1391 kfree_skb(skb);
1393 break;
1396 if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1397 __skb_append(skb1, skb);
1398 if(tp->sack_ok)
1399 tcp_sack_new_ofo_skb(sk, skb);
1400 break;
1403 /* See if we've hit the start. If so insert. */
1404 if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1405 __skb_queue_head(&tp->out_of_order_queue,skb);
1406 if(tp->sack_ok)
1407 tcp_sack_new_ofo_skb(sk, skb);
1408 break;
1416 * This routine handles the data. If there is room in the buffer,
1417 * it will already have been moved into it. If there is no
1418 * room, then we will just have to discard the packet.
1421 static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1423 struct tcphdr *th;
1424 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1426 th = skb->h.th;
1427 skb_pull(skb, th->doff*4);
1428 skb_trim(skb, len - (th->doff*4));
1430 if (skb->len == 0 && !th->fin)
1431 return(0);
1434 * If our receive queue has grown past its limits shrink it.
1435 * Make sure to do this before moving snd_nxt, otherwise
1436 * data might be acked that we don't have enough room for.
1438 if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
1439 if (prune_queue(sk) < 0) {
1440 /* Still not enough room. That can happen when
1441 * skb->true_size differs significantly from skb->len.
1443 return 0;
1447 tcp_data_queue(sk, skb);
1449 if (before(tp->rcv_nxt, tp->copied_seq)) {
1450 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1451 tp->rcv_nxt = tp->copied_seq;
1454 /* Above, tcp_data_queue() increments delayed_acks appropriately.
1455 * Now tell the user we may have some data.
1457 if (!sk->dead) {
1458 SOCK_DEBUG(sk, "Data wakeup.\n");
1459 sk->data_ready(sk,0);
1461 return(1);
1464 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1466 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1468 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1469 tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1470 /* Put more data onto the wire. */
1471 tcp_write_xmit(sk);
1472 } else if (tp->packets_out == 0 && !tp->pending) {
1473 /* Start probing the receivers window. */
1474 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1478 static __inline__ void tcp_data_snd_check(struct sock *sk)
1480 struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1482 if (skb != NULL)
1483 __tcp_data_snd_check(sk, skb);
1487 * Adapt the MSS value used to make delayed ack decision to the
1488 * real world.
1490 static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1492 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1493 unsigned int len = skb->len, lss;
1495 if (len > tp->rcv_mss)
1496 tp->rcv_mss = len;
1497 lss = tp->last_seg_size;
1498 tp->last_seg_size = 0;
1499 if (len >= 536) {
1500 if (len == lss)
1501 tp->rcv_mss = len;
1502 tp->last_seg_size = len;
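/* The net effect appears to be: rcv_mss ratchets up to the largest
 * segment seen, and two consecutive segments of the same size (at
 * least 536 bytes) reset it to that size, so the delayed ACK decision
 * can also adapt downwards if the peer starts sending smaller
 * full-sized segments.
 */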
1507 * Check if sending an ack is needed.
1509 static __inline__ void __tcp_ack_snd_check(struct sock *sk)
1511 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1513 /* This also takes care of updating the window.
1514 * This if statement needs to be simplified.
1516 * Rules for delaying an ack:
1517 * - delay time <= 0.5 HZ
1518 * - we don't have a window update to send
1519 * - must send at least every 2 full sized packets
1520 * - must send an ACK if we have any out of order data
1522 * With an extra heuristic to handle loss of packet
1523 * situations and also helping the sender leave slow
1524 * start in an expedient manner.
1527 /* Two full frames received or... */
1528 if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1529 /* We will update the window "significantly" or... */
1530 tcp_raise_window(sk) ||
1531 /* We entered "quick ACK" mode or... */
1532 tcp_in_quickack_mode(tp) ||
1533 /* We have out of order data */
1534 (skb_peek(&tp->out_of_order_queue) != NULL)) {
1535 /* Then ack it now */
1536 tcp_send_ack(sk);
1537 } else {
1538 /* Else, send delayed ack. */
1539 tcp_send_delayed_ack(tp, HZ/2);
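/* For example, with rcv_mss around 1460 the first test above forces an
 * immediate ACK once roughly 2 * 1460 bytes are unacknowledged (the
 * "every 2 full sized packets" rule listed above); otherwise the ACK
 * is delayed by at most HZ/2.
 */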
1543 static __inline__ void tcp_ack_snd_check(struct sock *sk)
1545 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1546 if (tp->delayed_acks == 0) {
1547 /* We sent a data segment already. */
1548 return;
1550 __tcp_ack_snd_check(sk);
1555 * This routine is only called when we have urgent data
1556 * signalled. It's the 'slow' part of tcp_urg. It could be
1557 * moved inline now as tcp_urg is only called from one
1558 * place. We handle URGent data wrong. We have to - as
1559 * BSD still doesn't use the correction from RFC961.
1560 * For 1003.1g we should support a new option TCP_STDURG to permit
1561 * either form (or just set the sysctl tcp_stdurg).
1564 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1566 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1567 u32 ptr = ntohs(th->urg_ptr);
1569 if (ptr && !sysctl_tcp_stdurg)
1570 ptr--;
1571 ptr += ntohl(th->seq);
1573 /* Ignore urgent data that we've already seen and read. */
1574 if (after(tp->copied_seq, ptr))
1575 return;
1577 /* Do we already have a newer (or duplicate) urgent pointer? */
1578 if (tp->urg_data && !after(ptr, tp->urg_seq))
1579 return;
1581 /* Tell the world about our new urgent pointer. */
1582 if (sk->proc != 0) {
1583 if (sk->proc > 0)
1584 kill_proc(sk->proc, SIGURG, 1);
1585 else
1586 kill_pg(-sk->proc, SIGURG, 1);
1589 /* We may be adding urgent data when the last byte read was
1590 * urgent. To do this requires some care. We cannot just ignore
1591 * tp->copied_seq since we would read the last urgent byte again
1592 * as data, nor can we alter copied_seq until this data arrives
1593 * or we break the semantics of SIOCATMARK (and thus sockatmark())
1595 if (tp->urg_seq == tp->copied_seq)
1596 tp->copied_seq++; /* Move the copied sequence on correctly */
1597 tp->urg_data = URG_NOTYET;
1598 tp->urg_seq = ptr;
1600 /* Disable header prediction. */
1601 tp->pred_flags = 0;
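/* Note on the ptr-- above: with sysctl_tcp_stdurg left at 0 the urgent
 * pointer is taken in the BSD sense, pointing just past the urgent
 * byte, so decrementing it makes urg_seq the sequence number of the
 * urgent byte itself; with the sysctl set, the pointer is assumed to
 * already point at the last urgent byte.
 */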
1604 /* This is the 'fast' part of urgent handling. */
1605 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1607 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1609 /* Check if we get a new urgent pointer - normally not. */
1610 if (th->urg)
1611 tcp_check_urg(sk,th);
1613 /* Do we wait for any urgent data? - normally not... */
1614 if (tp->urg_data == URG_NOTYET) {
1615 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
1617 /* Is the urgent pointer pointing into this packet? */
1618 if (ptr < len) {
1619 tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1620 if (!sk->dead)
1621 sk->data_ready(sk,0);
1627 * Clean first the out_of_order queue, then the receive queue until
1628 * the socket is in its memory limits again.
1630 static int prune_queue(struct sock *sk)
1632 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1633 struct sk_buff * skb;
1635 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
1637 net_statistics.PruneCalled++;
1639 /* First Clean the out_of_order queue. */
1640 /* Start with the end because those are probably the least
1641 * useful packets (crossing fingers).
1643 while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) {
1644 net_statistics.OfoPruned += skb->len;
1645 kfree_skb(skb);
1646 if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
1647 return 0;
1650 /* Now continue with the receive queue if it wasn't enough.
1651 * But only do this if we are really being abused.
1653 while ((atomic_read(&sk->rmem_alloc) >= (sk->rcvbuf * 2)) &&
1654 (skb = skb_peek_tail(&sk->receive_queue))) {
1655 /* Never toss anything when we've seen the FIN.
1656 * It's just too complex to recover from it.
1658 if(skb->h.th->fin)
1659 break;
1661 /* Never remove packets that have already been acked */
1662 if (before(TCP_SKB_CB(skb)->end_seq, tp->last_ack_sent+1)) {
1663 SOCK_DEBUG(sk, "prune_queue: hit acked data c=%x,%x,%x\n",
1664 tp->copied_seq, TCP_SKB_CB(skb)->end_seq,
1665 tp->last_ack_sent);
1666 return -1;
1669 net_statistics.RcvPruned += skb->len;
1671 __skb_unlink(skb, skb->list);
1672 tp->rcv_nxt = TCP_SKB_CB(skb)->seq;
1673 SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n",
1674 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
1675 tp->copied_seq);
1676 kfree_skb(skb);
1678 return 0;
1682 * TCP receive function for the ESTABLISHED state.
1684 * It is split into a fast path and a slow path. The fast path is
1685 * disabled when:
1686 * - A zero window was announced from us - zero window probing
1687 * is only handled properly in the slow path.
1688 * - Out of order segments arrived.
1689 * - Urgent data is expected.
1690 * - There is no buffer space left
1691 * - Unexpected TCP flags/window values/header lengths are received
1692 * (detected by checking the TCP header against pred_flags)
1693 * - Data is sent in both directions. Fast path only supports pure senders
1694 * or pure receivers (this means either the sequence number or the ack
1695 * value must stay constant)
1697 * When these conditions are not satisfied it drops into a standard
1698 * receive procedure patterned after RFC793 to handle all cases.
1699 * The first three cases are guaranteed by proper pred_flags setting,
1700 * the rest is checked inline. Fast processing is turned on in
1701 * tcp_data_queue when everything is OK.
1703 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
1704 struct tcphdr *th, unsigned len)
1706 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1707 int queued;
1708 u32 flg;
1711 * Header prediction.
1712 * The code follows the one in the famous
1713 * "30 instruction TCP receive" Van Jacobson mail.
1715 * Van's trick is to deposit buffers into socket queue
1716 * on a device interrupt, to call tcp_recv function
1717 * on the receive process context and checksum and copy
1718 * the buffer to user space. smart...
1720 * Our current scheme is not silly either but we take the
1721 * extra cost of the net_bh soft interrupt processing...
1722 * We do checksum and copy also but from device to kernel.
1726 * RFC1323: H1. Apply PAWS check first.
1728 if (tcp_fast_parse_options(sk, th, tp)) {
1729 if (tp->saw_tstamp) {
1730 if (tcp_paws_discard(tp, th, len)) {
1731 if (!th->rst) {
1732 tcp_send_ack(sk);
1733 goto discard;
1736 tcp_replace_ts_recent(sk, tp,
1737 TCP_SKB_CB(skb)->seq,
1738 TCP_SKB_CB(skb)->end_seq);
1742 flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16);
1744 /* pred_flags is 0xS?10 << 16 + snd_wnd
1746 * if header prediction is to be made
1746 * 'S' will always be tp->tcp_header_len >> 2
1747 * '?' will be 0 else it will be !0
1748 * (when there are holes in the receive
1749 * space for instance)
1750 * PSH flag is ignored.
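/* For example, on a connection using timestamps tcp_header_len is 32,
 * so doff is 8 and pred_flags is htonl(0x80100000 | snd_wnd): header
 * length nibble 8, only the ACK bit (0x10) in the flags byte, and the
 * expected window in the low 16 bits.
 */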
1753 if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1754 if (len <= th->doff*4) {
1755 /* Bulk data transfer: sender */
1756 if (len == th->doff*4) {
1757 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
1758 TCP_SKB_CB(skb)->ack_seq, len);
1759 kfree_skb(skb);
1760 tcp_data_snd_check(sk);
1761 return 0;
1762 } else { /* Header too small */
1763 tcp_statistics.TcpInErrs++;
1764 goto discard;
1766 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
1767 atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
1768 /* Bulk data transfer: receiver */
1769 __skb_pull(skb,th->doff*4);
1771 tcp_measure_rcv_mss(sk, skb);
1773 /* DO NOT notify forward progress here.
1774 * It saves a dozen CPU instructions in the fast path. --ANK
1776 __skb_queue_tail(&sk->receive_queue, skb);
1777 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1779 /* FIN bit check is not done since if FIN is set in
1780 * this frame, the pred_flags won't match up. -DaveM
1782 sk->data_ready(sk, 0);
1783 tcp_delack_estimator(tp);
1785 tcp_remember_ack(tp, th, skb);
1787 __tcp_ack_snd_check(sk);
1788 return 0;
1793 * Standard slow path.
1796 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
1797 /* RFC793, page 37: "In all states except SYN-SENT, all reset
1798 * (RST) segments are validated by checking their SEQ-fields."
1799 * And page 69: "If an incoming segment is not acceptable,
1800 * an acknowledgment should be sent in reply (unless the RST bit
1801 * is set, if so drop the segment and return)".
1803 if (th->rst)
1804 goto discard;
1805 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1806 SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
1807 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
1808 tp->rcv_wup, tp->rcv_wnd);
1810 tcp_send_ack(sk);
1811 goto discard;
1814 if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
1815 SOCK_DEBUG(sk, "syn in established state\n");
1816 tcp_statistics.TcpInErrs++;
1817 tcp_reset(sk, skb);
1818 return 1;
1821 if(th->rst) {
1822 tcp_reset(sk,skb);
1823 goto discard;
1826 if(th->ack)
1827 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
1829 /* Process urgent data. */
1830 tcp_urg(sk, th, len);
1832 /* step 7: process the segment text */
1833 queued = tcp_data(skb, sk, len);
1835 /* This must be after tcp_data() does the skb_pull() to
1836 * remove the header size from skb->len.
1838 * Dave!!! Phrase above (and all about rcv_mss) has
1839 * nothing to do with reality. rcv_mss must measure TOTAL
1840 * size, including sacks, IP options etc. Hence, measure_rcv_mss
1841 * must occur before pulling etc, otherwise it will flap
1842 * like hell. Even putting it before tcp_data is wrong,
1843 * it should use skb->tail - skb->nh.raw instead.
1844 * --ANK (980805)
1846 * BTW I broke it. Now all TCP options are handled equally
1847 * in mss_clamp calculations (i.e. ignored, rfc1122),
1848 * and mss_cache does include all of them (i.e. tstamps)
1849 * except for sacks, to calculate effective mss faster.
1850 * --ANK (980805)
1852 tcp_measure_rcv_mss(sk, skb);
1854 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
1855 if(sk->state != TCP_CLOSE) {
1856 tcp_data_snd_check(sk);
1857 tcp_ack_snd_check(sk);
1860 if (!queued) {
1861 discard:
1862 kfree_skb(skb);
1865 return 0;
1869 * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented
1870 * as an open_request.
1873 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
1874 struct open_request *req)
1876 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1877 u32 flg;
1879 /* assumption: the socket is not in use.
1880 * as we checked the user count on tcp_rcv and we're
1881 * running from a soft interrupt.
1884 /* Check for syn retransmission */
1885 flg = *(((u32 *)skb->h.th) + 3);
1887 flg &= __constant_htonl(0x00170000);
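/* The mask keeps only FIN|SYN|RST|ACK (0x17 = 0x01|0x02|0x04|0x10)
 * from the flags byte; the comparison below against 0x00020000
 * therefore tests for a bare SYN.
 */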
1888 /* Only SYN set? */
1889 if (flg == __constant_htonl(0x00020000)) {
1890 if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
1891 /* retransmitted syn.
1893 req->class->rtx_syn_ack(sk, req);
1894 return NULL;
1895 } else {
1896 return sk; /* Pass new SYN to the listen socket. */
1900 /* We know it's an ACK here */
1901 if (req->sk) {
1902 /* socket already created but not
1903 * yet accepted()...
1905 sk = req->sk;
1906 } else {
1907 /* In theory the packet could be for a cookie, but
1908 * TIME_WAIT should guard us against this.
1909 * XXX: Nevertheless check for cookies?
1910 * This sequence number check is done again later,
1911 * but we do it here to prevent syn flood attackers
1912 * from creating big SYN_RECV sockets.
1914 if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
1915 !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
1916 req->rcv_isn+1+req->rcv_wnd)) {
1917 req->class->send_reset(skb);
1918 return NULL;
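/* The ACK is in range: it acknowledges nothing beyond our SYN
 * (at most snt_isn+1) and its sequence number falls inside the
 * window we advertised for this request, so let the address-family
 * code build the child socket.
 */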
1921 sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
1922 tcp_dec_slow_timer(TCP_SLT_SYNACK);
1923 if (sk == NULL)
1924 return NULL;
1926 req->expires = 0UL;
1927 req->sk = sk;
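/* Drop whatever owned this buffer before and charge it to the
 * newly created socket instead.
 */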
1929 skb_orphan(skb);
1930 skb_set_owner_r(skb, sk);
1931 return sk;
1935 * This function implements the receiving procedure of RFC 793 for
1936 * all states except ESTABLISHED and TIME_WAIT.
1937 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
1938 * address independent.
1941 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
1942 struct tcphdr *th, unsigned len)
1944 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1945 int queued = 0;
1947 /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */
1948 switch (sk->state) {
1949 case TCP_LISTEN:
1950 /* These use the socket TOS;
1951 * they might want to use the received TOS instead.
1953 if(th->ack)
1954 return 1;
1956 if(th->syn) {
1957 if(tp->af_specific->conn_request(sk, skb, 0) < 0)
1958 return 1;
1960 /* Now we have several options: In theory there is
1961 * nothing else in the frame. KA9Q has an option to
1962 * send data with the syn, BSD accepts data with the
1963 * syn up to the [to be] advertised window and
1964 * Solaris 2.1 gives you a protocol error. For now
1965 * we just ignore it, which fits the spec precisely
1966 * and avoids incompatibilities. It would be nice in
1967 * future to drop through and process the data.
1969 * Now that TTCP is starting to be used we ought to
1970 * queue this data.
1971 * But, this leaves one open to an easy denial of
1972 * service attack, and SYN cookies can't defend
1973 * against this problem. So, we drop the data
1974 * in the interest of security over speed.
1976 goto discard;
1979 goto discard;
1980 break;
1982 case TCP_SYN_SENT:
1983 /* SYN sent means we have to look for a suitable ack and
1984 * either reset for bad matches or go to connected.
1985 * The SYN_SENT case is unusual and should
1986 * not be in the in-line code. [AC]
1988 if(th->ack) {
1989 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
1991 /* We got an ack, but it's not a good ack. */
1992 if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
1993 TCP_SKB_CB(skb)->ack_seq, len)) {
1994 sk->err = ECONNRESET;
1995 sk->state_change(sk);
1996 tcp_statistics.TcpAttemptFails++;
1997 return 1;
2000 if(th->rst) {
2001 tcp_reset(sk,skb);
2002 goto discard;
2005 if(!th->syn) {
2006 /* A valid ack, but from a different connection
2007 * attempt. Shouldn't happen, but cover it.
2009 sk->err = ECONNRESET;
2010 sk->state_change(sk);
2011 tcp_statistics.TcpAttemptFails++;
2012 return 1;
2015 /* Ok.. it's good. Set up sequence numbers and
2016 * move to established.
2018 tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2019 tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2021 /* RFC1323: The window in SYN & SYN/ACK segments is
2022 * never scaled.
2024 tp->snd_wnd = htons(th->window);
2025 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2026 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
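/* snd_wl1/snd_wl2 record the seq/ack of the segment that last
 * updated snd_wnd (SND.WL1/SND.WL2 in RFC 793); later ACK
 * processing uses them to decide when the window may be updated.
 */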
2027 tp->fin_seq = TCP_SKB_CB(skb)->seq;
2029 tcp_set_state(sk, TCP_ESTABLISHED);
2030 tcp_parse_options(sk, th, tp, 0);
2032 if (tp->wscale_ok == 0) {
2033 tp->snd_wscale = tp->rcv_wscale = 0;
2034 tp->window_clamp = min(tp->window_clamp,65535);
2037 if (tp->tstamp_ok) {
2038 tp->tcp_header_len =
2039 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2040 } else
2041 tp->tcp_header_len = sizeof(struct tcphdr);
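/* When timestamps were negotiated, every later segment carries the
 * 12-byte aligned (NOP,NOP,TIMESTAMP) option of RFC 1323, which is
 * what the larger expected header length above accounts for.
 */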
2042 if (tp->saw_tstamp) {
2043 tp->ts_recent = tp->rcv_tsval;
2044 tp->ts_recent_stamp = jiffies;
2047 /* Can't be earlier, doff would be wrong. */
2048 tcp_send_ack(sk);
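/* (The ACK's data offset presumably comes from tp->tcp_header_len,
 * which was only set just above.)
 */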
2050 sk->dport = th->source;
2051 tp->copied_seq = tp->rcv_nxt;
2053 if(!sk->dead) {
2054 sk->state_change(sk);
2055 sock_wake_async(sk->socket, 0);
2057 } else {
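/* No ACK, but a SYN: this is a simultaneous open (RFC 793).
 * Record the peer's ISN, move to SYN_RECV and answer with our
 * own SYN-ACK.
 */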
2058 if(th->syn && !th->rst) {
2059 /* The previous version of the code
2060 * checked for "connecting to self"
2061 * here. That check is now done in
2062 * tcp_connect.
2064 tcp_set_state(sk, TCP_SYN_RECV);
2065 tcp_parse_options(sk, th, tp, 0);
2066 if (tp->saw_tstamp) {
2067 tp->ts_recent = tp->rcv_tsval;
2068 tp->ts_recent_stamp = jiffies;
2071 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2072 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2074 /* RFC1323: The window in SYN & SYN/ACK segments is
2075 * never scaled.
2077 tp->snd_wnd = htons(th->window);
2078 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2080 tcp_send_synack(sk);
2081 } else
2082 break;
2085 /* tp->tcp_header_len and tp->mss_clamp
2086 probably changed; synchronize the mss.
2088 tcp_sync_mss(sk, tp->pmtu_cookie);
2089 tp->rcv_mss = tp->mss_cache;
2091 if (sk->state == TCP_SYN_RECV)
2092 goto discard;
2094 goto step6;
2097 /* Parse the tcp_options present on this header.
2098 * By this point we really only expect timestamps.
2099 * Note that this really has to be here and not later for PAWS
2100 * (RFC1323) to work.
2102 if (tcp_fast_parse_options(sk, th, tp)) {
2103 /* NOTE: assumes saw_tstamp is never set if we didn't
2104 * negotiate the option. tcp_fast_parse_options() must
2105 * guarantee this.
2107 if (tp->saw_tstamp) {
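/* PAWS (RFC 1323): a segment whose timestamp is older than
 * ts_recent is treated as an old duplicate; drop it, but still
 * ACK it unless it carries a RST. Otherwise ts_recent may be
 * advanced below.
 */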
2108 if (tcp_paws_discard(tp, th, len)) {
2109 if (!th->rst) {
2110 tcp_send_ack(sk);
2111 goto discard;
2114 tcp_replace_ts_recent(sk, tp,
2115 TCP_SKB_CB(skb)->seq,
2116 TCP_SKB_CB(skb)->end_seq);
2120 /* step 1: check sequence number */
2121 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
2122 if (!th->rst) {
2123 tcp_send_ack(sk);
2124 goto discard;
2128 /* step 2: check RST bit */
2129 if(th->rst) {
2130 tcp_reset(sk,skb);
2131 goto discard;
2134 /* step 3: check security and precedence [ignored] */
2136 /* step 4:
2138 * Check for a SYN, and ensure it matches the SYN we were
2139 * first sent. We have to handle the rather unusual (but valid)
2140 * sequence that KA9Q derived products may generate of
2142 * SYN
2143 * SYN|ACK Data
2144 * ACK (lost)
2145 * SYN|ACK Data + More Data
2146 * .. we must ACK not RST...
2148 * We keep syn_seq as the sequence space occupied by the
2149 * original syn.
2152 if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2153 tcp_reset(sk, skb);
2154 return 1;
2157 /* step 5: check the ACK field */
2158 if (th->ack) {
2159 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2160 TCP_SKB_CB(skb)->ack_seq, len);
2162 switch(sk->state) {
2163 case TCP_SYN_RECV:
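/* The ACK of our SYN-ACK completes the three-way handshake:
 * the passive open becomes ESTABLISHED and the send variables
 * are seeded from this segment.
 */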
2164 if (acceptable) {
2165 tcp_set_state(sk, TCP_ESTABLISHED);
2166 sk->dport = th->source;
2167 tp->copied_seq = tp->rcv_nxt;
2169 if(!sk->dead)
2170 sk->state_change(sk);
2172 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
2173 tp->snd_wnd = htons(th->window) << tp->snd_wscale;
2174 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2175 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2177 } else {
2178 SOCK_DEBUG(sk, "bad ack\n");
2179 return 1;
2181 break;
2183 case TCP_FIN_WAIT1:
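/* Our FIN is acked once snd_una catches up with write_seq; then
 * move to FIN_WAIT2. If the application already closed the
 * socket (sk->dead), arm the timer so a peer that never sends
 * its FIN cannot keep us in FIN_WAIT2 forever.
 */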
2184 if (tp->snd_una == tp->write_seq) {
2185 sk->shutdown |= SEND_SHUTDOWN;
2186 tcp_set_state(sk, TCP_FIN_WAIT2);
2187 if (!sk->dead)
2188 sk->state_change(sk);
2189 else
2190 tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
2192 break;
2194 case TCP_CLOSING:
2195 if (tp->snd_una == tp->write_seq) {
2196 tcp_time_wait(sk);
2197 goto discard;
2199 break;
2201 case TCP_LAST_ACK:
2202 if (tp->snd_una == tp->write_seq) {
2203 sk->shutdown = SHUTDOWN_MASK;
2204 tcp_set_state(sk,TCP_CLOSE);
2205 if (!sk->dead)
2206 sk->state_change(sk);
2207 goto discard;
2209 break;
2211 } else
2212 goto discard;
2214 step6:
2215 /* step 6: check the URG bit */
2216 tcp_urg(sk, th, len);
2218 /* step 7: process the segment text */
2219 switch (sk->state) {
2220 case TCP_CLOSE_WAIT:
2221 case TCP_CLOSING:
2222 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
2223 break;
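/* Data that starts before the peer's FIN may still be valid;
 * fall through and treat it like normal segment text.
 */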
2225 case TCP_FIN_WAIT1:
2226 case TCP_FIN_WAIT2:
2227 /* RFC 793 says to queue data in these states,
2228 * RFC 1122 says we MUST send a reset.
2229 * BSD 4.4 also does reset.
2231 if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
2232 if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
2233 tcp_reset(sk, skb);
2234 return 1;
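/* Otherwise fall through and queue the segment text as usual. */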
2238 case TCP_ESTABLISHED:
2239 queued = tcp_data(skb, sk, len);
2241 /* This must be after tcp_data() does the skb_pull() to
2242 * remove the header size from skb->len.
2244 tcp_measure_rcv_mss(sk, skb);
2245 break;
2248 tcp_data_snd_check(sk);
2249 tcp_ack_snd_check(sk);
2251 if (!queued) {
2252 discard:
2253 kfree_skb(skb);
2255 return 0;