net/ipv4/tcp_input.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_input.c,v 1.153 1999/01/20 07:20:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  */
  22
  23 /*
  24  * Changes:
  25  *              Pedro Roque     :       Fast Retransmit/Recovery.
  26  *                                      Two receive queues.
  27  *                                      Retransmit queue handled by TCP.
  28  *                                      Better retransmit timer handling.
  29  *                                      New congestion avoidance.
  30  *                                      Header prediction.
  31  *                                      Variable renaming.
  32  *
  33  *              Eric            :       Fast Retransmit.
  34  *              Randy Scott     :       MSS option defines.
  35  *              Eric Schenk     :       Fixes to slow start algorithm.
  36  *              Eric Schenk     :       Yet another double ACK bug.
  37  *              Eric Schenk     :       Delayed ACK bug fixes.
  38  *              Eric Schenk     :       Floyd style fast retrans war avoidance.
  39  *              David S. Miller :       Don't allow zero congestion window.
  40  *              Eric Schenk     :       Fix retransmitter so that it sends
  41  *                                      next packet on ack of previous packet.
  42  *              Andi Kleen      :       Moved open_request checking here
  43  *                                      and process RSTs for open_requests.
  44  *              Andi Kleen      :       Better prune_queue, and other fixes.
  45  *              Andrey Savochkin:       Fix RTT measurements in the presnce of
  46  *                                      timestamps.
  47  *              Andrey Savochkin:       Check sequence numbers correctly when
  48  *                                      removing SACKs due to in sequence incoming
  49  *                                      data segments.
  50  *              Andi Kleen:             Make sure we never ack data there is not
  51  *                                      enough room for. Also make this condition
  52  *                                      a fatal error if it might still happen.
  53  *              Andi Kleen:             Add tcp_measure_rcv_mss to make
  54  *                                      connections with MSS<min(MTU,ann. MSS)
  55  *                                      work without delayed acks.
  56  *              Andi Kleen:             Process packets with PSH set in the
  57  *                                      fast path.
  58  */
  59
  60 #include <linux/config.h>
  61 #include <linux/mm.h>
  62 #include <linux/sysctl.h>
  63 #include <net/tcp.h>
  64 #include <linux/ipsec.h>
  65
  66 #ifdef CONFIG_SYSCTL
  67 #define SYNC_INIT 0 /* let the user enable it */
  68 #else
  69 #define SYNC_INIT 1
  70 #endif
  71
  72 extern int sysctl_tcp_fin_timeout;
  73
  74 /* These are on by default so the code paths get tested.
  75  * For the final 2.2 this may be undone at our discretion. -DaveM
  76  */
  77 int sysctl_tcp_timestamps = 1;
  78 int sysctl_tcp_window_scaling = 1;
  79 int sysctl_tcp_sack = 1;
  80
  81 int sysctl_tcp_syncookies = SYNC_INIT;
  82 int sysctl_tcp_stdurg;
  83 int sysctl_tcp_rfc1337;
  84
  85 static int prune_queue(struct sock *sk);
  86
  87 /* There is something which you must keep in mind when you analyze the
  88  * behavior of the tp->ato delayed ack timeout interval.  When a
  89  * connection starts up, we want to ack as quickly as possible.  The
  90  * problem is that "good" TCP's do slow start at the beginning of data
  91  * transmission.  The means that until we send the first few ACK's the
  92  * sender will sit on his end and only queue most of his data, because
  93  * he can only send snd_cwnd unacked packets at any given time.  For
  94  * each ACK we send, he increments snd_cwnd and transmits more of his
  95  * queue.  -DaveM
  96  */
  97 static void tcp_delack_estimator(struct tcp_opt *tp)
  98 {
  99         if(tp->ato == 0) {
 100                 tp->lrcvtime = jiffies;
 101
 102                 /* Help sender leave slow start quickly,
 103                  * and also makes sure we do not take this
 104                  * branch ever again for this connection.
 105                  */
 106                 tp->ato = 1;
 107                 tcp_enter_quickack_mode(tp);
 108         } else {
 109                 int m = jiffies - tp->lrcvtime;
 110
 111                 tp->lrcvtime = jiffies;
 112                 if(m <= 0)
 113                         m = 1;
 114                 if(m > tp->rto)
 115                         tp->ato = tp->rto;
 116                 else {
 117                         /* This funny shift makes sure we
 118                          * clear the "quick ack mode" bit.
 119                          */
 120                         tp->ato = ((tp->ato << 1) >> 2) + m;
 121                 }
 122         }
 123 }
 124
 125 /*
 126  * Remember to send an ACK later.
 127  */
 128 static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
 129                                         struct sk_buff *skb)
 130 {
 131         tp->delayed_acks++;
 132
 133         /* Tiny-grams with PSH set make us ACK quickly.
 134          * Note: This also clears the "quick ack mode" bit.
 135          */
 136         if(th->psh && (skb->len < (tp->mss_cache >> 1)))
 137                 tp->ato = HZ/50;
 138 }
 139
 140 /* Called to compute a smoothed rtt estimate. The data fed to this
 141  * routine either comes from timestamps, or from segments that were
 142  * known _not_ to have been retransmitted [see Karn/Partridge
 143  * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 144  * piece by Van Jacobson.
 145  * NOTE: the next three routines used to be one big routine.
 146  * To save cycles in the RFC 1323 implementation it was better to break
 147  * it up into three procedures. -- erics
 148  */
 149
 150 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
 151 {
 152         long m = mrtt; /* RTT */
 153
 154         /*      The following amusing code comes from Jacobson's
 155          *      article in SIGCOMM '88.  Note that rtt and mdev
 156          *      are scaled versions of rtt and mean deviation.
 157          *      This is designed to be as fast as possible
 158          *      m stands for "measurement".
 159          *
 160          *      On a 1990 paper the rto value is changed to:
 161          *      RTO = rtt + 4 * mdev
 162          */
 163         if(m == 0)
 164                 m = 1;
 165         if (tp->srtt != 0) {
 166                 m -= (tp->srtt >> 3);   /* m is now error in rtt est */
 167                 tp->srtt += m;          /* rtt = 7/8 rtt + 1/8 new */
 168                 if (m < 0)
 169                         m = -m;         /* m is now abs(error) */
 170                 m -= (tp->mdev >> 2);   /* similar update on mdev */
 171                 tp->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
 172         } else {
 173                 /* no previous measure. */
 174                 tp->srtt = m<<3;        /* take the measured time to be rtt */
 175                 tp->mdev = m<<2;        /* make sure rto = 3*rtt */
 176         }
 177 }
 178
 179 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
 180  * routine referred to above.
 181  */
 182
 183 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
 184 {
 185         tp->rto = (tp->srtt >> 3) + tp->mdev;
 186         tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
 187 }
 188
 189
 190 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
 191  * on packet lifetime in the internet. We need the HZ/5 lower
 192  * bound to behave correctly against BSD stacks with a fixed
 193  * delayed ack.
 194  * FIXME: It's not entirely clear this lower bound is the best
 195  * way to avoid the problem. Is it possible to drop the lower
 196  * bound and still avoid trouble with BSD stacks? Perhaps
 197  * some modification to the RTO calculation that takes delayed
 198  * ack bias into account? This needs serious thought. -- erics
 199  */
 200 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
 201 {
 202         if (tp->rto > 120*HZ)
 203                 tp->rto = 120*HZ;
 204         if (tp->rto < HZ/5)
 205                 tp->rto = HZ/5;
 206 }
 207
 208 /* WARNING: this must not be called if tp->saw_timestamp was false. */
 209 extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
 210                                              __u32 start_seq, __u32 end_seq)
 211 {
 212         /* From draft-ietf-tcplw-high-performance: the correct
 213          * test is last_ack_sent <= end_seq.
 214          * (RFC1323 stated last_ack_sent < end_seq.)
 215          *
 216          * HOWEVER: The current check contradicts the draft statements.
 217          * It has been done for good reasons.
 218          * The implemented check improves security and eliminates
 219          * unnecessary RTT overestimation.
 220          *              1998/06/27  Andrey V. Savochkin <saw@msu.ru>
 221          */
 222         if (!before(end_seq, tp->last_ack_sent - sk->rcvbuf) &&
 223             !after(start_seq, tp->rcv_wup + tp->rcv_wnd)) {
 224                 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
 225                  * extra check below makes sure this can only happen
 226                  * for pure ACK frames.  -DaveM
 227                  */
 228                 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
 229                         tp->ts_recent = tp->rcv_tsval;
 230                         tp->ts_recent_stamp = jiffies;
 231                 }
 232         }
 233 }
 234
 235 #define PAWS_24DAYS     (HZ * 60 * 60 * 24 * 24)
 236
 237 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
 238 {
 239         /* ts_recent must be younger than 24 days */
 240         return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
 241                 (((s32)(tp->rcv_tsval-tp->ts_recent) < 0) &&
 242                  /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
 243                  (len != (th->doff * 4))));
 244 }
 245
 246
 247 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
 248 {
 249         u32 end_window = tp->rcv_wup + tp->rcv_wnd;
 250
 251         if (tp->rcv_wnd &&
 252             after(end_seq, tp->rcv_nxt) &&
 253             before(seq, end_window))
 254                 return 1;
 255         if (seq != end_window)
 256                 return 0;
 257         return (seq == end_seq);
 258 }
 259
 260 /* This functions checks to see if the tcp header is actually acceptable. */
 261 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
 262 {
 263         if (seq == tp->rcv_nxt)
 264                 return (tp->rcv_wnd || (end_seq == seq));
 265
 266         return __tcp_sequence(tp, seq, end_seq);
 267 }
 268
 269 /* When we get a reset we do this. */
 270 static void tcp_reset(struct sock *sk)
 271 {
 272         sk->zapped = 1;
 273
 274         /* We want the right error as BSD sees it (and indeed as we do). */
 275         switch (sk->state) {
 276                 case TCP_SYN_SENT:
 277                         sk->err = ECONNREFUSED;
 278                         break;
 279                 case TCP_CLOSE_WAIT:
 280                         sk->err = EPIPE;
 281                         break;
 282                 default:
 283                         sk->err = ECONNRESET;
 284         };
 285         tcp_set_state(sk, TCP_CLOSE);
 286         sk->shutdown = SHUTDOWN_MASK;
 287         if (!sk->dead)
 288                 sk->state_change(sk);
 289 }
 290
 291 /* This tags the retransmission queue when SACKs arrive. */
 292 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
 293 {
 294         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 295         int i = nsacks;
 296
 297         while(i--) {
 298                 struct sk_buff *skb = skb_peek(&sk->write_queue);
 299                 __u32 start_seq = ntohl(sp->start_seq);
 300                 __u32 end_seq = ntohl(sp->end_seq);
 301                 int fack_count = 0;
 302
 303                 while((skb != NULL) &&
 304                       (skb != tp->send_head) &&
 305                       (skb != (struct sk_buff *)&sk->write_queue)) {
 306                         /* The retransmission queue is always in order, so
 307                          * we can short-circuit the walk early.
 308                          */
 309                         if(after(TCP_SKB_CB(skb)->seq, end_seq))
 310                                 break;
 311
 312                         /* We play conservative, we don't allow SACKS to partially
 313                          * tag a sequence space.
 314                          */
 315                         fack_count++;
 316                         if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
 317                            !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
 318                                 /* If this was a retransmitted frame, account for it. */
 319                                 if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
 320                                    tp->retrans_out)
 321                                         tp->retrans_out--;
 322                                 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
 323
 324                                 /* RULE: All new SACKs will either decrease retrans_out
 325                                  *       or advance fackets_out.
 326                                  */
 327                                 if(fack_count > tp->fackets_out)
 328                                         tp->fackets_out = fack_count;
 329                         }
 330                         skb = skb->next;
 331                 }
 332                 sp++; /* Move on to the next SACK block. */
 333         }
 334 }
 335
 336 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
 337  * But, this can also be called on packets in the established flow when
 338  * the fast version below fails.
 339  */
 340 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
 341 {
 342         unsigned char *ptr;
 343         int length=(th->doff*4)-sizeof(struct tcphdr);
 344
 345         ptr = (unsigned char *)(th + 1);
 346         tp->saw_tstamp = 0;
 347
 348         while(length>0) {
 349                 int opcode=*ptr++;
 350                 int opsize;
 351
 352                 switch (opcode) {
 353                         case TCPOPT_EOL:
 354                                 return;
 355                         case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 356                                 length--;
 357                                 continue;
 358                         default:
 359                                 opsize=*ptr++;
 360                                 if (opsize < 2) /* "silly options" */
 361                                         return;
 362                                 if (opsize > length)
 363                                         break;  /* don't parse partial options */
 364                                 switch(opcode) {
 365                                 case TCPOPT_MSS:
 366                                         if(opsize==TCPOLEN_MSS && th->syn) {
 367                                                 u16 in_mss = ntohs(*(__u16 *)ptr);
 368                                                 if (in_mss == 0)
 369                                                         in_mss = 536;
 370                                                 if (tp->mss_clamp > in_mss)
 371                                                         tp->mss_clamp = in_mss;
 372                                         }
 373                                         break;
 374                                 case TCPOPT_WINDOW:
 375                                         if(opsize==TCPOLEN_WINDOW && th->syn)
 376                                                 if (!no_fancy && sysctl_tcp_window_scaling) {
 377                                                         tp->wscale_ok = 1;
 378                                                         tp->snd_wscale = *(__u8 *)ptr;
 379                                                         if(tp->snd_wscale > 14) {
 380                                                                 if(net_ratelimit())
 381                                                                         printk("tcp_parse_options: Illegal window "
 382                                                                                "scaling value %d >14 received.",
 383                                                                                tp->snd_wscale);
 384                                                                 tp->snd_wscale = 14;
 385                                                         }
 386                                                 }
 387                                         break;
 388                                 case TCPOPT_TIMESTAMP:
 389                                         if(opsize==TCPOLEN_TIMESTAMP) {
 390                                                 if (sysctl_tcp_timestamps && !no_fancy) {
 391                                                         tp->tstamp_ok = 1;
 392                                                         tp->saw_tstamp = 1;
 393                                                         tp->rcv_tsval = ntohl(*(__u32 *)ptr);
 394                                                         tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
 395                                                 }
 396                                         }
 397                                         break;
 398                                 case TCPOPT_SACK_PERM:
 399                                         if(opsize==TCPOLEN_SACK_PERM && th->syn) {
 400                                                 if (sysctl_tcp_sack && !no_fancy) {
 401                                                         tp->sack_ok = 1;
 402                                                         tp->num_sacks = 0;
 403                                                 }
 404                                         }
 405                                         break;
 406
 407                                 case TCPOPT_SACK:
 408                                         if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
 409                                            sysctl_tcp_sack && (sk != NULL) && !th->syn) {
 410                                                 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
 411
 412                                                 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
 413                                                         int num_sacks = sack_bytes >> 3;
 414                                                         struct tcp_sack_block *sackp;
 415
 416                                                         sackp = (struct tcp_sack_block *)ptr;
 417                                                         tcp_sacktag_write_queue(sk, sackp, num_sacks);
 418                                                 }
 419                                         }
 420                                 };
 421                                 ptr+=opsize-2;
 422                                 length-=opsize;
 423                 };
 424         }
 425 }
 426
 427 /* Fast parse options. This hopes to only see timestamps.
 428  * If it is wrong it falls back on tcp_parse_options().
 429  */
 430 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
 431 {
 432         /* If we didn't send out any options ignore them all. */
 433         if (tp->tcp_header_len == sizeof(struct tcphdr))
 434                 return 0;
 435         if (th->doff == sizeof(struct tcphdr)>>2) {
 436                 tp->saw_tstamp = 0;
 437                 return 0;
 438         } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
 439                 __u32 *ptr = (__u32 *)(th + 1);
 440                 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
 441                                              | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
 442                         tp->saw_tstamp = 1;
 443                         tp->rcv_tsval = ntohl(*++ptr);
 444                         tp->rcv_tsecr = ntohl(*++ptr);
 445                         return 1;
 446                 }
 447         }
 448         tcp_parse_options(sk, th, tp, 0);
 449         return 1;
 450 }
 451
 452 #define FLAG_DATA               0x01 /* Incoming frame contained data.          */
 453 #define FLAG_WIN_UPDATE         0x02 /* Incoming ACK was a window update.       */
 454 #define FLAG_DATA_ACKED         0x04 /* This ACK acknowledged new data.         */
 455 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted.  */
 456
 457 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
 458 {
 459         if (tp->dup_acks > 3)
 460                 tp->snd_cwnd = (tp->snd_ssthresh);
 461
 462         tp->dup_acks = 0;
 463 }
 464
 465 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
 466  * retransmit timer fires.
 467  */
 468 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
 469 {
 470         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 471
 472         /* Note: If not_dup is set this implies we got a
 473          * data carrying packet or a window update.
 474          * This carries no new information about possible
 475          * lost packets, so we have to ignore it for the purposes
 476          * of counting duplicate acks. Ideally this does not imply we
 477          * should stop our fast retransmit phase, more acks may come
 478          * later without data to help us. Unfortunately this would make
 479          * the code below much more complex. For now if I see such
 480          * a packet I clear the fast retransmit phase.
 481          */
 482         if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
 483                 /* This is the standard reno style fast retransmit branch. */
 484
 485                 /* 1. When the third duplicate ack is received, set ssthresh
 486                  * to one half the current congestion window, but no less
 487                  * than two segments. Retransmit the missing segment.
 488                  */
 489                 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
 490                         tp->dup_acks++;
 491                         if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
 492                                 tp->snd_ssthresh =
 493                                         max(min(tp->snd_wnd, tp->snd_cwnd) >> 1, 2);
 494                                 tp->snd_cwnd = (tp->snd_ssthresh + 3);
 495                                 tp->high_seq = tp->snd_nxt;
 496                                 if(!tp->fackets_out)
 497                                         tcp_retransmit_skb(sk,
 498                                                            skb_peek(&sk->write_queue));
 499                                 else
 500                                         tcp_fack_retransmit(sk);
 501                                 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 502                         }
 503                 } else if (++tp->dup_acks > 3) {
 504                         /* 2. Each time another duplicate ACK arrives, increment
 505                          * cwnd by the segment size. [...] Transmit a packet...
 506                          *
 507                          * Packet transmission will be done on normal flow processing
 508                          * since we're not in "retransmit mode".  We do not use
 509                          * duplicate ACKs to artificially inflate the congestion
 510                          * window when doing FACK.
 511                          */
 512                         if(!tp->fackets_out) {
 513                                 tp->snd_cwnd++;
 514                         } else {
 515                                 /* Fill any further holes which may have
 516                                  * appeared.
 517                                  *
 518                                  * We may want to change this to run every
 519                                  * further multiple-of-3 dup ack increments,
 520                                  * to be more robust against out-of-order
 521                                  * packet delivery.  -DaveM
 522                                  */
 523                                 tcp_fack_retransmit(sk);
 524                         }
 525                 }
 526         } else if (tp->high_seq != 0) {
 527                 /* In this branch we deal with clearing the Floyd style
 528                  * block on duplicate fast retransmits, and if requested
 529                  * we do Hoe style secondary fast retransmits.
 530                  */
 531                 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
 532                         /* Once we have acked all the packets up to high_seq
 533                          * we are done this fast retransmit phase.
 534                          * Alternatively data arrived. In this case we
 535                          * Have to abort the fast retransmit attempt.
 536                          * Note that we do want to accept a window
 537                          * update since this is expected with Hoe's algorithm.
 538                          */
 539                         clear_fast_retransmit(tp);
 540
 541                         /* After we have cleared up to high_seq we can
 542                          * clear the Floyd style block.
 543                          */
 544                         if (!before(ack, tp->high_seq)) {
 545                                 tp->high_seq = 0;
 546                                 tp->fackets_out = 0;
 547                         }
 548                 } else if (tp->dup_acks >= 3) {
 549                         if (!tp->fackets_out) {
 550                                 /* Hoe Style. We didn't ack the whole
 551                                  * window. Take this as a cue that
 552                                  * another packet was lost and retransmit it.
 553                                  * Don't muck with the congestion window here.
 554                                  * Note that we have to be careful not to
 555                                  * act if this was a window update and it
 556                                  * didn't ack new data, since this does
 557                                  * not indicate a packet left the system.
 558                                  * We can test this by just checking
 559                                  * if ack changed from snd_una, since
 560                                  * the only way to get here without advancing
 561                                  * from snd_una is if this was a window update.
 562                                  */
 563                                 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
 564                                         tcp_retransmit_skb(sk,
 565                                                            skb_peek(&sk->write_queue));
 566                                         tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 567                                 }
 568                         } else {
 569                                 /* FACK style, fill any remaining holes in
 570                                  * receiver's queue.
 571                                  */
 572                                 tcp_fack_retransmit(sk);
 573                         }
 574                 }
 575         }
 576 }
 577
 578 /* This is Jacobson's slow start and congestion avoidance.
 579  * SIGCOMM '88, p. 328.
 580  */
 581 static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
 582 {
 583         if (tp->snd_cwnd <= tp->snd_ssthresh) {
 584                 /* In "safe" area, increase. */
 585                 tp->snd_cwnd++;
 586         } else {
 587                 /* In dangerous area, increase slowly.
 588                  * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
 589                  */
 590                 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
 591                         tp->snd_cwnd++;
 592                         tp->snd_cwnd_cnt=0;
 593                 } else
 594                         tp->snd_cwnd_cnt++;
 595         }
 596 }
 597
 598 /* Remove acknowledged frames from the retransmission queue. */
 599 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
 600                                __u32 *seq, __u32 *seq_rtt)
 601 {
 602         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 603         struct sk_buff *skb;
 604         unsigned long now = jiffies;
 605         int acked = 0;
 606
 607         /* If we are retransmitting, and this ACK clears up to
 608          * the retransmit head, or further, then clear our state.
 609          */
 610         if (tp->retrans_head != NULL &&
 611             !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
 612                 tp->retrans_head = NULL;
 613
 614         while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
 615                 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 616                 __u8 sacked = scb->sacked;
 617
 618                 /* If our packet is before the ack sequence we can
 619                  * discard it as it's confirmed to have arrived at
 620                  * the other end.
 621                  */
 622                 if (after(scb->end_seq, ack))
 623                         break;
 624
 625                 /* Initial outgoing SYN's get put onto the write_queue
 626                  * just like anything else we transmit.  It is not
 627                  * true data, and if we misinform our callers that
 628                  * this ACK acks real data, we will erroneously exit
 629                  * connection startup slow start one packet too
 630                  * quickly.  This is severely frowned upon behavior.
 631                  */
 632                 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
 633                         tp->retrans_out--;
 634                 if(!(scb->flags & TCPCB_FLAG_SYN)) {
 635                         acked |= FLAG_DATA_ACKED;
 636                         if(sacked & TCPCB_SACKED_RETRANS)
 637                                 acked |= FLAG_RETRANS_DATA_ACKED;
 638                         if(tp->fackets_out)
 639                                 tp->fackets_out--;
 640                 } else {
 641                         /* This is pure paranoia. */
 642                         tp->retrans_head = NULL;
 643                 }
 644                 tp->packets_out--;
 645                 *seq = scb->seq;
 646                 *seq_rtt = now - scb->when;
 647                 __skb_unlink(skb, skb->list);
 648                 kfree_skb(skb);
 649         }
 650         return acked;
 651 }
 652
 653 static void tcp_ack_probe(struct sock *sk, __u32 ack)
 654 {
 655         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 656
 657         /* Our probe was answered. */
 658         tp->probes_out = 0;
 659
 660         /* Was it a usable window open? */
 661
 662         /* should always be non-null */
 663         if (tp->send_head != NULL &&
 664             !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
 665                 tp->backoff = 0;
 666                 tp->pending = 0;
 667                 tcp_clear_xmit_timer(sk, TIME_PROBE0);
 668         } else {
 669                 tcp_reset_xmit_timer(sk, TIME_PROBE0,
 670                                      min(tp->rto << tp->backoff, 120*HZ));
 671         }
 672 }
 673
 674 /* Should we open up the congestion window? */
 675 static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
 676 {
 677         /* Data must have been acked. */
 678         if ((flag & FLAG_DATA_ACKED) == 0)
 679                 return 0;
 680
 681         /* Some of the data acked was retransmitted somehow? */
 682         if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
 683                 /* We advance in all cases except during
 684                  * non-FACK fast retransmit/recovery.
 685                  */
 686                 if (tp->fackets_out != 0 ||
 687                     tp->retransmits != 0)
 688                         return 1;
 689
 690                 /* Non-FACK fast retransmit does it's own
 691                  * congestion window management, don't get
 692                  * in the way.
 693                  */
 694                 return 0;
 695         }
 696
 697         /* New non-retransmitted data acked, always advance.  */
 698         return 1;
 699 }
 700
 701 /* Read draft-ietf-tcplw-high-performance before mucking
 702  * with this code. (Superceeds RFC1323)
 703  */
 704 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
 705                                u32 seq, u32 ack, int flag)
 706 {
 707         __u32 seq_rtt;
 708
 709         /* RTTM Rule: A TSecr value received in a segment is used to
 710          * update the averaged RTT measurement only if the segment
 711          * acknowledges some new data, i.e., only if it advances the
 712          * left edge of the send window.
 713          *
 714          * See draft-ietf-tcplw-high-performance-00, section 3.3.
 715          * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
 716          */
 717         if (!(flag & FLAG_DATA_ACKED))
 718                 return;
 719
 720         seq_rtt = jiffies-tp->rcv_tsecr;
 721         tcp_rtt_estimator(tp, seq_rtt);
 722         if (tp->retransmits) {
 723                 if (tp->packets_out == 0) {
 724                         tp->retransmits = 0;
 725                         tp->fackets_out = 0;
 726                         tp->retrans_out = 0;
 727                         tp->backoff = 0;
 728                         tcp_set_rto(tp);
 729                 } else {
 730                         /* Still retransmitting, use backoff */
 731                         tcp_set_rto(tp);
 732                         tp->rto = tp->rto << tp->backoff;
 733                 }
 734         } else {
 735                 tcp_set_rto(tp);
 736         }
 737
 738         tcp_bound_rto(tp);
 739 }
 740
 741 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
 742 {
 743         struct sk_buff *skb = skb_peek(&sk->write_queue);
 744         long when = tp->rto - (jiffies - TCP_SKB_CB(skb)->when);
 745
 746         /* Some data was ACK'd, if still retransmitting (due to a
 747          * timeout), resend more of the retransmit queue.  The
 748          * congestion window is handled properly by that code.
 749          */
 750         if (tp->retransmits) {
 751                 tcp_xmit_retransmit_queue(sk);
 752                 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 753         } else {
 754                 tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
 755         }
 756 }
 757
 758 /* This routine deals with incoming acks, but not outgoing ones. */
 759 static int tcp_ack(struct sock *sk, struct tcphdr *th,
 760                    u32 ack_seq, u32 ack, int len)
 761 {
 762         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 763         int flag = 0;
 764         u32 seq = 0;
 765         u32 seq_rtt = 0;
 766
 767         if(sk->zapped)
 768                 return(1);      /* Dead, can't ack any more so why bother */
 769
 770         if (tp->pending == TIME_KEEPOPEN)
 771                 tp->probes_out = 0;
 772
 773         tp->rcv_tstamp = jiffies;
 774
 775         /* If the ack is newer than sent or older than previous acks
 776          * then we can probably ignore it.
 777          */
 778         if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
 779                 goto uninteresting_ack;
 780
 781         dst_confirm(sk->dst_cache);
 782
 783         /* If there is data set flag 1 */
 784         if (len != th->doff*4) {
 785                 flag |= FLAG_DATA;
 786                 tcp_delack_estimator(tp);
 787         }
 788
 789         /* Update our send window. */
 790
 791         /* This is the window update code as per RFC 793
 792          * snd_wl{1,2} are used to prevent unordered
 793          * segments from shrinking the window
 794          */
 795         if (before(tp->snd_wl1, ack_seq) ||
 796             (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
 797                 u32 nwin = ntohs(th->window) << tp->snd_wscale;
 798
 799                 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
 800                         flag |= FLAG_WIN_UPDATE;
 801                         tp->snd_wnd = nwin;
 802
 803                         tp->snd_wl1 = ack_seq;
 804                         tp->snd_wl2 = ack;
 805
 806                         if (nwin > tp->max_window)
 807                                 tp->max_window = nwin;
 808                 }
 809         }
 810
 811         /* We passed data and got it acked, remove any soft error
 812          * log. Something worked...
 813          */
 814         sk->err_soft = 0;
 815
 816         /* If this ack opens up a zero window, clear backoff.  It was
 817          * being used to time the probes, and is probably far higher than
 818          * it needs to be for normal retransmission.
 819          */
 820         if (tp->pending == TIME_PROBE0)
 821                 tcp_ack_probe(sk, ack);
 822
 823         /* See if we can take anything off of the retransmit queue. */
 824         flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
 825
 826         /* We must do this here, before code below clears out important
 827          * state contained in tp->fackets_out and tp->retransmits.  -DaveM
 828          */
 829         if (should_advance_cwnd(tp, flag))
 830                 tcp_cong_avoid(tp);
 831
 832         /* If we have a timestamp, we always do rtt estimates. */
 833         if (tp->saw_tstamp) {
 834                 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
 835         } else {
 836                 /* If we were retransmiting don't count rtt estimate. */
 837                 if (tp->retransmits) {
 838                         if (tp->packets_out == 0) {
 839                                 tp->retransmits = 0;
 840                                 tp->fackets_out = 0;
 841                                 tp->retrans_out = 0;
 842                         }
 843                 } else {
 844                         /* We don't have a timestamp. Can only use
 845                          * packets that are not retransmitted to determine
 846                          * rtt estimates. Also, we must not reset the
 847                          * backoff for rto until we get a non-retransmitted
 848                          * packet. This allows us to deal with a situation
 849                          * where the network delay has increased suddenly.
 850                          * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 851                          */
 852                         if (flag & FLAG_DATA_ACKED) {
 853                                 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
 854                                         tp->backoff = 0;
 855                                         tcp_rtt_estimator(tp, seq_rtt);
 856                                         tcp_set_rto(tp);
 857                                         tcp_bound_rto(tp);
 858                                 }
 859                         }
 860                 }
 861         }
 862
 863         if (tp->packets_out) {
 864                 if (flag & FLAG_DATA_ACKED)
 865                         tcp_ack_packets_out(sk, tp);
 866         } else {
 867                 tcp_clear_xmit_timer(sk, TIME_RETRANS);
 868         }
 869
 870         flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
 871         if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
 872             (tp->high_seq != 0)) {
 873                 tcp_fast_retrans(sk, ack, flag);
 874         } else {
 875                 /* Clear any aborted fast retransmit starts. */
 876                 tp->dup_acks = 0;
 877         }
 878         /* Remember the highest ack received. */
 879         tp->snd_una = ack;
 880         return 1;
 881
 882 uninteresting_ack:
 883         SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
 884         return 0;
 885 }
 886
 887 /* New-style handling of TIME_WAIT sockets. */
 888 extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
 889 extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
 890 extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
 891
 892 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
 893 {
 894         /* Unlink from various places. */
 895         if(tw->bind_next)
 896                 tw->bind_next->bind_pprev = tw->bind_pprev;
 897         *(tw->bind_pprev) = tw->bind_next;
 898         if(tw->tb->owners == NULL)
 899                 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
 900
 901         if(tw->next)
 902                 tw->next->pprev = tw->pprev;
 903         *tw->pprev = tw->next;
 904
 905         /* We decremented the prot->inuse count when we entered TIME_WAIT
 906          * and the sock from which this came was destroyed.
 907          */
 908         tw->sklist_next->sklist_prev = tw->sklist_prev;
 909         tw->sklist_prev->sklist_next = tw->sklist_next;
 910
 911         /* Ok, now free it up. */
 912         kmem_cache_free(tcp_timewait_cachep, tw);
 913 }
 914
 915 /* We come here as a special case from the AF specific TCP input processing,
 916  * and the SKB has no owner.  Essentially handling this is very simple,
 917  * we just keep silently eating rx'd packets until none show up for the
 918  * entire timeout period.  The only special cases are for BSD TIME_WAIT
 919  * reconnects and SYN/RST bits being set in the TCP header.
 920  */
 921 int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
 922                                struct tcphdr *th, unsigned len)
 923 {
 924         /*      RFC 1122:
 925          *      "When a connection is [...] on TIME-WAIT state [...]
 926          *      [a TCP] MAY accept a new SYN from the remote TCP to
 927          *      reopen the connection directly, if it:
 928          *
 929          *      (1)  assigns its initial sequence number for the new
 930          *      connection to be larger than the largest sequence
 931          *      number it used on the previous connection incarnation,
 932          *      and
 933          *
 934          *      (2)  returns to TIME-WAIT state if the SYN turns out
 935          *      to be an old duplicate".
 936          */
 937         if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
 938                 struct sock *sk;
 939                 struct tcp_func *af_specific = tw->af_specific;
 940                 __u32 isn;
 941
 942                 isn = tw->rcv_nxt + 128000;
 943                 if(isn == 0)
 944                         isn++;
 945                 tcp_tw_deschedule(tw);
 946                 tcp_timewait_kill(tw);
 947                 sk = af_specific->get_sock(skb, th);
 948                 if(sk == NULL || !ipsec_sk_policy(sk,skb))
 949                         return 0;
 950                 skb_set_owner_r(skb, sk);
 951                 af_specific = sk->tp_pinfo.af_tcp.af_specific;
 952                 if(af_specific->conn_request(sk, skb, isn) < 0)
 953                         return 1; /* Toss a reset back. */
 954                 return 0; /* Discard the frame. */
 955         }
 956
 957         /* Check RST or SYN */
 958         if(th->rst || th->syn) {
 959                 /* This is TIME_WAIT assasination, in two flavors.
 960                  * Oh well... nobody has a sufficient solution to this
 961                  * protocol bug yet.
 962                  */
 963                 if(sysctl_tcp_rfc1337 == 0) {
 964                         tcp_tw_deschedule(tw);
 965                         tcp_timewait_kill(tw);
 966                 }
 967                 if(!th->rst)
 968                         return 1; /* toss a reset back */
 969         } else {
 970                 /* In this case we must reset the TIMEWAIT timer. */
 971                 if(th->ack)
 972                         tcp_tw_reschedule(tw);
 973         }
 974         return 0; /* Discard the frame. */
 975 }
 976
 977 /* Enter the time wait state.  This is always called from BH
 978  * context.  Essentially we whip up a timewait bucket, copy the
 979  * relevant info into it from the SK, and mess with hash chains
 980  * and list linkage.
 981  */
 982 static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
 983 {
 984         struct sock **head, *sktw;
 985
 986         /* Step 1: Remove SK from established hash. */
 987         if(sk->next)
 988                 sk->next->pprev = sk->pprev;
 989         *sk->pprev = sk->next;
 990         sk->pprev = NULL;
 991         tcp_reg_zap(sk);
 992
 993         /* Step 2: Put TW into bind hash where SK was. */
 994         tw->tb = (struct tcp_bind_bucket *)sk->prev;
 995         if((tw->bind_next = sk->bind_next) != NULL)
 996                 sk->bind_next->bind_pprev = &tw->bind_next;
 997         tw->bind_pprev = sk->bind_pprev;
 998         *sk->bind_pprev = (struct sock *)tw;
 999
1000         /* Step 3: Same for the protocol sklist. */
1001         (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
1002         (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
1003         sk->sklist_next = NULL;
1004         sk->prot->inuse--;
1005
1006         /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
1007         head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)];
1008         sktw = (struct sock *)tw;
1009         if((sktw->next = *head) != NULL)
1010                 (*head)->pprev = &sktw->next;
1011         *head = sktw;
1012         sktw->pprev = head;
1013 }
1014
1015 void tcp_time_wait(struct sock *sk)
1016 {
1017         struct tcp_tw_bucket *tw;
1018
1019         tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
1020         if(tw != NULL) {
1021                 /* Give us an identity. */
1022                 tw->daddr       = sk->daddr;
1023                 tw->rcv_saddr   = sk->rcv_saddr;
1024                 tw->bound_dev_if= sk->bound_dev_if;
1025                 tw->num         = sk->num;
1026                 tw->state       = TCP_TIME_WAIT;
1027                 tw->sport       = sk->sport;
1028                 tw->dport       = sk->dport;
1029                 tw->family      = sk->family;
1030                 tw->reuse       = sk->reuse;
1031                 tw->rcv_nxt     = sk->tp_pinfo.af_tcp.rcv_nxt;
1032                 tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
1033
1034 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1035                 if(tw->family == PF_INET6) {
1036                         memcpy(&tw->v6_daddr,
1037                                &sk->net_pinfo.af_inet6.daddr,
1038                                sizeof(struct in6_addr));
1039                         memcpy(&tw->v6_rcv_saddr,
1040                                &sk->net_pinfo.af_inet6.rcv_saddr,
1041                                sizeof(struct in6_addr));
1042                 }
1043 #endif
1044                 /* Linkage updates. */
1045                 tcp_tw_hashdance(sk, tw);
1046
1047                 /* Get the TIME_WAIT timeout firing. */
1048                 tcp_tw_schedule(tw);
1049
1050                 /* CLOSE the SK. */
1051                 if(sk->state == TCP_ESTABLISHED)
1052                         tcp_statistics.TcpCurrEstab--;
1053                 sk->state = TCP_CLOSE;
1054                 net_reset_timer(sk, TIME_DONE,
1055                                 min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
1056         } else {
1057                 /* Sorry, we're out of memory, just CLOSE this
1058                  * socket up.  We've got bigger problems than
1059                  * non-graceful socket closings.
1060                  */
1061                 tcp_set_state(sk, TCP_CLOSE);
1062         }
1063
1064         /* Prevent rcvmsg/sndmsg calls, and wake people up. */
1065         sk->shutdown = SHUTDOWN_MASK;
1066         if(!sk->dead)
1067                 sk->state_change(sk);
1068 }
1069
1070 /*
1071  *      Process the FIN bit. This now behaves as it is supposed to work
1072  *      and the FIN takes effect when it is validly part of sequence
1073  *      space. Not before when we get holes.
1074  *
1075  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1076  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
1077  *      TIME-WAIT)
1078  *
1079  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
1080  *      close and we go into CLOSING (and later onto TIME-WAIT)
1081  *
1082  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1083  */
1084
1085 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1086 {
1087         sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1088
1089         tcp_send_ack(sk);
1090
1091         if (!sk->dead) {
1092                 sk->state_change(sk);
1093                 sock_wake_async(sk->socket, 1);
1094         }
1095
1096         switch(sk->state) {
1097                 case TCP_SYN_RECV:
1098                 case TCP_ESTABLISHED:
1099                         /* Move to CLOSE_WAIT */
1100                         tcp_set_state(sk, TCP_CLOSE_WAIT);
1101                         if (th->rst)
1102                                 sk->shutdown = SHUTDOWN_MASK;
1103                         break;
1104
1105                 case TCP_CLOSE_WAIT:
1106                 case TCP_CLOSING:
1107                         /* Received a retransmission of the FIN, do
1108                          * nothing.
1109                          */
1110                         break;
1111                 case TCP_LAST_ACK:
1112                         /* RFC793: Remain in the LAST-ACK state. */
1113                         break;
1114
1115                 case TCP_FIN_WAIT1:
1116                         /* This case occurs when a simultaneous close
1117                          * happens, we must ack the received FIN and
1118                          * enter the CLOSING state.
1119                          *
1120                          * This causes a WRITE timeout, which will either
1121                          * move on to TIME_WAIT when we timeout, or resend
1122                          * the FIN properly (maybe we get rid of that annoying
1123                          * FIN lost hang). The TIME_WRITE code is already
1124                          * correct for handling this timeout.
1125                          */
1126                         tcp_set_state(sk, TCP_CLOSING);
1127                         break;
1128                 case TCP_FIN_WAIT2:
1129                         /* Received a FIN -- send ACK and enter TIME_WAIT. */
1130                         tcp_time_wait(sk);
1131                         break;
1132                 default:
1133                         /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1134                          * cases we should never reach this piece of code.
1135                          */
1136                         printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1137                         break;
1138         };
1139 }
1140
1141 /* These routines update the SACK block as out-of-order packets arrive or
1142  * in-order packets close up the sequence space.
1143  */
1144 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1145 {
1146         int this_sack, num_sacks = tp->num_sacks;
1147         struct tcp_sack_block *swalk = &tp->selective_acks[0];
1148
1149         /* If more than one SACK block, see if the recent change to SP eats into
1150          * or hits the sequence space of other SACK blocks, if so coalesce.
1151          */
1152         if(num_sacks != 1) {
1153                 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1154                         if(swalk == sp)
1155                                 continue;
1156
1157                         /* First case, bottom of SP moves into top of the
1158                          * sequence space of SWALK.
1159                          */
1160                         if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1161                                 sp->start_seq = swalk->start_seq;
1162                                 goto coalesce;
1163                         }
1164                         /* Second case, top of SP moves into bottom of the
1165                          * sequence space of SWALK.
1166                          */
1167                         if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1168                                 sp->end_seq = swalk->end_seq;
1169                                 goto coalesce;
1170                         }
1171                 }
1172         }
1173         /* SP is the only SACK, or no coalescing cases found. */
1174         return;
1175
1176 coalesce:
1177         /* Zap SWALK, by moving every further SACK up by one slot.
1178          * Decrease num_sacks.
1179          */
1180         for(; this_sack < num_sacks-1; this_sack++, swalk++) {
1181                 struct tcp_sack_block *next = (swalk + 1);
1182                 swalk->start_seq = next->start_seq;
1183                 swalk->end_seq = next->end_seq;
1184         }
1185         tp->num_sacks--;
1186 }
1187
1188 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1189 {
1190         __u32 tmp;
1191
1192         tmp = sack1->start_seq;
1193         sack1->start_seq = sack2->start_seq;
1194         sack2->start_seq = tmp;
1195
1196         tmp = sack1->end_seq;
1197         sack1->end_seq = sack2->end_seq;
1198         sack2->end_seq = tmp;
1199 }
1200
1201 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1202 {
1203         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1204         struct tcp_sack_block *sp = &tp->selective_acks[0];
1205         int cur_sacks = tp->num_sacks;
1206
1207         if (!cur_sacks)
1208                 goto new_sack;
1209
1210         /* Optimize for the common case, new ofo frames arrive
1211          * "in order". ;-)  This also satisfies the requirements
1212          * of RFC2018 about ordering of SACKs.
1213          */
1214         if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1215                 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1216                 tcp_sack_maybe_coalesce(tp, sp);
1217         } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1218                 /* Re-ordered arrival, in this case, can be optimized
1219                  * as well.
1220                  */
1221                 sp->start_seq = TCP_SKB_CB(skb)->seq;
1222                 tcp_sack_maybe_coalesce(tp, sp);
1223         } else {
1224                 struct tcp_sack_block *swap = sp + 1;
1225                 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1226
1227                 /* Oh well, we have to move things around.
1228                  * Try to find a SACK we can tack this onto.
1229                  */
1230
1231                 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1232                         if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1233                            (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1234                                 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1235                                         swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1236                                 else
1237                                         swap->start_seq = TCP_SKB_CB(skb)->seq;
1238                                 tcp_sack_swap(sp, swap);
1239                                 tcp_sack_maybe_coalesce(tp, sp);
1240                                 return;
1241                         }
1242                 }
1243
1244                 /* Could not find an adjacent existing SACK, build a new one,
1245                  * put it at the front, and shift everyone else down.  We
1246                  * always know there is at least one SACK present already here.
1247                  *
1248                  * If the sack array is full, forget about the last one.
1249                  */
1250                 if (cur_sacks >= max_sacks) {
1251                         cur_sacks--;
1252                         tp->num_sacks--;
1253                 }
1254                 while(cur_sacks >= 1) {
1255                         struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1256                         struct tcp_sack_block *prev = (this - 1);
1257                         this->start_seq = prev->start_seq;
1258                         this->end_seq = prev->end_seq;
1259                         cur_sacks--;
1260                 }
1261
1262         new_sack:
1263                 /* Build the new head SACK, and we're done. */
1264                 sp->start_seq = TCP_SKB_CB(skb)->seq;
1265                 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1266                 tp->num_sacks++;
1267         }
1268 }
1269
1270 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1271 {
1272         struct tcp_sack_block *sp = &tp->selective_acks[0];
1273         int num_sacks = tp->num_sacks;
1274         int this_sack;
1275
1276         /* This is an in order data segment _or_ an out-of-order SKB being
1277          * moved to the receive queue, so we know this removed SKB will eat
1278          * from the front of a SACK.
1279          */
1280         for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1281                 /* Check if the start of the sack is covered by skb. */
1282                 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1283                    before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1284                         break;
1285         }
1286
1287         /* This should only happen if so many SACKs get built that some get
1288          * pushed out before we get here, or we eat some in sequence packets
1289          * which are before the first SACK block.
1290          */
1291         if(this_sack >= num_sacks)
1292                 return;
1293
1294         sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1295         if(!before(sp->start_seq, sp->end_seq)) {
1296                 /* Zap this SACK, by moving forward any other SACKS. */
1297                 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1298                         struct tcp_sack_block *next = (sp + 1);
1299                         sp->start_seq = next->start_seq;
1300                         sp->end_seq = next->end_seq;
1301                 }
1302                 tp->num_sacks--;
1303         }
1304 }
1305
1306 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1307 {
1308         struct tcp_sack_block *sp = &tp->selective_acks[0];
1309         int num_sacks = tp->num_sacks;
1310         int this_sack;
1311
1312         for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1313                 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1314                         break;
1315         }
1316         if(this_sack >= num_sacks)
1317                 return;
1318         sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1319 }
1320
1321 /* This one checks to see if we can put data from the
1322  * out_of_order queue into the receive_queue.
1323  */
1324 static void tcp_ofo_queue(struct sock *sk)
1325 {
1326         struct sk_buff *skb;
1327         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1328
1329         while ((skb = skb_peek(&tp->out_of_order_queue))) {
1330                 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1331                         break;
1332
1333                 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1334                         SOCK_DEBUG(sk, "ofo packet was already received \n");
1335                         __skb_unlink(skb, skb->list);
1336                         kfree_skb(skb);
1337                         continue;
1338                 }
1339                 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1340                            tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1341                            TCP_SKB_CB(skb)->end_seq);
1342
1343                 if(tp->sack_ok)
1344                         tcp_sack_remove_skb(tp, skb);
1345                 __skb_unlink(skb, skb->list);
1346                 __skb_queue_tail(&sk->receive_queue, skb);
1347                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1348                 if(skb->h.th->fin)
1349                         tcp_fin(skb, sk, skb->h.th);
1350         }
1351 }
1352
1353 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1354 {
1355         struct sk_buff *skb1;
1356         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1357
1358         /*  Queue data for delivery to the user.
1359          *  Packets in sequence go to the receive queue.
1360          *  Out of sequence packets to the out_of_order_queue.
1361          */
1362         if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1363                 /* Ok. In sequence. */
1364         queue_and_out:
1365                 dst_confirm(sk->dst_cache);
1366                 __skb_queue_tail(&sk->receive_queue, skb);
1367                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1368                 if(skb->h.th->fin) {
1369                         tcp_fin(skb, sk, skb->h.th);
1370                 } else {
1371                         tcp_remember_ack(tp, skb->h.th, skb);
1372                 }
1373                 /* This may have eaten into a SACK block. */
1374                 if(tp->sack_ok && tp->num_sacks)
1375                         tcp_sack_remove_skb(tp, skb);
1376                 tcp_ofo_queue(sk);
1377
1378                 /* Turn on fast path. */
1379                 if (skb_queue_len(&tp->out_of_order_queue) == 0)
1380                         tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1381                                                (0x10 << 16) |
1382                                                tp->snd_wnd);
1383                 return;
1384         }
1385
1386         /* An old packet, either a retransmit or some packet got lost. */
1387         if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1388                 /* A retransmit, 2nd most common case.  Force an imediate ack. */
1389                 SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1390                 tcp_enter_quickack_mode(tp);
1391                 kfree_skb(skb);
1392                 return;
1393         }
1394
1395         if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1396                 /* Partial packet, seq < rcv_next < end_seq */
1397                 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1398                            tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1399                            TCP_SKB_CB(skb)->end_seq);
1400
1401                 goto queue_and_out;
1402         }
1403
1404         /* Ok. This is an out_of_order segment, force an ack. */
1405         tp->delayed_acks++;
1406         tcp_enter_quickack_mode(tp);
1407
1408         /* Disable header prediction. */
1409         tp->pred_flags = 0;
1410
1411         SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1412                    tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1413
1414         if (skb_peek(&tp->out_of_order_queue) == NULL) {
1415                 /* Initial out of order segment, build 1 SACK. */
1416                 if(tp->sack_ok) {
1417                         tp->num_sacks = 1;
1418                         tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1419                         tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1420                 }
1421                 __skb_queue_head(&tp->out_of_order_queue,skb);
1422         } else {
1423                 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1424                         /* Already there. */
1425                         if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1426                                 if (skb->len >= skb1->len) {
1427                                         if(tp->sack_ok)
1428                                                 tcp_sack_extend(tp, skb1, skb);
1429                                         __skb_append(skb1, skb);
1430                                         __skb_unlink(skb1, skb1->list);
1431                                         kfree_skb(skb1);
1432                                 } else {
1433                                         /* A duplicate, smaller than what is in the
1434                                          * out-of-order queue right now, toss it.
1435                                          */
1436                                         kfree_skb(skb);
1437                                 }
1438                                 break;
1439                         }
1440
1441                         if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1442                                 __skb_append(skb1, skb);
1443                                 if(tp->sack_ok)
1444                                         tcp_sack_new_ofo_skb(sk, skb);
1445                                 break;
1446                         }
1447
1448                         /* See if we've hit the start. If so insert. */
1449                         if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1450                                 __skb_queue_head(&tp->out_of_order_queue,skb);
1451                                 if(tp->sack_ok)
1452                                         tcp_sack_new_ofo_skb(sk, skb);
1453                                 break;
1454                         }
1455                 }
1456         }
1457 }
1458
1459
1460 /*
1461  *      This routine handles the data.  If there is room in the buffer,
1462  *      it will be have already been moved into it.  If there is no
1463  *      room, then we will just have to discard the packet.
1464  */
1465
1466 static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1467 {
1468         struct tcphdr *th;
1469         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1470
1471         th = skb->h.th;
1472         skb_pull(skb, th->doff*4);
1473         skb_trim(skb, len - (th->doff*4));
1474
1475         if (skb->len == 0 && !th->fin)
1476                 return(0);
1477
1478         /*
1479          *      If our receive queue has grown past its limits shrink it.
1480          *      Make sure to do this before moving snd_nxt, otherwise
1481          *      data might be acked for that we don't have enough room.
1482          */
1483         if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
1484                 if (prune_queue(sk) < 0) {
1485                         /* Still not enough room. That can happen when
1486                          * skb->true_size differs significantly from skb->len.
1487                          */
1488                         return 0;
1489                 }
1490         }
1491
1492         tcp_data_queue(sk, skb);
1493
1494         if (before(tp->rcv_nxt, tp->copied_seq)) {
1495                 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1496                 tp->rcv_nxt = tp->copied_seq;
1497         }
1498
1499         /* Above, tcp_data_queue() increments delayed_acks appropriately.
1500          * Now tell the user we may have some data.
1501          */
1502         if (!sk->dead) {
1503                 SOCK_DEBUG(sk, "Data wakeup.\n");
1504                 sk->data_ready(sk,0);
1505         }
1506         return(1);
1507 }
1508
1509 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1510 {
1511         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1512
1513         if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1514             tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1515                 /* Put more data onto the wire. */
1516                 tcp_write_xmit(sk);
1517         } else if (tp->packets_out == 0 && !tp->pending) {
1518                 /* Start probing the receivers window. */
1519                 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1520         }
1521 }
1522
1523 static __inline__ void tcp_data_snd_check(struct sock *sk)
1524 {
1525         struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1526
1527         if (skb != NULL)
1528                 __tcp_data_snd_check(sk, skb);
1529 }
1530
1531 /*
1532  * Adapt the MSS value used to make delayed ack decision to the
1533  * real world.
1534  */
1535 static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1536 {
1537         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1538         unsigned int len = skb->len, lss;
1539
1540         if (len > tp->rcv_mss)
1541                 tp->rcv_mss = len;
1542         lss = tp->last_seg_size;
1543         tp->last_seg_size = 0;
1544         if (len >= 536) {
1545                 if (len == lss)
1546                         tp->rcv_mss = len;
1547                 tp->last_seg_size = len;
1548         }
1549 }
1550
1551 /*
1552  * Check if sending an ack is needed.
1553  */
1554 static __inline__ void __tcp_ack_snd_check(struct sock *sk)
1555 {
1556         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1557
1558         /* This also takes care of updating the window.
1559          * This if statement needs to be simplified.
1560          *
1561          * Rules for delaying an ack:
1562          *      - delay time <= 0.5 HZ
1563          *      - we don't have a window update to send
1564          *      - must send at least every 2 full sized packets
1565          *      - must send an ACK if we have any out of order data
1566          *
1567          * With an extra heuristic to handle loss of packet
1568          * situations and also helping the sender leave slow
1569          * start in an expediant manner.
1570          */
1571
1572             /* Two full frames received or... */
1573         if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1574             /* We will update the window "significantly" or... */
1575             tcp_raise_window(sk) ||
1576             /* We entered "quick ACK" mode or... */
1577             tcp_in_quickack_mode(tp) ||
1578             /* We have out of order data */
1579             (skb_peek(&tp->out_of_order_queue) != NULL)) {
1580                 /* Then ack it now */
1581                 tcp_send_ack(sk);
1582         } else {
1583                 /* Else, send delayed ack. */
1584                 tcp_send_delayed_ack(tp, HZ/2);
1585         }
1586 }
1587
1588 static __inline__ void tcp_ack_snd_check(struct sock *sk)
1589 {
1590         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1591         if (tp->delayed_acks == 0) {
1592                 /* We sent a data segment already. */
1593                 return;
1594         }
1595         __tcp_ack_snd_check(sk);
1596 }
1597
1598
1599 /*
1600  *      This routine is only called when we have urgent data
1601  *      signalled. Its the 'slow' part of tcp_urg. It could be
1602  *      moved inline now as tcp_urg is only called from one
1603  *      place. We handle URGent data wrong. We have to - as
1604  *      BSD still doesn't use the correction from RFC961.
1605  *      For 1003.1g we should support a new option TCP_STDURG to permit
1606  *      either form (or just set the sysctl tcp_stdurg).
1607  */
1608
1609 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1610 {
1611         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1612         u32 ptr = ntohs(th->urg_ptr);
1613
1614         if (ptr && !sysctl_tcp_stdurg)
1615                 ptr--;
1616         ptr += ntohl(th->seq);
1617
1618         /* Ignore urgent data that we've already seen and read. */
1619         if (after(tp->copied_seq, ptr))
1620                 return;
1621
1622         /* Do we already have a newer (or duplicate) urgent pointer? */
1623         if (tp->urg_data && !after(ptr, tp->urg_seq))
1624                 return;
1625
1626         /* Tell the world about our new urgent pointer. */
1627         if (sk->proc != 0) {
1628                 if (sk->proc > 0)
1629                         kill_proc(sk->proc, SIGURG, 1);
1630                 else
1631                         kill_pg(-sk->proc, SIGURG, 1);
1632         }
1633
1634         /* We may be adding urgent data when the last byte read was
1635          * urgent. To do this requires some care. We cannot just ignore
1636          * tp->copied_seq since we would read the last urgent byte again
1637          * as data, nor can we alter copied_seq until this data arrives
1638          * or we break the sematics of SIOCATMARK (and thus sockatmark())
1639          */
1640         if (tp->urg_seq == tp->copied_seq)
1641                 tp->copied_seq++;       /* Move the copied sequence on correctly */
1642         tp->urg_data = URG_NOTYET;
1643         tp->urg_seq = ptr;
1644
1645         /* Disable header prediction. */
1646         tp->pred_flags = 0;
1647 }
1648
1649 /* This is the 'fast' part of urgent handling. */
1650 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1651 {
1652         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1653
1654         /* Check if we get a new urgent pointer - normally not. */
1655         if (th->urg)
1656                 tcp_check_urg(sk,th);
1657
1658         /* Do we wait for any urgent data? - normally not... */
1659         if (tp->urg_data == URG_NOTYET) {
1660                 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
1661
1662                 /* Is the urgent pointer pointing into this packet? */
1663                 if (ptr < len) {
1664                         tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1665                         if (!sk->dead)
1666                                 sk->data_ready(sk,0);
1667                 }
1668         }
1669 }
1670
1671 /* Clean the out_of_order queue if we can, trying to get
1672  * the socket within its memory limits again.
1673  *
1674  * Return less than zero if we should start dropping frames
1675  * until the socket owning process reads some of the data
1676  * to stabilize the situation.
1677  */
1678 static int prune_queue(struct sock *sk)
1679 {
1680         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1681         struct sk_buff * skb;
1682
1683         SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
1684
1685         net_statistics.PruneCalled++;
1686
1687         /* First, purge the out_of_order queue. */
1688         skb = __skb_dequeue_tail(&tp->out_of_order_queue);
1689         if(skb != NULL) {
1690                 /* Free it all. */
1691                 do {    net_statistics.OfoPruned += skb->len;
1692                         kfree_skb(skb);
1693                         skb = __skb_dequeue_tail(&tp->out_of_order_queue);
1694                 } while(skb != NULL);
1695
1696                 /* Reset SACK state.  A conforming SACK implementation will
1697                  * do the same at a timeout based retransmit.  When a connection
1698                  * is in a sad state like this, we care only about integrity
1699                  * of the connection not performance.
1700                  */
1701                 if(tp->sack_ok)
1702                         tp->num_sacks = 0;
1703         }
1704
1705         /* If we are really being abused, tell the caller to silently
1706          * drop receive data on the floor.  It will get retransmitted
1707          * and hopefully then we'll have sufficient space.
1708          *
1709          * We used to try to purge the in-order packets too, but that
1710          * turns out to be deadly and fraught with races.  Consider:
1711          *
1712          * 1) If we acked the data, we absolutely cannot drop the
1713          *    packet.  This data would then never be retransmitted.
1714          * 2) It is possible, with a proper sequence of events involving
1715          *    delayed acks and backlog queue handling, to have the user
1716          *    read the data before it gets acked.  The previous code
1717          *    here got this wrong, and it lead to data corruption.
1718          * 3) Too much state changes happen when the FIN arrives, so once
1719          *    we've seen that we can't remove any in-order data safely.
1720          *
1721          * The net result is that removing in-order receive data is too
1722          * complex for anyones sanity.  So we don't do it anymore.  But
1723          * if we are really having our buffer space abused we stop accepting
1724          * new receive data.
1725          */
1726         if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
1727                 return 0;
1728
1729         /* Massive buffer overcommit. */
1730         return -1;
1731 }
1732
1733 /*
1734  *      TCP receive function for the ESTABLISHED state.
1735  *
1736  *      It is split into a fast path and a slow path. The fast path is
1737  *      disabled when:
1738  *      - A zero window was announced from us - zero window probing
1739  *        is only handled properly in the slow path.
1740  *      - Out of order segments arrived.
1741  *      - Urgent data is expected.
1742  *      - There is no buffer space left
1743  *      - Unexpected TCP flags/window values/header lengths are received
1744  *        (detected by checking the TCP header against pred_flags)
1745  *      - Data is sent in both directions. Fast path only supports pure senders
1746  *        or pure receivers (this means either the sequence number or the ack
1747  *        value must stay constant)
1748  *
1749  *      When these conditions are not satisfied it drops into a standard
1750  *      receive procedure patterned after RFC793 to handle all cases.
1751  *      The first three cases are guaranteed by proper pred_flags setting,
1752  *      the rest is checked inline. Fast processing is turned on in
1753  *      tcp_data_queue when everything is OK.
1754  */
1755 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
1756                         struct tcphdr *th, unsigned len)
1757 {
1758         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1759         int queued;
1760         u32 flg;
1761
1762         /*
1763          *      Header prediction.
1764          *      The code follows the one in the famous
1765          *      "30 instruction TCP receive" Van Jacobson mail.
1766          *
1767          *      Van's trick is to deposit buffers into socket queue
1768          *      on a device interrupt, to call tcp_recv function
1769          *      on the receive process context and checksum and copy
1770          *      the buffer to user space. smart...
1771          *
1772          *      Our current scheme is not silly either but we take the
1773          *      extra cost of the net_bh soft interrupt processing...
1774          *      We do checksum and copy also but from device to kernel.
1775          */
1776
1777         /*
1778          * RFC1323: H1. Apply PAWS check first.
1779          */
1780         if (tcp_fast_parse_options(sk, th, tp)) {
1781                 if (tp->saw_tstamp) {
1782                         if (tcp_paws_discard(tp, th, len)) {
1783                                 tcp_statistics.TcpInErrs++;
1784                                 if (!th->rst) {
1785                                         tcp_send_ack(sk);
1786                                         goto discard;
1787                                 }
1788                         }
1789                         tcp_replace_ts_recent(sk, tp,
1790                                               TCP_SKB_CB(skb)->seq,
1791                                               TCP_SKB_CB(skb)->end_seq);
1792                 }
1793         }
1794
1795         flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16);
1796
1797         /*      pred_flags is 0xS?10 << 16 + snd_wnd
1798          *      if header_predition is to be made
1799          *      'S' will always be tp->tcp_header_len >> 2
1800          *      '?' will be 0 else it will be !0
1801          *      (when there are holes in the receive
1802          *       space for instance)
1803          *      PSH flag is ignored.
1804          */
1805
1806         if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1807                 if (len <= th->doff*4) {
1808                         /* Bulk data transfer: sender */
1809                         if (len == th->doff*4) {
1810                                 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
1811                                         TCP_SKB_CB(skb)->ack_seq, len);
1812                                 kfree_skb(skb);
1813                                 tcp_data_snd_check(sk);
1814                                 return 0;
1815                         } else { /* Header too small */
1816                                 tcp_statistics.TcpInErrs++;
1817                                 goto discard;
1818                         }
1819                 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
1820                            atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
1821                         /* Bulk data transfer: receiver */
1822                         __skb_pull(skb,th->doff*4);
1823
1824                         tcp_measure_rcv_mss(sk, skb);
1825
1826                         /* DO NOT notify forward progress here.
1827                          * It saves dozen of CPU instructions in fast path. --ANK
1828                          */
1829                         __skb_queue_tail(&sk->receive_queue, skb);
1830                         tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1831
1832                         /* FIN bit check is not done since if FIN is set in
1833                          * this frame, the pred_flags won't match up. -DaveM
1834                          */
1835                         sk->data_ready(sk, 0);
1836                         tcp_delack_estimator(tp);
1837
1838                         tcp_remember_ack(tp, th, skb);
1839
1840                         __tcp_ack_snd_check(sk);
1841                         return 0;
1842                 }
1843         }
1844
1845         /*
1846          *      Standard slow path.
1847          */
1848
1849         if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
1850                 /* RFC793, page 37: "In all states except SYN-SENT, all reset
1851                  * (RST) segments are validated by checking their SEQ-fields."
1852                  * And page 69: "If an incoming segment is not acceptable,
1853                  * an acknowledgment should be sent in reply (unless the RST bit
1854                  * is set, if so drop the segment and return)".
1855                  */
1856                 if (th->rst)
1857                         goto discard;
1858                 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1859                         SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
1860                                    TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
1861                                    tp->rcv_wup, tp->rcv_wnd);
1862                 }
1863                 tcp_send_ack(sk);
1864                 goto discard;
1865         }
1866
1867         if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
1868                 SOCK_DEBUG(sk, "syn in established state\n");
1869                 tcp_statistics.TcpInErrs++;
1870                 tcp_reset(sk);
1871                 return 1;
1872         }
1873
1874         if(th->rst) {
1875                 tcp_reset(sk);
1876                 goto discard;
1877         }
1878
1879         if(th->ack)
1880                 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
1881
1882         /* Process urgent data. */
1883         tcp_urg(sk, th, len);
1884
1885         /* step 7: process the segment text */
1886         queued = tcp_data(skb, sk, len);
1887
1888         /* This must be after tcp_data() does the skb_pull() to
1889          * remove the header size from skb->len.
1890          *
1891          * Dave!!! Phrase above (and all about rcv_mss) has
1892          * nothing to do with reality. rcv_mss must measure TOTAL
1893          * size, including sacks, IP options etc. Hence, measure_rcv_mss
1894          * must occure before pulling etc, otherwise it will flap
1895          * like hell. Even putting it before tcp_data is wrong,
1896          * it should use skb->tail - skb->nh.raw instead.
1897          *                                      --ANK (980805)
1898          *
1899          * BTW I broke it. Now all TCP options are handled equally
1900          * in mss_clamp calculations (i.e. ignored, rfc1122),
1901          * and mss_cache does include all of them (i.e. tstamps)
1902          * except for sacks, to calulate effective mss faster.
1903          *                                      --ANK (980805)
1904          */
1905         tcp_measure_rcv_mss(sk, skb);
1906
1907         /* Be careful, tcp_data() may have put this into TIME_WAIT. */
1908         if(sk->state != TCP_CLOSE) {
1909                 tcp_data_snd_check(sk);
1910                 tcp_ack_snd_check(sk);
1911         }
1912
1913         if (!queued) {
1914         discard:
1915                 kfree_skb(skb);
1916         }
1917
1918         return 0;
1919 }
1920
1921 /*
1922  *      Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented
1923  *      as an open_request.
1924  */
1925
1926 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
1927                            struct open_request *req)
1928 {
1929         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1930         u32 flg;
1931
1932         /*      assumption: the socket is not in use.
1933          *      as we checked the user count on tcp_rcv and we're
1934          *      running from a soft interrupt.
1935          */
1936
1937         /* Check for syn retransmission */
1938         flg = *(((u32 *)skb->h.th) + 3);
1939
1940         flg &= __constant_htonl(0x00170000);
1941         /* Only SYN set? */
1942         if (flg == __constant_htonl(0x00020000)) {
1943                 if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
1944                         /*      retransmited syn.
1945                          */
1946                         req->class->rtx_syn_ack(sk, req);
1947                         return NULL;
1948                 } else {
1949                         return sk; /* Pass new SYN to the listen socket. */
1950                 }
1951         }
1952
1953         /* We know it's an ACK here */
1954         if (req->sk) {
1955                 /*      socket already created but not
1956                  *      yet accepted()...
1957                  */
1958                 sk = req->sk;
1959         } else {
1960                 /* In theory the packet could be for a cookie, but
1961                  * TIME_WAIT should guard us against this.
1962                  * XXX: Nevertheless check for cookies?
1963                  * This sequence number check is done again later,
1964                  * but we do it here to prevent syn flood attackers
1965                  * from creating big SYN_RECV sockets.
1966                  */
1967                 if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
1968                     !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
1969                              req->rcv_isn+1+req->rcv_wnd)) {
1970                         req->class->send_reset(skb);
1971                         return NULL;
1972                 }
1973
1974                 sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
1975                 tcp_dec_slow_timer(TCP_SLT_SYNACK);
1976                 if (sk == NULL)
1977                         return NULL;
1978
1979                 req->expires = 0UL;
1980                 req->sk = sk;
1981         }
1982         skb_orphan(skb);
1983         skb_set_owner_r(skb, sk);
1984         return sk;
1985 }
1986
1987 /*
1988  *      This function implements the receiving procedure of RFC 793 for
1989  *      all states except ESTABLISHED and TIME_WAIT.
1990  *      It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
1991  *      address independent.
1992  */
1993
1994 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
1995                           struct tcphdr *th, unsigned len)
1996 {
1997         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1998         int queued = 0;
1999
2000         switch (sk->state) {
2001         case TCP_CLOSE:
2002                 /* When state == CLOSED, hash lookup always fails.
2003                  *
2004                  * But, there is a back door, the backlog queue.
2005                  * If we have a sequence of packets in the backlog
2006                  * during __release_sock() which have a sequence such
2007                  * that:
2008                  *      packet X        causes entry to TCP_CLOSE state
2009                  *      ...
2010                  *      packet X + N    has FIN bit set
2011                  *
2012                  * We report a (luckily) harmless error in this case.
2013                  * The issue is that backlog queue processing bypasses
2014                  * any hash lookups (we know which socket packets are for).
2015                  * The correct behavior here is what 2.0.x did, since
2016                  * a TCP_CLOSE socket does not exist.  Drop the frame
2017                  * and send a RST back to the other end.
2018                  */
2019                 return 1;
2020
2021         case TCP_LISTEN:
2022                 /* These use the socket TOS..
2023                  * might want to be the received TOS
2024                  */
2025                 if(th->ack)
2026                         return 1;
2027
2028                 if(th->syn) {
2029                         if(tp->af_specific->conn_request(sk, skb, 0) < 0)
2030                                 return 1;
2031
2032                         /* Now we have several options: In theory there is
2033                          * nothing else in the frame. KA9Q has an option to
2034                          * send data with the syn, BSD accepts data with the
2035                          * syn up to the [to be] advertised window and
2036                          * Solaris 2.1 gives you a protocol error. For now
2037                          * we just ignore it, that fits the spec precisely
2038                          * and avoids incompatibilities. It would be nice in
2039                          * future to drop through and process the data.
2040                          *
2041                          * Now that TTCP is starting to be used we ought to
2042                          * queue this data.
2043                          * But, this leaves one open to an easy denial of
2044                          * service attack, and SYN cookies can't defend
2045                          * against this problem. So, we drop the data
2046                          * in the interest of security over speed.
2047                          */
2048                         goto discard;
2049                 }
2050
2051                 goto discard;
2052                 break;
2053
2054         case TCP_SYN_SENT:
2055                 /* SYN sent means we have to look for a suitable ack and
2056                  * either reset for bad matches or go to connected.
2057                  * The SYN_SENT case is unusual and should
2058                  * not be in line code. [AC]
2059                  */
2060                 if(th->ack) {
2061                         tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2062
2063                         /* We got an ack, but it's not a good ack. */
2064                         if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
2065                                     TCP_SKB_CB(skb)->ack_seq, len))
2066                                 return 1;
2067
2068                         if(th->rst) {
2069                                 tcp_reset(sk);
2070                                 goto discard;
2071                         }
2072
2073                         if(!th->syn)
2074                                 goto discard;
2075
2076                         /* Ok.. it's good. Set up sequence numbers and
2077                          * move to established.
2078                          */
2079                         tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2080                         tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2081
2082                         /* RFC1323: The window in SYN & SYN/ACK segments is
2083                          * never scaled.
2084                          */
2085                         tp->snd_wnd = htons(th->window);
2086                         tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2087                         tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2088                         tp->fin_seq = TCP_SKB_CB(skb)->seq;
2089
2090                         tcp_set_state(sk, TCP_ESTABLISHED);
2091                         tcp_parse_options(sk, th, tp, 0);
2092
2093                         if (tp->wscale_ok == 0) {
2094                                 tp->snd_wscale = tp->rcv_wscale = 0;
2095                                 tp->window_clamp = min(tp->window_clamp,65535);
2096                         }
2097
2098                         if (tp->tstamp_ok) {
2099                                 tp->tcp_header_len =
2100                                         sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2101                         } else
2102                                 tp->tcp_header_len = sizeof(struct tcphdr);
2103                         if (tp->saw_tstamp) {
2104                                 tp->ts_recent = tp->rcv_tsval;
2105                                 tp->ts_recent_stamp = jiffies;
2106                         }
2107
2108                         /* Can't be earlier, doff would be wrong. */
2109                         tcp_send_ack(sk);
2110
2111                         sk->dport = th->source;
2112                         tp->copied_seq = tp->rcv_nxt;
2113
2114                         if(!sk->dead) {
2115                                 sk->state_change(sk);
2116                                 sock_wake_async(sk->socket, 0);
2117                         }
2118                 } else {
2119                         if(th->syn && !th->rst) {
2120                                 /* The previous version of the code
2121                                  * checked for "connecting to self"
2122                                  * here. that check is done now in
2123                                  * tcp_connect.
2124                                  */
2125                                 tcp_set_state(sk, TCP_SYN_RECV);
2126                                 tcp_parse_options(sk, th, tp, 0);
2127                                 if (tp->saw_tstamp) {
2128                                         tp->ts_recent = tp->rcv_tsval;
2129                                         tp->ts_recent_stamp = jiffies;
2130                                 }
2131
2132                                 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2133                                 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2134
2135                                 /* RFC1323: The window in SYN & SYN/ACK segments is
2136                                  * never scaled.
2137                                  */
2138                                 tp->snd_wnd = htons(th->window);
2139                                 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2140
2141                                 tcp_send_synack(sk);
2142                         } else
2143                                 break;
2144                 }
2145
2146                 /* tp->tcp_header_len and tp->mss_clamp
2147                    probably changed, synchronize mss.
2148                    */
2149                 tcp_sync_mss(sk, tp->pmtu_cookie);
2150                 tp->rcv_mss = tp->mss_cache;
2151
2152                 if (sk->state == TCP_SYN_RECV)
2153                         goto discard;
2154
2155                 goto step6;
2156         }
2157
2158         /*   Parse the tcp_options present on this header.
2159          *   By this point we really only expect timestamps.
2160          *   Note that this really has to be here and not later for PAWS
2161          *   (RFC1323) to work.
2162          */
2163         if (tcp_fast_parse_options(sk, th, tp)) {
2164                 /* NOTE: assumes saw_tstamp is never set if we didn't
2165                  * negotiate the option. tcp_fast_parse_options() must
2166                  * guarantee this.
2167                  */
2168                 if (tp->saw_tstamp) {
2169                         if (tcp_paws_discard(tp, th, len)) {
2170                                 tcp_statistics.TcpInErrs++;
2171                                 if (!th->rst) {
2172                                         tcp_send_ack(sk);
2173                                         goto discard;
2174                                 }
2175                         }
2176                         tcp_replace_ts_recent(sk, tp,
2177                                               TCP_SKB_CB(skb)->seq,
2178                                               TCP_SKB_CB(skb)->end_seq);
2179                 }
2180         }
2181
2182         /* step 1: check sequence number */
2183         if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
2184                 if (!th->rst) {
2185                         tcp_send_ack(sk);
2186                         goto discard;
2187                 }
2188         }
2189
2190         /* step 2: check RST bit */
2191         if(th->rst) {
2192                 tcp_reset(sk);
2193                 goto discard;
2194         }
2195
2196         /* step 3: check security and precedence [ignored] */
2197
2198         /*      step 4:
2199          *
2200          *      Check for a SYN, and ensure it matches the SYN we were
2201          *      first sent. We have to handle the rather unusual (but valid)
2202          *      sequence that KA9Q derived products may generate of
2203          *
2204          *      SYN
2205          *                              SYN|ACK Data
2206          *      ACK     (lost)
2207          *                              SYN|ACK Data + More Data
2208          *      .. we must ACK not RST...
2209          *
2210          *      We keep syn_seq as the sequence space occupied by the
2211          *      original syn.
2212          */
2213
2214         if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2215                 tcp_reset(sk);
2216                 return 1;
2217         }
2218
2219         /* step 5: check the ACK field */
2220         if (th->ack) {
2221                 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2222                                          TCP_SKB_CB(skb)->ack_seq, len);
2223
2224                 switch(sk->state) {
2225                 case TCP_SYN_RECV:
2226                         if (acceptable) {
2227                                 tcp_set_state(sk, TCP_ESTABLISHED);
2228                                 sk->dport = th->source;
2229                                 tp->copied_seq = tp->rcv_nxt;
2230
2231                                 if(!sk->dead)
2232                                         sk->state_change(sk);
2233
2234                                 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
2235                                 tp->snd_wnd = htons(th->window) << tp->snd_wscale;
2236                                 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2237                                 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2238
2239                         } else {
2240                                 SOCK_DEBUG(sk, "bad ack\n");
2241                                 return 1;
2242                         }
2243                         break;
2244
2245                 case TCP_FIN_WAIT1:
2246                         if (tp->snd_una == tp->write_seq) {
2247                                 sk->shutdown |= SEND_SHUTDOWN;
2248                                 tcp_set_state(sk, TCP_FIN_WAIT2);
2249                                 if (!sk->dead)
2250                                         sk->state_change(sk);
2251                                 else
2252                                         tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
2253                         }
2254                         break;
2255
2256                 case TCP_CLOSING:
2257                         if (tp->snd_una == tp->write_seq) {
2258                                 tcp_time_wait(sk);
2259                                 goto discard;
2260                         }
2261                         break;
2262
2263                 case TCP_LAST_ACK:
2264                         if (tp->snd_una == tp->write_seq) {
2265                                 sk->shutdown = SHUTDOWN_MASK;
2266                                 tcp_set_state(sk,TCP_CLOSE);
2267                                 if (!sk->dead)
2268                                         sk->state_change(sk);
2269                                 goto discard;
2270                         }
2271                         break;
2272                 }
2273         } else
2274                 goto discard;
2275
2276 step6:
2277         /* step 6: check the URG bit */
2278         tcp_urg(sk, th, len);
2279
2280         /* step 7: process the segment text */
2281         switch (sk->state) {
2282         case TCP_CLOSE_WAIT:
2283         case TCP_CLOSING:
2284                 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
2285                         break;
2286
2287         case TCP_FIN_WAIT1:
2288         case TCP_FIN_WAIT2:
2289                 /* RFC 793 says to queue data in these states,
2290                  * RFC 1122 says we MUST send a reset.
2291                  * BSD 4.4 also does reset.
2292                  */
2293                 if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
2294                         if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
2295                                 tcp_reset(sk);
2296                                 return 1;
2297                         }
2298                 }
2299
2300         case TCP_ESTABLISHED:
2301                 queued = tcp_data(skb, sk, len);
2302
2303                 /* This must be after tcp_data() does the skb_pull() to
2304                  * remove the header size from skb->len.
2305                  */
2306                 tcp_measure_rcv_mss(sk, skb);
2307                 break;
2308         }
2309
2310         tcp_data_snd_check(sk);
2311         tcp_ack_snd_check(sk);
2312
2313         if (!queued) {
2314 discard:
2315                 kfree_skb(skb);
2316         }
2317         return 0;
2318 }