1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.173 1999/09/07 02:31:27 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
60 #include <linux/config.h>
61 #include <linux/mm.h>
62 #include <linux/sysctl.h>
63 #include <net/tcp.h>
64 #include <net/inet_common.h>
65 #include <linux/ipsec.h>
67 #ifdef CONFIG_SYSCTL
68 #define SYNC_INIT 0 /* let the user enable it */
69 #else
70 #define SYNC_INIT 1
71 #endif
73 extern int sysctl_tcp_fin_timeout;
74 extern int sysctl_tcp_keepalive_time;
76 /* These are on by default so the code paths get tested.
77 * For the final 2.2 this may be undone at our discretion. -DaveM
79 int sysctl_tcp_timestamps = 1;
80 int sysctl_tcp_window_scaling = 1;
81 int sysctl_tcp_sack = 1;
83 int sysctl_tcp_syncookies = SYNC_INIT;
84 int sysctl_tcp_stdurg;
85 int sysctl_tcp_rfc1337;
86 int sysctl_tcp_tw_recycle;
88 static int prune_queue(struct sock *sk);
90 /* There is something which you must keep in mind when you analyze the
91 * behavior of the tp->ato delayed ack timeout interval. When a
92 * connection starts up, we want to ack as quickly as possible. The
93 * problem is that "good" TCPs do slow start at the beginning of data
94 * transmission. This means that until we send the first few ACKs the
95 * sender will sit on his end and only queue most of his data, because
96 * he can only send snd_cwnd unacked packets at any given time. For
97 * each ACK we send, he increments snd_cwnd and transmits more of his
98 * queue. -DaveM
100 static void tcp_delack_estimator(struct tcp_opt *tp)
102 if(tp->ato == 0) {
103 tp->lrcvtime = tcp_time_stamp;
105 /* Help sender leave slow start quickly,
106 * and also makes sure we do not take this
107 * branch ever again for this connection.
109 tp->ato = 1;
110 tcp_enter_quickack_mode(tp);
111 } else {
112 int m = tcp_time_stamp - tp->lrcvtime;
114 tp->lrcvtime = tcp_time_stamp;
115 if(m <= 0)
116 m = 1;
117 if(m > tp->rto)
118 tp->ato = tp->rto;
119 else {
120 /* This funny shift makes sure we
121 * clear the "quick ack mode" bit.
123 tp->ato = ((tp->ato << 1) >> 2) + m;
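/* A worked example of the update above (illustrative numbers, not from
 * the original source): the 0x80000000 bit of tp->ato doubles as the
 * "quick ack mode" flag, which tcp_remember_ack() below is careful to
 * preserve.  With ato = 0x80000005 (flag set, 5 ticks) and a new
 * inter-arrival gap m = 3:
 *
 *	ato << 1		= 0x0000000a	(flag bit shifted out)
 *	(ato << 1) >> 2		= 0x00000002	(old ato / 2, flag cleared)
 *	... + m			= 0x00000005
 *
 * so the new ato is roughly half the previous estimate plus the fresh
 * measurement, and quick ack mode is switched off as a side effect.
 */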
129 * Remember to send an ACK later.
131 static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
132 struct sk_buff *skb)
134 tp->delayed_acks++;
136 /* Tiny-grams with PSH set artificially deflate our
137 * ato measurement, but with a lower bound.
139 if(th->psh && (skb->len < (tp->rcv_mss >> 1))) {
140 /* Preserve the quickack state. */
141 if((tp->ato & 0x7fffffff) > HZ/50)
142 tp->ato = ((tp->ato & 0x80000000) |
143 (HZ/50));
147 /* Called to compute a smoothed rtt estimate. The data fed to this
148 * routine either comes from timestamps, or from segments that were
149 * known _not_ to have been retransmitted [see Karn/Partridge
150 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
151 * piece by Van Jacobson.
152 * NOTE: the next three routines used to be one big routine.
153 * To save cycles in the RFC 1323 implementation it was better to break
154 * it up into three procedures. -- erics
157 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
159 long m = mrtt; /* RTT */
161 /* The following amusing code comes from Jacobson's
162 * article in SIGCOMM '88. Note that rtt and mdev
163 * are scaled versions of rtt and mean deviation.
164 * This is designed to be as fast as possible
165 * m stands for "measurement".
167 * In a 1990 paper the rto value is changed to:
168 * RTO = rtt + 4 * mdev
170 if(m == 0)
171 m = 1;
172 if (tp->srtt != 0) {
173 m -= (tp->srtt >> 3); /* m is now error in rtt est */
174 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
175 if (m < 0)
176 m = -m; /* m is now abs(error) */
177 m -= (tp->mdev >> 2); /* similar update on mdev */
178 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
179 } else {
180 /* no previous measure. */
181 tp->srtt = m<<3; /* take the measured time to be rtt */
182 tp->mdev = m<<2; /* make sure rto = 3*rtt */
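/* A minimal worked example of the fixed-point arithmetic above
 * (illustrative numbers): srtt is kept scaled by 8 and mdev scaled by 4,
 * so the additions implement srtt = 7/8*srtt + 1/8*m and
 * mdev = 3/4*mdev + 1/4*|err| with shifts only, no divisions.
 *
 *	Start: srtt = 800 (100 ticks), mdev = 40 (10 ticks), m = 120.
 *
 *	m    -= srtt >> 3;	m = 120 - 100 = 20	(error in estimate)
 *	srtt += m;		srtt = 820		(102.5 ticks)
 *	m     = -m if m < 0;	m = 20			(absolute error)
 *	m    -= mdev >> 2;	m = 20 - 10 = 10
 *	mdev += m;		mdev = 50		(12.5 ticks)
 */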
186 /* Calculate rto without backoff. This is the second half of Van Jacobson's
187 * routine referred to above.
190 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
192 tp->rto = (tp->srtt >> 3) + tp->mdev;
193 /* I am not educated enough to understand this magic.
194 * However, it smells bad. snd_cwnd>31 is common case.
196 tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
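/* In unscaled terms the first statement above is the classical Jacobson
 * timeout: srtt >> 3 is the smoothed RTT and mdev carries 4 times the
 * mean deviation (see the scaling in tcp_rtt_estimator), so effectively
 *
 *	rto = SRTT + 4 * MDEV
 *
 * The second statement is the ad-hoc correction that the comment above
 * complains about: it pads the timeout by 25% plus an extra term that
 * only matters for very small congestion windows.
 */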
200 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
201 * on packet lifetime in the internet. We need the HZ/5 lower
202 * bound to behave correctly against BSD stacks with a fixed
203 * delayed ack.
204 * FIXME: It's not entirely clear this lower bound is the best
205 * way to avoid the problem. Is it possible to drop the lower
206 * bound and still avoid trouble with BSD stacks? Perhaps
207 * some modification to the RTO calculation that takes delayed
208 * ack bias into account? This needs serious thought. -- erics
210 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
212 if (tp->rto > 120*HZ)
213 tp->rto = 120*HZ;
214 if (tp->rto < HZ/5)
215 tp->rto = HZ/5;
218 /* Save metrics learned by this TCP session.
219 This function is called only when TCP finishes successfully,
220 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
222 static void tcp_update_metrics(struct sock *sk)
224 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
225 struct dst_entry *dst = __sk_dst_get(sk);
227 if (dst) {
228 int m;
230 if (tp->backoff || !tp->srtt) {
231 /* This session failed to estimate rtt. Why?
232 * Probably, no packets returned in time.
233 * Reset our results.
235 if (!(dst->mxlock&(1<<RTAX_RTT)))
236 dst->rtt = 0;
237 return;
240 dst_confirm(dst);
242 m = dst->rtt - tp->srtt;
244 /* If the newly calculated rtt is larger than the stored one,
245 * store the new one. Otherwise, use EWMA. Remember,
246 * rtt overestimation is always better than underestimation.
248 if (!(dst->mxlock&(1<<RTAX_RTT))) {
249 if (m <= 0)
250 dst->rtt = tp->srtt;
251 else
252 dst->rtt -= (m>>3);
255 if (!(dst->mxlock&(1<<RTAX_RTTVAR))) {
256 if (m < 0)
257 m = -m;
259 /* Scale deviation to rttvar fixed point */
260 m >>= 1;
261 if (m < tp->mdev)
262 m = tp->mdev;
264 if (m >= dst->rttvar)
265 dst->rttvar = m;
266 else
267 dst->rttvar -= (dst->rttvar - m)>>2;
270 if (tp->snd_ssthresh == 0x7FFFFFFF) {
271 /* Slow start still did not finish. */
272 if (dst->ssthresh &&
273 !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
274 tp->snd_cwnd > dst->ssthresh)
275 dst->ssthresh = tp->snd_cwnd;
276 if (!(dst->mxlock&(1<<RTAX_CWND)) &&
277 tp->snd_cwnd > dst->cwnd)
278 dst->cwnd = tp->snd_cwnd;
279 } else if (tp->snd_cwnd >= tp->snd_ssthresh && !tp->high_seq) {
280 /* Cong. avoidance phase, cwnd is reliable. */
281 if (!(dst->mxlock&(1<<RTAX_SSTHRESH)))
282 dst->ssthresh = tp->snd_cwnd;
283 if (!(dst->mxlock&(1<<RTAX_CWND)))
284 dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1;
285 } else {
286 /* Else slow start did not finish, cwnd is non-sense,
287 ssthresh may be also invalid.
289 if (!(dst->mxlock&(1<<RTAX_CWND)))
290 dst->cwnd = (dst->cwnd + tp->snd_ssthresh)>>1;
291 if (dst->ssthresh &&
292 !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
293 tp->snd_ssthresh > dst->ssthresh)
294 dst->ssthresh = tp->snd_ssthresh;
299 /* Initialize metrics on socket. */
301 static void tcp_init_metrics(struct sock *sk)
303 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
304 struct dst_entry *dst = __sk_dst_get(sk);
306 if (dst == NULL)
307 goto reset;
309 dst_confirm(dst);
311 if (dst->rtt == 0)
312 goto reset;
314 if (!tp->srtt || !tp->saw_tstamp)
315 goto reset;
317 /* Initial rtt is determined from SYN,SYN-ACK.
318 * The segment is small and rtt may appear much
319 * less than the real one. Use per-dst memory
320 * to make it more realistic.
322 * A bit of theory. RTT is the time that passes after a "normal" sized packet
323 * is sent until it is ACKed. In normal circumstances sending small
324 * packets forces the peer to delay ACKs and the calculation is still correct.
325 * The algorithm is adaptive and, provided we follow the specs, it
326 * NEVER underestimates RTT. BUT! If the peer tries some clever
327 * trick such as "quick acks" for long enough to decrease the RTT
328 * to a low value, and then abruptly stops doing it and starts to delay
329 * ACKs, expect trouble.
331 if (dst->rtt > tp->srtt)
332 tp->srtt = dst->rtt;
333 if (dst->rttvar > tp->mdev)
334 tp->mdev = dst->rttvar;
335 tcp_set_rto(tp);
336 tcp_bound_rto(tp);
338 if (dst->mxlock&(1<<RTAX_CWND))
339 tp->snd_cwnd_clamp = dst->cwnd;
340 if (dst->ssthresh) {
341 tp->snd_ssthresh = dst->ssthresh;
342 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
343 tp->snd_ssthresh = tp->snd_cwnd_clamp;
345 return;
348 reset:
349 /* Play it conservative. If timestamps are not
350 * supported, TCP will fail to recalculate the correct
351 * rtt if the initial rto is too small. FORGET ALL AND RESET!
353 if (!tp->saw_tstamp && tp->srtt) {
354 tp->srtt = 0;
355 tp->mdev = TCP_TIMEOUT_INIT;
356 tp->rto = TCP_TIMEOUT_INIT;
360 #define PAWS_24DAYS (60 * 60 * 24 * 24)
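/* Why 24 days (a sketch of the reasoning, not a quote from the RFC):
 * RFC 1323 allows the peer's timestamp clock to tick as fast as once per
 * millisecond, and PAWS compares timestamps as signed 32-bit differences.
 * 2^31 ms is roughly 24.8 days, so once ts_recent is about 24 days old
 * the sign of (rcv_tsval - ts_recent) can no longer be trusted and the
 * cached value must be replaced, which is what the checks below do.
 */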
363 /* WARNING: this must not be called if tp->saw_tstamp was false. */
364 extern __inline__ void
365 tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
367 if (!after(seq, tp->last_ack_sent)) {
368 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
369 * extra check below makes sure this can only happen
370 * for pure ACK frames. -DaveM
372 * Not only that, it also occurs for expired timestamps
373 * and RSTs with bad timestamp option. --ANK
376 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 ||
377 xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) {
378 tp->ts_recent = tp->rcv_tsval;
379 tp->ts_recent_stamp = xtime.tv_sec;
384 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb)
386 return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
387 xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS
389 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
391 I cannot quietly watch as the whole idea behind PAWS
392 is destroyed 8)
394 The problem is only in reordering duplicate ACKs.
395 Hence, we can check this rare case more carefully.
397 1. Check that it is really duplicate ACK (ack==snd_una)
398 2. Give it some small "replay" window (~RTO)
400 We do not know units of foreign ts values, but make conservative
401 assumption that they are >=1ms. It solves problem
402 noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
404 && (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq ||
405 TCP_SKB_CB(skb)->ack_seq != tp->snd_una ||
406 !skb->h.th->ack ||
407 (s32)(tp->ts_recent - tp->rcv_tsval) > (tp->rto*1024)/HZ));
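/* On the last clause above: tp->rto is in jiffies, so (tp->rto*1024)/HZ
 * is approximately the RTO in milliseconds (1024 presumably standing in
 * for 1000 to keep the arithmetic cheap).  Combined with the assumption
 * above that the foreign timestamp units are >= 1ms, this tolerates a
 * duplicate ACK whose timestamp lags ts_recent by up to about one RTO.
 */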
411 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
413 u32 end_window = tp->rcv_wup + tp->rcv_wnd;
415 if (tp->rcv_wnd &&
416 after(end_seq, tp->rcv_nxt) &&
417 before(seq, end_window))
418 return 1;
419 if (seq != end_window)
420 return 0;
421 return (seq == end_seq);
424 /* This function checks to see if the tcp header is actually acceptable. */
425 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
427 if (seq == tp->rcv_nxt)
428 return (tp->rcv_wnd || (end_seq == seq));
430 return __tcp_sequence(tp, seq, end_seq);
433 /* When we get a reset we do this. */
434 static void tcp_reset(struct sock *sk)
436 sk->zapped = 1;
438 /* We want the right error as BSD sees it (and indeed as we do). */
439 switch (sk->state) {
440 case TCP_SYN_SENT:
441 sk->err = ECONNREFUSED;
442 break;
443 case TCP_CLOSE_WAIT:
444 sk->err = EPIPE;
445 break;
446 case TCP_CLOSE:
447 return;
448 default:
449 sk->err = ECONNRESET;
451 tcp_set_state(sk, TCP_CLOSE);
452 tcp_clear_xmit_timers(sk);
453 tcp_done(sk);
456 /* This tags the retransmission queue when SACKs arrive. */
457 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
459 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
460 int i = nsacks;
462 while(i--) {
463 struct sk_buff *skb = skb_peek(&sk->write_queue);
464 __u32 start_seq = ntohl(sp->start_seq);
465 __u32 end_seq = ntohl(sp->end_seq);
466 int fack_count = 0;
468 while((skb != NULL) &&
469 (skb != tp->send_head) &&
470 (skb != (struct sk_buff *)&sk->write_queue)) {
471 /* The retransmission queue is always in order, so
472 * we can short-circuit the walk early.
474 if(after(TCP_SKB_CB(skb)->seq, end_seq))
475 break;
477 /* We play conservative, we don't allow SACKS to partially
478 * tag a sequence space.
480 fack_count++;
481 if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
482 !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
483 /* If this was a retransmitted frame, account for it. */
484 if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
485 tp->retrans_out)
486 tp->retrans_out--;
487 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
489 /* RULE: All new SACKs will either decrease retrans_out
490 * or advance fackets_out.
492 if(fack_count > tp->fackets_out)
493 tp->fackets_out = fack_count;
495 skb = skb->next;
497 sp++; /* Move on to the next SACK block. */
501 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
502 * But, this can also be called on packets in the established flow when
503 * the fast version below fails.
505 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
507 unsigned char *ptr;
508 int length=(th->doff*4)-sizeof(struct tcphdr);
510 ptr = (unsigned char *)(th + 1);
511 tp->saw_tstamp = 0;
513 while(length>0) {
514 int opcode=*ptr++;
515 int opsize;
517 switch (opcode) {
518 case TCPOPT_EOL:
519 return;
520 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
521 length--;
522 continue;
523 default:
524 opsize=*ptr++;
525 if (opsize < 2) /* "silly options" */
526 return;
527 if (opsize > length)
528 break; /* don't parse partial options */
529 switch(opcode) {
530 case TCPOPT_MSS:
531 if(opsize==TCPOLEN_MSS && th->syn) {
532 u16 in_mss = ntohs(*(__u16 *)ptr);
533 if (in_mss) {
534 if (tp->user_mss && tp->user_mss < in_mss)
535 in_mss = tp->user_mss;
536 tp->mss_clamp = in_mss;
539 break;
540 case TCPOPT_WINDOW:
541 if(opsize==TCPOLEN_WINDOW && th->syn)
542 if (!no_fancy && sysctl_tcp_window_scaling) {
543 tp->wscale_ok = 1;
544 tp->snd_wscale = *(__u8 *)ptr;
545 if(tp->snd_wscale > 14) {
546 if(net_ratelimit())
547 printk("tcp_parse_options: Illegal window "
548 "scaling value %d >14 received.",
549 tp->snd_wscale);
550 tp->snd_wscale = 14;
553 break;
554 case TCPOPT_TIMESTAMP:
555 if(opsize==TCPOLEN_TIMESTAMP) {
556 if (sysctl_tcp_timestamps && !no_fancy) {
557 tp->tstamp_ok = 1;
558 tp->saw_tstamp = 1;
559 tp->rcv_tsval = ntohl(*(__u32 *)ptr);
560 tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
563 break;
564 case TCPOPT_SACK_PERM:
565 if(opsize==TCPOLEN_SACK_PERM && th->syn) {
566 if (sysctl_tcp_sack && !no_fancy) {
567 tp->sack_ok = 1;
568 tp->num_sacks = 0;
571 break;
573 case TCPOPT_SACK:
574 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
575 sysctl_tcp_sack && (sk != NULL) && !th->syn) {
576 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
578 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
579 int num_sacks = sack_bytes >> 3;
580 struct tcp_sack_block *sackp;
582 sackp = (struct tcp_sack_block *)ptr;
583 tcp_sacktag_write_queue(sk, sackp, num_sacks);
587 ptr+=opsize-2;
588 length-=opsize;
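/* For reference, the option block walked above is a sequence of
 * (kind, length, value) fields, with EOL (0) and NOP (1) being the only
 * single-byte kinds.  An illustrative 20-byte SYN option block with all
 * the fancy options enabled (layout is an example, not a quote from any
 * capture):
 *
 *	02 04 05 b4		MSS = 1460
 *	04 02			SACK permitted
 *	08 0a <tsval> <tsecr>	timestamps (10 bytes)
 *	01			NOP padding
 *	03 03 00		window scale = 0
 *
 * The opsize < 2 and opsize > length checks above exist precisely so a
 * bogus length byte in such a block cannot run the pointer off the end
 * of the header.
 */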
593 /* Fast parse options. This hopes to only see timestamps.
594 * If it is wrong it falls back on tcp_parse_options().
596 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
598 /* If we didn't send out any options ignore them all. */
599 if (tp->tcp_header_len == sizeof(struct tcphdr))
600 return 0;
601 if (th->doff == sizeof(struct tcphdr)>>2) {
602 tp->saw_tstamp = 0;
603 return 0;
604 } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
605 __u32 *ptr = (__u32 *)(th + 1);
606 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
607 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
608 tp->saw_tstamp = 1;
609 ++ptr;
610 tp->rcv_tsval = ntohl(*ptr);
611 ++ptr;
612 tp->rcv_tsecr = ntohl(*ptr);
613 return 1;
616 tcp_parse_options(sk, th, tp, 0);
617 return 1;
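/* The 32-bit constant tested above is the aligned timestamp prefix that
 * the Linux sender itself emits: two NOPs followed by the timestamp kind
 * and length,
 *
 *	(TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 *	(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP
 *	  = (1 << 24) | (1 << 16) | (8 << 8) | 10  =  0x0101080a
 *
 * so a header whose doff is exactly sizeof(struct tcphdr) + 12 bytes and
 * whose first option word matches this pattern carries only timestamps
 * and can skip the generic parser.
 */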
620 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
621 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
622 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
623 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
624 #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged the SYN. */
626 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
628 if (tp->dup_acks > 3)
629 tp->snd_cwnd = (tp->snd_ssthresh);
631 tp->dup_acks = 0;
634 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
635 * retransmit timer fires.
637 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
639 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
641 /* Note: If not_dup is set this implies we got a
642 * data carrying packet or a window update.
643 * This carries no new information about possible
644 * lost packets, so we have to ignore it for the purposes
645 * of counting duplicate acks. Ideally this does not imply we
646 * should stop our fast retransmit phase, more acks may come
647 * later without data to help us. Unfortunately this would make
648 * the code below much more complex. For now if I see such
649 * a packet I clear the fast retransmit phase.
651 if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
652 /* This is the standard reno style fast retransmit branch. */
654 /* 1. When the third duplicate ack is received, set ssthresh
655 * to one half the current congestion window, but no less
656 * than two segments. Retransmit the missing segment.
658 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
659 tp->dup_acks++;
660 if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
661 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
662 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
663 tp->snd_ssthresh = tp->snd_cwnd_clamp;
664 tp->snd_cwnd = (tp->snd_ssthresh + 3);
665 tp->high_seq = tp->snd_nxt;
666 if(!tp->fackets_out)
667 tcp_retransmit_skb(sk,
668 skb_peek(&sk->write_queue));
669 else
670 tcp_fack_retransmit(sk);
671 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
673 } else if (++tp->dup_acks > 3) {
674 /* 2. Each time another duplicate ACK arrives, increment
675 * cwnd by the segment size. [...] Transmit a packet...
677 * Packet transmission will be done on normal flow processing
678 * since we're not in "retransmit mode". We do not use
679 * duplicate ACKs to artificially inflate the congestion
680 * window when doing FACK.
682 if(!tp->fackets_out) {
683 tp->snd_cwnd++;
684 } else {
685 /* Fill any further holes which may have
686 * appeared.
688 * We may want to change this to run every
689 * further multiple-of-3 dup ack increments,
690 * to be more robust against out-of-order
691 * packet delivery. -DaveM
693 tcp_fack_retransmit(sk);
696 } else if (tp->high_seq != 0) {
697 /* In this branch we deal with clearing the Floyd style
698 * block on duplicate fast retransmits, and if requested
699 * we do Hoe style secondary fast retransmits.
701 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
702 /* Once we have acked all the packets up to high_seq
703 * we are done this fast retransmit phase.
704 * Alternatively, data arrived. In this case we
705 * have to abort the fast retransmit attempt.
706 * Note that we do want to accept a window
707 * update since this is expected with Hoe's algorithm.
709 clear_fast_retransmit(tp);
711 /* After we have cleared up to high_seq we can
712 * clear the Floyd style block.
714 if (!before(ack, tp->high_seq)) {
715 tp->high_seq = 0;
716 tp->fackets_out = 0;
718 } else if (tp->dup_acks >= 3) {
719 if (!tp->fackets_out) {
720 /* Hoe Style. We didn't ack the whole
721 * window. Take this as a cue that
722 * another packet was lost and retransmit it.
723 * Don't muck with the congestion window here.
724 * Note that we have to be careful not to
725 * act if this was a window update and it
726 * didn't ack new data, since this does
727 * not indicate a packet left the system.
728 * We can test this by just checking
729 * if ack changed from snd_una, since
730 * the only way to get here without advancing
731 * from snd_una is if this was a window update.
733 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
734 tcp_retransmit_skb(sk,
735 skb_peek(&sk->write_queue));
736 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
738 } else {
739 /* FACK style, fill any remaining holes in
740 * receiver's queue.
742 tcp_fack_retransmit(sk);
748 /* This is Jacobson's slow start and congestion avoidance.
749 * SIGCOMM '88, p. 328.
751 static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
753 if (tp->snd_cwnd <= tp->snd_ssthresh) {
754 /* In "safe" area, increase. */
755 tp->snd_cwnd++;
756 } else {
757 /* In dangerous area, increase slowly.
758 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
760 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
761 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
762 tp->snd_cwnd++;
763 tp->snd_cwnd_cnt=0;
764 } else
765 tp->snd_cwnd_cnt++;
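/* A small worked example (illustrative numbers): with snd_ssthresh = 8
 * and one ACK per segment,
 *
 *	slow start:		cwnd 1 -> 2 -> 4 -> 8	(+1 per ACK, i.e.
 *				roughly doubling each round trip)
 *	congestion avoidance:	snd_cwnd_cnt counts ACKs and cwnd grows by
 *				one only when the count reaches cwnd, i.e.
 *				cwnd 8 -> 9 -> 10 ...  about +1 per RTT
 *
 * which is the linear approximation of cwnd += 1/cwnd mentioned in the
 * comment above, bounded by snd_cwnd_clamp.
 */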
769 /* Remove acknowledged frames from the retransmission queue. */
770 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
771 __u32 *seq, __u32 *seq_rtt)
773 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
774 struct sk_buff *skb;
775 __u32 now = tcp_time_stamp;
776 int acked = 0;
778 /* If we are retransmitting, and this ACK clears up to
779 * the retransmit head, or further, then clear our state.
781 if (tp->retrans_head != NULL &&
782 !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
783 tp->retrans_head = NULL;
785 while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
786 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
787 __u8 sacked = scb->sacked;
789 /* If our packet is before the ack sequence we can
790 * discard it as it's confirmed to have arrived at
791 * the other end.
793 if (after(scb->end_seq, ack))
794 break;
796 /* Initial outgoing SYN's get put onto the write_queue
797 * just like anything else we transmit. It is not
798 * true data, and if we misinform our callers that
799 * this ACK acks real data, we will erroneously exit
800 * connection startup slow start one packet too
801 * quickly. This is severely frowned upon behavior.
803 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
804 tp->retrans_out--;
805 if(!(scb->flags & TCPCB_FLAG_SYN)) {
806 acked |= FLAG_DATA_ACKED;
807 if(sacked & TCPCB_SACKED_RETRANS)
808 acked |= FLAG_RETRANS_DATA_ACKED;
809 if(tp->fackets_out)
810 tp->fackets_out--;
811 } else {
812 acked |= FLAG_SYN_ACKED;
813 /* This is pure paranoia. */
814 tp->retrans_head = NULL;
816 tp->packets_out--;
817 *seq = scb->seq;
818 *seq_rtt = now - scb->when;
819 __skb_unlink(skb, skb->list);
820 kfree_skb(skb);
822 return acked;
825 static void tcp_ack_probe(struct sock *sk, __u32 ack)
827 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
829 /* Our probe was answered. */
830 tp->probes_out = 0;
832 /* Did the ack open a usable window? */
834 /* should always be non-null */
835 if (tp->send_head != NULL &&
836 !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
837 tp->backoff = 0;
838 tp->pending = 0;
839 tcp_clear_xmit_timer(sk, TIME_PROBE0);
840 } else {
841 tcp_reset_xmit_timer(sk, TIME_PROBE0,
842 min(tp->rto << tp->backoff, 120*HZ));
846 /* Should we open up the congestion window? */
847 static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
849 /* Data must have been acked. */
850 if ((flag & FLAG_DATA_ACKED) == 0)
851 return 0;
853 /* Some of the data acked was retransmitted somehow? */
854 if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
855 /* We advance in all cases except during
856 * non-FACK fast retransmit/recovery.
858 if (tp->fackets_out != 0 ||
859 tp->retransmits != 0)
860 return 1;
862 /* Non-FACK fast retransmit does its own
863 * congestion window management, don't get
864 * in the way.
866 return 0;
869 /* New non-retransmitted data acked, always advance. */
870 return 1;
873 /* Read draft-ietf-tcplw-high-performance before mucking
874 * with this code. (Supersedes RFC1323)
876 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
877 u32 seq, u32 ack, int flag)
879 __u32 seq_rtt;
881 /* RTTM Rule: A TSecr value received in a segment is used to
882 * update the averaged RTT measurement only if the segment
883 * acknowledges some new data, i.e., only if it advances the
884 * left edge of the send window.
886 * See draft-ietf-tcplw-high-performance-00, section 3.3.
887 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
889 if (!(flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)))
890 return;
892 seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
893 tcp_rtt_estimator(tp, seq_rtt);
894 if (tp->retransmits) {
895 if (tp->packets_out == 0) {
896 tp->retransmits = 0;
897 tp->fackets_out = 0;
898 tp->retrans_out = 0;
899 tp->backoff = 0;
900 tcp_set_rto(tp);
901 } else {
902 /* Still retransmitting, use backoff */
903 tcp_set_rto(tp);
904 tp->rto = tp->rto << tp->backoff;
906 } else {
907 tcp_set_rto(tp);
910 tcp_bound_rto(tp);
913 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
915 struct sk_buff *skb = skb_peek(&sk->write_queue);
917 /* Some data was ACK'd, if still retransmitting (due to a
918 * timeout), resend more of the retransmit queue. The
919 * congestion window is handled properly by that code.
921 if (tp->retransmits) {
922 tcp_xmit_retransmit_queue(sk);
923 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
924 } else {
925 __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
926 if ((__s32)when < 0)
927 when = 1;
928 tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
932 /* This routine deals with incoming acks, but not outgoing ones. */
933 static int tcp_ack(struct sock *sk, struct tcphdr *th,
934 u32 ack_seq, u32 ack, int len)
936 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
937 int flag = 0;
938 u32 seq = 0;
939 u32 seq_rtt = 0;
941 if(sk->zapped)
942 return(1); /* Dead, can't ack any more so why bother */
944 if (tp->pending == TIME_KEEPOPEN)
945 tp->probes_out = 0;
947 tp->rcv_tstamp = tcp_time_stamp;
949 /* If the ack is newer than sent or older than previous acks
950 * then we can probably ignore it.
952 if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
953 goto uninteresting_ack;
955 /* If the segment carries data, set FLAG_DATA. */
956 if (len != th->doff*4) {
957 flag |= FLAG_DATA;
958 tcp_delack_estimator(tp);
961 /* Update our send window. */
963 /* This is the window update code as per RFC 793
964 * snd_wl{1,2} are used to prevent unordered
965 * segments from shrinking the window
967 if (before(tp->snd_wl1, ack_seq) ||
968 (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
969 u32 nwin = ntohs(th->window) << tp->snd_wscale;
971 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
972 flag |= FLAG_WIN_UPDATE;
973 tp->snd_wnd = nwin;
975 tp->snd_wl1 = ack_seq;
976 tp->snd_wl2 = ack;
978 if (nwin > tp->max_window)
979 tp->max_window = nwin;
983 /* We passed data and got it acked, remove any soft error
984 * log. Something worked...
986 sk->err_soft = 0;
988 /* If this ack opens up a zero window, clear backoff. It was
989 * being used to time the probes, and is probably far higher than
990 * it needs to be for normal retransmission.
992 if (tp->pending == TIME_PROBE0)
993 tcp_ack_probe(sk, ack);
995 /* See if we can take anything off of the retransmit queue. */
996 flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
998 /* We must do this here, before code below clears out important
999 * state contained in tp->fackets_out and tp->retransmits. -DaveM
1001 if (should_advance_cwnd(tp, flag))
1002 tcp_cong_avoid(tp);
1004 /* If we have a timestamp, we always do rtt estimates. */
1005 if (tp->saw_tstamp) {
1006 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
1007 } else {
1008 /* If we were retransmitting, don't count the rtt estimate. */
1009 if (tp->retransmits) {
1010 if (tp->packets_out == 0) {
1011 tp->retransmits = 0;
1012 tp->fackets_out = 0;
1013 tp->retrans_out = 0;
1015 } else {
1016 /* We don't have a timestamp. Can only use
1017 * packets that are not retransmitted to determine
1018 * rtt estimates. Also, we must not reset the
1019 * backoff for rto until we get a non-retransmitted
1020 * packet. This allows us to deal with a situation
1021 * where the network delay has increased suddenly.
1022 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
1024 if (flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)) {
1025 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
1026 tp->backoff = 0;
1027 tcp_rtt_estimator(tp, seq_rtt);
1028 tcp_set_rto(tp);
1029 tcp_bound_rto(tp);
1035 if (tp->packets_out) {
1036 if (flag & FLAG_DATA_ACKED)
1037 tcp_ack_packets_out(sk, tp);
1038 } else {
1039 tcp_clear_xmit_timer(sk, TIME_RETRANS);
1042 flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
1043 if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
1044 (tp->high_seq != 0)) {
1045 tcp_fast_retrans(sk, ack, flag);
1046 } else {
1047 /* Clear any aborted fast retransmit starts. */
1048 tp->dup_acks = 0;
1050 /* It is not a brain fart, I thought a bit now. 8)
1052 * Forward progress is indicated, if:
1053 * 1. the ack acknowledges new data.
1054 * 2. or the ack is duplicate, but it is caused by new segment
1055 * arrival. This case is filtered by:
1056 * - it contains no data, syn or fin.
1057 * - it does not update window.
1058 * 3. or new SACK. It is difficult to check, so that we ignore it.
1060 * Forward progress is also indicated by the arrival of new data,
1061 * which was caused by a window open from our side. This case is more
1062 * difficult and it is handled (alas, incorrectly) in tcp_data_queue().
1063 * --ANK (990513)
1065 if (ack != tp->snd_una || (flag == 0 && !th->fin))
1066 dst_confirm(sk->dst_cache);
1068 /* Remember the highest ack received. */
1069 tp->snd_una = ack;
1070 return 1;
1072 uninteresting_ack:
1073 SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
1074 return 0;
1077 /* New-style handling of TIME_WAIT sockets. */
1079 /* Must be called only from BH context. */
1080 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
1082 struct tcp_ehash_bucket *ehead;
1083 struct tcp_bind_hashbucket *bhead;
1084 struct tcp_bind_bucket *tb;
1086 /* Unlink from established hashes. */
1087 ehead = &tcp_ehash[tw->hashent];
1088 write_lock(&ehead->lock);
1089 if (!tw->pprev) {
1090 write_unlock(&ehead->lock);
1091 return;
1093 if(tw->next)
1094 tw->next->pprev = tw->pprev;
1095 *(tw->pprev) = tw->next;
1096 tw->pprev = NULL;
1097 write_unlock(&ehead->lock);
1099 /* Disassociate with bind bucket. */
1100 bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
1101 spin_lock(&bhead->lock);
1102 if ((tb = tw->tb) != NULL) {
1103 if(tw->bind_next)
1104 tw->bind_next->bind_pprev = tw->bind_pprev;
1105 *(tw->bind_pprev) = tw->bind_next;
1106 tw->tb = NULL;
1107 if (tb->owners == NULL) {
1108 if (tb->next)
1109 tb->next->pprev = tb->pprev;
1110 *(tb->pprev) = tb->next;
1111 kmem_cache_free(tcp_bucket_cachep, tb);
1114 spin_unlock(&bhead->lock);
1116 #ifdef INET_REFCNT_DEBUG
1117 if (atomic_read(&tw->refcnt) != 1) {
1118 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
1120 #endif
1121 tcp_tw_put(tw);
1124 /* We come here as a special case from the AF specific TCP input processing,
1125 * and the SKB has no owner. Essentially handling this is very simple,
1126 * we just keep silently eating rx'd packets until none show up for the
1127 * entire timeout period. The only special cases are for BSD TIME_WAIT
1128 * reconnects and SYN/RST bits being set in the TCP header.
1132 * * Main purpose of TIME-WAIT state is to close connection gracefully,
1133 * when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN
1134 * (and, probably, a tail of data) and one or more of our ACKs are lost.
1135 * * What is TIME-WAIT timeout? It is associated with maximal packet
1136 * lifetime in the internet, which leads to the wrong conclusion that
1137 * it is set to catch "old duplicate segments" wandering out of their path.
1138 * That is not quite correct. This timeout is calculated so that it exceeds
1139 * the maximal retransmission timeout by enough to allow one (or more)
1140 * segments sent by the peer, and our ACKs, to be lost. This time may be calculated from the RTO.
1141 * * When TIME-WAIT socket receives RST, it means that another end
1142 * finally closed and we are allowed to kill TIME-WAIT too.
1143 * * Second purpose of TIME-WAIT is catching old duplicate segments.
1144 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
1145 * with these semantics, we MUST NOT kill TIME-WAIT state with RSTs.
1146 * * If we invented some more clever way to catch duplicates
1147 * (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
1149 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
1150 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
1151 * from the very beginning.
1153 enum tcp_tw_status
1154 tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
1155 struct tcphdr *th, unsigned len)
1157 struct tcp_opt tp;
1158 int paws_reject = 0;
1160 /* RFC 1122:
1161 * "When a connection is [...] on TIME-WAIT state [...]
1162 * [a TCP] MAY accept a new SYN from the remote TCP to
1163 * reopen the connection directly, if it:
1165 * (1) assigns its initial sequence number for the new
1166 * connection to be larger than the largest sequence
1167 * number it used on the previous connection incarnation,
1168 * and
1170 * (2) returns to TIME-WAIT state if the SYN turns out
1171 * to be an old duplicate".
1174 tp.saw_tstamp = 0;
1175 if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
1176 tcp_parse_options(NULL, th, &tp, 0);
1178 paws_reject = tp.saw_tstamp &&
1179 ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 &&
1180 xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS);
1183 if (!paws_reject &&
1184 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
1185 TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
1186 /* In window segment, it may be only reset or bare ack. */
1188 if (th->rst) {
1189 #ifdef CONFIG_TCP_TW_RECYCLE
1190 /* When recycling, always follow rfc1337,
1191 * but mark the bucket as ready for recycling immediately.
1193 if (sysctl_tcp_tw_recycle) {
1194 /* May kill it now. */
1195 tw->rto = 0;
1196 tw->ttd = jiffies;
1197 } else
1198 #endif
1199 /* This is TIME_WAIT assassination, in two flavors.
1200 * Oh well... nobody has a sufficient solution to this
1201 * protocol bug yet.
1203 if(sysctl_tcp_rfc1337 == 0) {
1204 tcp_tw_deschedule(tw);
1205 tcp_timewait_kill(tw);
1207 } else {
1208 tcp_tw_reschedule(tw);
1211 if (tp.saw_tstamp) {
1212 tw->ts_recent = tp.rcv_tsval;
1213 tw->ts_recent_stamp = xtime.tv_sec;
1215 tcp_tw_put(tw);
1216 return TCP_TW_SUCCESS;
1219 /* Out of window segment.
1221 All the segments are ACKed immediately.
1223 The only exception is a new SYN. We accept it if it is
1224 not an old duplicate and we are not in danger of being killed
1225 by delayed old duplicates. The RFC check (that it carries a
1226 newer sequence number) works at rates <40Mbit/sec.
1227 However, if PAWS works, it is reliable, and what is more,
1228 we may even relax the silly seq space cutoff.
1230 RED-PEN: we violate the main RFC requirement: if this SYN turns out to be an
1231 old duplicate (i.e. we receive an RST in reply to our SYN-ACK),
1232 we must return the socket to time-wait state. It is not good,
1233 but not fatal yet.
1236 if (th->syn && !th->rst && !th->ack && !paws_reject &&
1237 (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
1238 (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) {
1239 u32 isn = tw->snd_nxt + 2;
1240 if (isn == 0)
1241 isn++;
1242 TCP_SKB_CB(skb)->when = isn;
1243 return TCP_TW_SYN;
1246 if(!th->rst) {
1247 /* In this case we must reset the TIMEWAIT timer.
1249 If it is an ACKless SYN it may be either an old duplicate
1250 or a new good SYN with a random sequence number <rcv_nxt.
1251 Do not reschedule in the latter case.
1253 if (paws_reject || th->ack) {
1254 tcp_tw_reschedule(tw);
1255 #ifdef CONFIG_TCP_TW_RECYCLE
1256 tw->rto = min(120*HZ, tw->rto<<1);
1257 tw->ttd = jiffies + tw->rto;
1258 #endif
1261 /* Send ACK. Note, we do not put the bucket,
1262 * it will be released by caller.
1264 return TCP_TW_ACK;
1266 tcp_tw_put(tw);
1267 return TCP_TW_SUCCESS;
1270 /* Enter the time wait state. This is always called from BH
1271 * context. Essentially we whip up a timewait bucket, copy the
1272 * relevant info into it from the SK, and mess with hash chains
1273 * and list linkage.
1275 static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
1277 struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
1278 struct tcp_bind_hashbucket *bhead;
1279 struct sock **head, *sktw;
1281 write_lock(&ehead->lock);
1283 /* Step 1: Remove SK from established hash. */
1284 if (sk->pprev) {
1285 if(sk->next)
1286 sk->next->pprev = sk->pprev;
1287 *sk->pprev = sk->next;
1288 sk->pprev = NULL;
1291 /* Step 2: Hash TW into TIMEWAIT half of established hash table. */
1292 head = &(ehead + tcp_ehash_size)->chain;
1293 sktw = (struct sock *)tw;
1294 if((sktw->next = *head) != NULL)
1295 (*head)->pprev = &sktw->next;
1296 *head = sktw;
1297 sktw->pprev = head;
1298 atomic_inc(&tw->refcnt);
1300 write_unlock(&ehead->lock);
1302 /* Step 3: Put TW into bind hash. Original socket stays there too.
1303 Note that any socket with sk->num!=0 MUST be bound in the binding
1304 cache, even if it is closed.
1306 bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
1307 spin_lock(&bhead->lock);
1308 tw->tb = (struct tcp_bind_bucket *)sk->prev;
1309 BUG_TRAP(sk->prev!=NULL);
1310 if ((tw->bind_next = tw->tb->owners) != NULL)
1311 tw->tb->owners->bind_pprev = &tw->bind_next;
1312 tw->tb->owners = (struct sock*)tw;
1313 tw->bind_pprev = &tw->tb->owners;
1314 spin_unlock(&bhead->lock);
1316 /* Step 4: Un-charge protocol socket in-use count. */
1317 sk->prot->inuse--;
1321 * Move a socket to time-wait.
1323 void tcp_time_wait(struct sock *sk)
1325 struct tcp_tw_bucket *tw;
1327 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
1328 if(tw != NULL) {
1329 /* Give us an identity. */
1330 tw->daddr = sk->daddr;
1331 tw->rcv_saddr = sk->rcv_saddr;
1332 tw->bound_dev_if= sk->bound_dev_if;
1333 tw->num = sk->num;
1334 tw->state = TCP_TIME_WAIT;
1335 tw->sport = sk->sport;
1336 tw->dport = sk->dport;
1337 tw->family = sk->family;
1338 tw->reuse = sk->reuse;
1339 tw->hashent = sk->hashent;
1340 tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
1341 tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt;
1342 tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent;
1343 tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp;
1344 #ifdef CONFIG_TCP_TW_RECYCLE
1345 tw->rto = sk->tp_pinfo.af_tcp.rto;
1346 tw->ttd = jiffies + 2*tw->rto;
1347 #endif
1348 atomic_set(&tw->refcnt, 0);
1350 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1351 if(tw->family == PF_INET6) {
1352 memcpy(&tw->v6_daddr,
1353 &sk->net_pinfo.af_inet6.daddr,
1354 sizeof(struct in6_addr));
1355 memcpy(&tw->v6_rcv_saddr,
1356 &sk->net_pinfo.af_inet6.rcv_saddr,
1357 sizeof(struct in6_addr));
1359 #endif
1360 /* Linkage updates. */
1361 __tcp_tw_hashdance(sk, tw);
1363 /* Get the TIME_WAIT timeout firing. */
1364 tcp_tw_schedule(tw);
1366 /* CLOSE the SK. */
1367 if(sk->state == TCP_ESTABLISHED)
1368 tcp_statistics.TcpCurrEstab--;
1369 sk->state = TCP_CLOSE;
1370 } else {
1371 /* Sorry, we're out of memory, just CLOSE this
1372 * socket up. We've got bigger problems than
1373 * non-graceful socket closings.
1375 tcp_set_state(sk, TCP_CLOSE);
1378 tcp_update_metrics(sk);
1379 tcp_clear_xmit_timers(sk);
1380 tcp_done(sk);
1384 * Process the FIN bit. This now behaves as it is supposed to work
1385 * and the FIN takes effect only when it is validly part of the sequence
1386 * space, not before, while there are still holes.
1388 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1389 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1390 * TIME-WAIT)
1392 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1393 * close and we go into CLOSING (and later onto TIME-WAIT)
1395 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1398 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1400 sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1402 tcp_send_ack(sk);
1404 if (!sk->dead) {
1405 wake_up_interruptible(sk->sleep);
1406 sock_wake_async(sk->socket, 1);
1409 switch(sk->state) {
1410 case TCP_SYN_RECV:
1411 case TCP_ESTABLISHED:
1412 /* Move to CLOSE_WAIT */
1413 tcp_set_state(sk, TCP_CLOSE_WAIT);
1414 break;
1416 case TCP_CLOSE_WAIT:
1417 case TCP_CLOSING:
1418 /* Received a retransmission of the FIN, do
1419 * nothing.
1421 break;
1422 case TCP_LAST_ACK:
1423 /* RFC793: Remain in the LAST-ACK state. */
1424 break;
1426 case TCP_FIN_WAIT1:
1427 /* This case occurs when a simultaneous close
1428 * happens, we must ack the received FIN and
1429 * enter the CLOSING state.
1431 tcp_set_state(sk, TCP_CLOSING);
1432 break;
1433 case TCP_FIN_WAIT2:
1434 /* Received a FIN -- send ACK and enter TIME_WAIT. */
1435 tcp_time_wait(sk);
1436 break;
1437 default:
1438 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1439 * cases we should never reach this piece of code.
1441 printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1442 break;
1446 /* These routines update the SACK block as out-of-order packets arrive or
1447 * in-order packets close up the sequence space.
1449 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1451 int this_sack, num_sacks = tp->num_sacks;
1452 struct tcp_sack_block *swalk = &tp->selective_acks[0];
1454 /* If more than one SACK block, see if the recent change to SP eats into
1455 * or hits the sequence space of other SACK blocks, if so coalesce.
1457 if(num_sacks != 1) {
1458 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1459 if(swalk == sp)
1460 continue;
1462 /* First case, bottom of SP moves into top of the
1463 * sequence space of SWALK.
1465 if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1466 sp->start_seq = swalk->start_seq;
1467 goto coalesce;
1469 /* Second case, top of SP moves into bottom of the
1470 * sequence space of SWALK.
1472 if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1473 sp->end_seq = swalk->end_seq;
1474 goto coalesce;
1478 /* SP is the only SACK, or no coalescing cases found. */
1479 return;
1481 coalesce:
1482 /* Zap SWALK, by moving every further SACK up by one slot.
1483 * Decrease num_sacks.
1485 for(; this_sack < num_sacks-1; this_sack++, swalk++) {
1486 struct tcp_sack_block *next = (swalk + 1);
1487 swalk->start_seq = next->start_seq;
1488 swalk->end_seq = next->end_seq;
1490 tp->num_sacks--;
1493 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1495 __u32 tmp;
1497 tmp = sack1->start_seq;
1498 sack1->start_seq = sack2->start_seq;
1499 sack2->start_seq = tmp;
1501 tmp = sack1->end_seq;
1502 sack1->end_seq = sack2->end_seq;
1503 sack2->end_seq = tmp;
1506 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1508 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1509 struct tcp_sack_block *sp = &tp->selective_acks[0];
1510 int cur_sacks = tp->num_sacks;
1512 if (!cur_sacks)
1513 goto new_sack;
1515 /* Optimize for the common case, new ofo frames arrive
1516 * "in order". ;-) This also satisfies the requirements
1517 * of RFC2018 about ordering of SACKs.
1519 if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1520 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1521 tcp_sack_maybe_coalesce(tp, sp);
1522 } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1523 /* Re-ordered arrival, in this case, can be optimized
1524 * as well.
1526 sp->start_seq = TCP_SKB_CB(skb)->seq;
1527 tcp_sack_maybe_coalesce(tp, sp);
1528 } else {
1529 struct tcp_sack_block *swap = sp + 1;
1530 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1532 /* Oh well, we have to move things around.
1533 * Try to find a SACK we can tack this onto.
1536 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1537 if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1538 (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1539 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1540 swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1541 else
1542 swap->start_seq = TCP_SKB_CB(skb)->seq;
1543 tcp_sack_swap(sp, swap);
1544 tcp_sack_maybe_coalesce(tp, sp);
1545 return;
1549 /* Could not find an adjacent existing SACK, build a new one,
1550 * put it at the front, and shift everyone else down. We
1551 * always know there is at least one SACK present already here.
1553 * If the sack array is full, forget about the last one.
1555 if (cur_sacks >= max_sacks) {
1556 cur_sacks--;
1557 tp->num_sacks--;
1559 while(cur_sacks >= 1) {
1560 struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1561 struct tcp_sack_block *prev = (this - 1);
1562 this->start_seq = prev->start_seq;
1563 this->end_seq = prev->end_seq;
1564 cur_sacks--;
1567 new_sack:
1568 /* Build the new head SACK, and we're done. */
1569 sp->start_seq = TCP_SKB_CB(skb)->seq;
1570 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1571 tp->num_sacks++;
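/* Where the (tp->tstamp_ok ? 3 : 4) limit above comes from (a sketch of
 * the option-space arithmetic, not a quote from the code): a TCP header
 * has at most 40 bytes of options, and the SACK option costs 2 bytes of
 * kind/length plus 8 bytes per block.  Without timestamps that leaves
 * room for (40 - 2) / 8 = 4 blocks; with the 12-byte aligned timestamp
 * option only (40 - 12 - 2) / 8 = 3 blocks fit, which also matches the
 * RFC 2018 guidance.
 */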
1575 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1577 struct tcp_sack_block *sp = &tp->selective_acks[0];
1578 int num_sacks = tp->num_sacks;
1579 int this_sack;
1581 /* This is an in order data segment _or_ an out-of-order SKB being
1582 * moved to the receive queue, so we know this removed SKB will eat
1583 * from the front of a SACK.
1585 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1586 /* Check if the start of the sack is covered by skb. */
1587 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1588 before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1589 break;
1592 /* This should only happen if so many SACKs get built that some get
1593 * pushed out before we get here, or we eat some in sequence packets
1594 * which are before the first SACK block.
1596 if(this_sack >= num_sacks)
1597 return;
1599 sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1600 if(!before(sp->start_seq, sp->end_seq)) {
1601 /* Zap this SACK, by moving forward any other SACKS. */
1602 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1603 struct tcp_sack_block *next = (sp + 1);
1604 sp->start_seq = next->start_seq;
1605 sp->end_seq = next->end_seq;
1607 tp->num_sacks--;
1611 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1613 struct tcp_sack_block *sp = &tp->selective_acks[0];
1614 int num_sacks = tp->num_sacks;
1615 int this_sack;
1617 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1618 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1619 break;
1621 if(this_sack >= num_sacks)
1622 return;
1623 sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1626 /* This one checks to see if we can put data from the
1627 * out_of_order queue into the receive_queue.
1629 static void tcp_ofo_queue(struct sock *sk)
1631 struct sk_buff *skb;
1632 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1634 while ((skb = skb_peek(&tp->out_of_order_queue))) {
1635 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1636 break;
1638 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1639 SOCK_DEBUG(sk, "ofo packet was already received \n");
1640 __skb_unlink(skb, skb->list);
1641 kfree_skb(skb);
1642 continue;
1644 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1645 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1646 TCP_SKB_CB(skb)->end_seq);
1648 if(tp->sack_ok)
1649 tcp_sack_remove_skb(tp, skb);
1650 __skb_unlink(skb, skb->list);
1651 __skb_queue_tail(&sk->receive_queue, skb);
1652 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1653 if(skb->h.th->fin)
1654 tcp_fin(skb, sk, skb->h.th);
1658 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1660 struct sk_buff *skb1;
1661 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1663 /* Queue data for delivery to the user.
1664 * Packets in sequence go to the receive queue.
1665 * Out of sequence packets to the out_of_order_queue.
1667 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1668 /* Ok. In sequence. */
1669 queue_and_out:
1670 dst_confirm(sk->dst_cache);
1671 __skb_queue_tail(&sk->receive_queue, skb);
1672 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1673 if(skb->h.th->fin) {
1674 tcp_fin(skb, sk, skb->h.th);
1675 } else {
1676 tcp_remember_ack(tp, skb->h.th, skb);
1678 /* This may have eaten into a SACK block. */
1679 if(tp->sack_ok && tp->num_sacks)
1680 tcp_sack_remove_skb(tp, skb);
1681 tcp_ofo_queue(sk);
1683 /* Turn on fast path. */
1684 if (skb_queue_len(&tp->out_of_order_queue) == 0)
1685 tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1686 ntohl(TCP_FLAG_ACK) |
1687 tp->snd_wnd);
1688 return;
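	/* What the prediction word set above encodes (the sample value is
	 * illustrative): pred_flags mirrors the 4th 32-bit word of the TCP
	 * header we expect next, so the receive fast path can test it with
	 * a single compare.  In host order, before the htonl():
	 *
	 *	bits 28..31	data offset in 32-bit words
	 *	bit  20		ACK (ntohl(TCP_FLAG_ACK) == 0x00100000)
	 *	bits 0..15	tp->snd_wnd, the window value we expect the
	 *			peer to keep advertising
	 *
	 * e.g. a 32-byte header (timestamps) and a window of 32000 give
	 * 0x80107d00; a segment whose header word does not match is handed
	 * to the slow path instead.
	 */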
1691 /* An old packet, either a retransmit or some packet got lost. */
1692 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1693 /* A retransmit, 2nd most common case. Force an immediate ack. */
1694 SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1695 tcp_enter_quickack_mode(tp);
1696 kfree_skb(skb);
1697 return;
1700 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1701 /* Partial packet, seq < rcv_next < end_seq */
1702 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1703 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1704 TCP_SKB_CB(skb)->end_seq);
1706 goto queue_and_out;
1709 /* Ok. This is an out_of_order segment, force an ack. */
1710 tp->delayed_acks++;
1711 tcp_enter_quickack_mode(tp);
1713 /* Disable header prediction. */
1714 tp->pred_flags = 0;
1716 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1717 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1719 if (skb_peek(&tp->out_of_order_queue) == NULL) {
1720 /* Initial out of order segment, build 1 SACK. */
1721 if(tp->sack_ok) {
1722 tp->num_sacks = 1;
1723 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1724 tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1726 __skb_queue_head(&tp->out_of_order_queue,skb);
1727 } else {
1728 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1729 /* Already there. */
1730 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1731 if (skb->len >= skb1->len) {
1732 if(tp->sack_ok)
1733 tcp_sack_extend(tp, skb1, skb);
1734 __skb_append(skb1, skb);
1735 __skb_unlink(skb1, skb1->list);
1736 kfree_skb(skb1);
1737 } else {
1738 /* A duplicate, smaller than what is in the
1739 * out-of-order queue right now, toss it.
1741 kfree_skb(skb);
1743 break;
1746 if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1747 __skb_append(skb1, skb);
1748 if(tp->sack_ok)
1749 tcp_sack_new_ofo_skb(sk, skb);
1750 break;
1753 /* See if we've hit the start. If so insert. */
1754 if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1755 __skb_queue_head(&tp->out_of_order_queue,skb);
1756 if(tp->sack_ok)
1757 tcp_sack_new_ofo_skb(sk, skb);
1758 break;
1766 * This routine handles the data. If there is room in the buffer,
1767 * it will have already been moved into it. If there is no
1768 * room, then we will just have to discard the packet.
1771 static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1773 struct tcphdr *th;
1774 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1776 th = skb->h.th;
1777 skb_pull(skb, th->doff*4);
1778 skb_trim(skb, len - (th->doff*4));
1780 if (skb->len == 0 && !th->fin)
1781 return(0);
1784 * If our receive queue has grown past its limits shrink it.
1785 * Make sure to do this before moving snd_nxt, otherwise
1786 * data might be acked that we don't have enough room for.
1788 if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
1789 if (prune_queue(sk) < 0) {
1790 /* Still not enough room. That can happen when
1791 * skb->truesize differs significantly from skb->len.
1793 return 0;
1797 tcp_data_queue(sk, skb);
1799 if (before(tp->rcv_nxt, tp->copied_seq)) {
1800 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1801 tp->rcv_nxt = tp->copied_seq;
1804 /* Above, tcp_data_queue() increments delayed_acks appropriately.
1805 * Now tell the user we may have some data.
1807 if (!sk->dead) {
1808 wake_up_interruptible(sk->sleep);
1809 sock_wake_async(sk->socket,1);
1811 return(1);
1814 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1816 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1818 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1819 tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1820 /* Put more data onto the wire. */
1821 tcp_write_xmit(sk);
1822 } else if (tp->packets_out == 0 && !tp->pending) {
1823 /* Start probing the receivers window. */
1824 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1828 static __inline__ void tcp_data_snd_check(struct sock *sk)
1830 struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1832 if (skb != NULL)
1833 __tcp_data_snd_check(sk, skb);
1837 * Adapt the MSS value used to make delayed ack decision to the
1838 * real world.
1840 * The constant 536 has no particularly good meaning. In the IPv4 world
1841 * the MTU may be smaller, though that contradicts RFC1122, which
1842 * states that the MSS must be at least 536.
1843 * We use the constant so that we do not ACK every second
1844 * packet in a stream of tiny packets.
1845 * It means that super-low mtu links will be aggressively delacked.
1846 * That even seems good: if they have such a low mtu, they are weirdly
1847 * slow.
1849 * AK: BTW it may be useful to add an option to lock the rcv_mss.
1850 * This way the beowulf people wouldn't need ugly patches to get the
1851 * ack frequencies they want and it would be an elegant way to tune delack.
1853 static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1855 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1856 unsigned int len, lss;
1858 lss = tp->last_seg_size;
1859 tp->last_seg_size = 0;
1861 /* skb->len may jitter because of SACKs, even if peer
1862 * sends good full-sized frames.
1864 len = skb->len;
1865 if (len >= tp->rcv_mss) {
1866 tp->rcv_mss = len;
1867 } else {
1868 /* Otherwise, we make a more careful check, taking into account
1869 * that the SACK block is variable.
1871 * "len" is invariant segment length, including TCP header.
1873 len = skb->tail - skb->h.raw;
1874 if (len >= 536 + sizeof(struct tcphdr)) {
1875 /* Also subtract the invariant part (if the peer is RFC compliant):
1876 * the tcp header plus fixed timestamp option length.
1877 * Resulting "len" is MSS free of SACK jitter.
1879 len -= tp->tcp_header_len;
1880 if (len == lss)
1881 tp->rcv_mss = len;
1882 tp->last_seg_size = len;
1888 * Check if sending an ack is needed.
1890 static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
1892 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1894 /* This also takes care of updating the window.
1895 * This if statement needs to be simplified.
1897 * Rules for delaying an ack:
1898 * - delay time <= 0.5 HZ
1899 * - we don't have a window update to send
1900 * - must send at least every 2 full sized packets
1901 * - must send an ACK if we have any out of order data
1903 * With an extra heuristic to handle packet loss
1904 * situations, and also to help the sender leave slow
1905 * start in an expedient manner.
1908 /* Two full frames received or... */
1909 if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1910 /* We will update the window "significantly" or... */
1911 tcp_raise_window(sk) ||
1912 /* We entered "quick ACK" mode or... */
1913 tcp_in_quickack_mode(tp) ||
1914 /* We have out of order data */
1915 (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) {
1916 /* Then ack it now */
1917 tcp_send_ack(sk);
1918 } else {
1919 /* Else, send delayed ack. */
1920 tcp_send_delayed_ack(sk, HZ/2);
1924 static __inline__ void tcp_ack_snd_check(struct sock *sk)
1926 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1927 if (tp->delayed_acks == 0) {
1928 /* We sent a data segment already. */
1929 return;
1931 __tcp_ack_snd_check(sk, 1);
1936 * This routine is only called when we have urgent data
1937 * signalled. It's the 'slow' part of tcp_urg. It could be
1938 * moved inline now, as tcp_urg is only called from one
1939 * place. We handle URGent data wrong. We have to - as
1940 * BSD still doesn't use the correction from RFC961.
1941 * For 1003.1g we should support a new option TCP_STDURG to permit
1942 * either form (or just set the sysctl tcp_stdurg).
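 * Concretely: with sysctl_tcp_stdurg == 0 (the default, BSD behaviour)
 * the incoming urgent pointer is treated as pointing one byte past the
 * urgent byte, so tcp_check_urg() decrements it; with the sysctl set
 * the pointer is used as-is, pointing at the urgent byte itself.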
1945 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1947 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1948 u32 ptr = ntohs(th->urg_ptr);
1950 if (ptr && !sysctl_tcp_stdurg)
1951 ptr--;
1952 ptr += ntohl(th->seq);
1954 /* Ignore urgent data that we've already seen and read. */
1955 if (after(tp->copied_seq, ptr))
1956 return;
1958 /* Do we already have a newer (or duplicate) urgent pointer? */
1959 if (tp->urg_data && !after(ptr, tp->urg_seq))
1960 return;
1962 /* Tell the world about our new urgent pointer. */
1963 if (sk->proc != 0) {
1964 if (sk->proc > 0)
1965 kill_proc(sk->proc, SIGURG, 1);
1966 else
1967 kill_pg(-sk->proc, SIGURG, 1);
1970 /* We may be adding urgent data when the last byte read was
1971 * urgent. To do this requires some care. We cannot just ignore
1972 * tp->copied_seq since we would read the last urgent byte again
1973 * as data, nor can we alter copied_seq until this data arrives
1974 * or we break the semantics of SIOCATMARK (and thus sockatmark())
1976 if (tp->urg_seq == tp->copied_seq)
1977 tp->copied_seq++; /* Move the copied sequence on correctly */
1978 tp->urg_data = URG_NOTYET;
1979 tp->urg_seq = ptr;
1981 /* Disable header prediction. */
1982 tp->pred_flags = 0;
1985 /* This is the 'fast' part of urgent handling. */
1986 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1988 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1990 /* Check if we get a new urgent pointer - normally not. */
1991 if (th->urg)
1992 tcp_check_urg(sk,th);
1994 /* Do we wait for any urgent data? - normally not... */
1995 if (tp->urg_data == URG_NOTYET) {
1996 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
1998 /* Is the urgent pointer pointing into this packet? */
1999 if (ptr < len) {
2000 tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
2001 if (!sk->dead)
2002 sk->data_ready(sk,0);
2007 /* Clean the out_of_order queue if we can, trying to get
2008 * the socket within its memory limits again.
2010 * Return less than zero if we should start dropping frames
2011 * until the socket owning process reads some of the data
2012 * to stabilize the situation.
2014 static int prune_queue(struct sock *sk)
2016 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2017 struct sk_buff * skb;
2019 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
2021 net_statistics.PruneCalled++;
2023 /* First, purge the out_of_order queue. */
2024 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2025 if(skb != NULL) {
2026 /* Free it all. */
2027 do { net_statistics.OfoPruned += skb->len;
2028 kfree_skb(skb);
2029 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2030 } while(skb != NULL);
2032 /* Reset SACK state. A conforming SACK implementation will
2033 * do the same at a timeout-based retransmit. When a connection
2034 * is in a sad state like this, we care only about the integrity
2035 * of the connection, not performance.
2037 if(tp->sack_ok)
2038 tp->num_sacks = 0;
2041 /* If we are really being abused, tell the caller to silently
2042 * drop receive data on the floor. It will get retransmitted
2043 * and hopefully then we'll have sufficient space.
2045 * We used to try to purge the in-order packets too, but that
2046 * turns out to be deadly and fraught with races. Consider:
2048 * 1) If we acked the data, we absolutely cannot drop the
2049 * packet. This data would then never be retransmitted.
2050 * 2) It is possible, with a proper sequence of events involving
2051 * delayed acks and backlog queue handling, to have the user
2052 * read the data before it gets acked. The previous code
2053 * here got this wrong, and it led to data corruption.
2054 * 3) Too many state changes happen when the FIN arrives, so once
2055 * we've seen that we can't remove any in-order data safely.
2057 * The net result is that removing in-order receive data is too
2058 * complex for anyone's sanity. So we don't do it anymore. But
2059 * if we are really having our buffer space abused, we stop accepting
2060 * new receive data.
2062 * FIXME: it should recompute SACK state and only remove enough
2063 * buffers to get into bounds again. The current scheme loses
2064 * badly sometimes on links with large RTT, especially when
2065 * the driver has high overhead per skb.
2066 * (increasing the rcvbuf is not enough because it inflates
2067 * the window too, effectively disabling flow control) -AK
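/* The check below tolerates up to twice sk->rcvbuf of in-order data
 * before telling the caller to start dropping new segments.
 */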
2069 if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
2070 return 0;
2072 /* Massive buffer overcommit. */
2073 return -1;
2077 * TCP receive function for the ESTABLISHED state.
2079 * It is split into a fast path and a slow path. The fast path is
2080 * disabled when:
2081 * - A zero window was announced from us - zero window probing
2082 * is only handled properly in the slow path.
2083 * - Out of order segments arrived.
2084 * - Urgent data is expected.
2085 * - There is no buffer space left
2086 * - Unexpected TCP flags/window values/header lengths are received
2087 * (detected by checking the TCP header against pred_flags)
2088 * - Data is sent in both directions. Fast path only supports pure senders
2089 * or pure receivers (this means either the sequence number or the ack
2090 * value must stay constant)
2091 * - Unexpected TCP option.
2093 * When these conditions are not satisfied it drops into a standard
2094 * receive procedure patterned after RFC793 to handle all cases.
2095 * The first three cases are guaranteed by proper pred_flags setting,
2096 * the rest is checked inline. Fast processing is turned on in
2097 * tcp_data_queue when everything is OK.
2099 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
2100 struct tcphdr *th, unsigned len)
2102 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2105 * Header prediction.
2106 * The code loosely follows the one in the famous
2107 * "30 instruction TCP receive" Van Jacobson mail.
2109 * Van's trick is to deposit buffers into the socket queue
2110 * on a device interrupt, and to call the tcp_recv function
2111 * in the receiving process's context to checksum and copy
2112 * the buffer to user space. Smart...
2114 * Our current scheme is not silly either, but we take the
2115 * extra cost of the net_bh soft interrupt processing...
2116 * We also do checksum and copy, but from device to kernel.
2120 /* RED-PEN. Using static variables to pass function arguments
2121 * cannot be a good idea...
2123 tp->saw_tstamp = 0;
2125 /* pred_flags is 0xS?10 << 16 + snd_wnd
2126 * if header prediction is to be made.
2127 * 'S' will always be tp->tcp_header_len >> 2
2128 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
2129 * turn it off (when there are holes in the receive
2130 * space, for instance).
2131 * The PSH flag is ignored.
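 * For example, with a plain 20-byte header (S == 5), reserved bits
 * zero and only the ACK flag set, pred_flags would be
 * (0x5010 << 16) + snd_wnd.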
2134 if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags &&
2135 TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
2136 int tcp_header_len = th->doff*4;
2138 /* Timestamp header prediction */
2140 /* Non-standard header, e.g. SACKs -> slow path */
2141 if (tcp_header_len != tp->tcp_header_len)
2142 goto slow_path;
2144 /* Check timestamp */
2145 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
2146 __u32 *ptr = (__u32 *)(th + 1);
2148 /* No? Slow path! */
2149 if (*ptr != __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
2150 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
2151 goto slow_path;
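/* The 32-bit word tested above is the canonical aligned timestamp
 * option: NOP, NOP, kind TCPOPT_TIMESTAMP, length TCPOLEN_TIMESTAMP,
 * followed by the TSval and TSecr values read below.
 */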
2153 tp->saw_tstamp = 1;
2154 ++ptr;
2155 tp->rcv_tsval = ntohl(*ptr);
2156 ++ptr;
2157 tp->rcv_tsecr = ntohl(*ptr);
2159 /* If PAWS failed, check it more carefully in slow path */
2160 if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0)
2161 goto slow_path;
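/* The signed 32-bit difference above is the PAWS comparison done
 * modulo 2^32, so it stays correct when the peer's timestamp clock wraps.
 */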
2163 /* Predicted packet is in window by definition.
2164 seq == rcv_nxt and last_ack_sent <= rcv_nxt.
2165 Hence, check seq<=last_ack_sent reduces to:
2167 if (tp->rcv_nxt == tp->last_ack_sent) {
2168 tp->ts_recent = tp->rcv_tsval;
2169 tp->ts_recent_stamp = xtime.tv_sec;
2173 if (len <= tcp_header_len) {
2174 /* Bulk data transfer: sender */
2175 if (len == tcp_header_len) {
2176 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2177 TCP_SKB_CB(skb)->ack_seq, len);
2178 kfree_skb(skb);
2179 tcp_data_snd_check(sk);
2180 return 0;
2181 } else { /* Header too small */
2182 tcp_statistics.TcpInErrs++;
2183 goto discard;
2185 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
2186 atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
2187 /* Bulk data transfer: receiver */
2188 __skb_pull(skb,tcp_header_len);
2190 /* Is it possible to simplify this? */
2191 tcp_measure_rcv_mss(sk, skb);
2193 /* DO NOT notify forward progress here.
2194 * It saves a dozen CPU instructions in the fast path. --ANK
2195 * And where is it signaled then? -AK
2197 __skb_queue_tail(&sk->receive_queue, skb);
2198 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
2200 /* FIN bit check is not done since if FIN is set in
2201 * this frame, the pred_flags won't match up. -DaveM
2203 wake_up_interruptible(sk->sleep);
2204 sock_wake_async(sk->socket,1);
2205 tcp_delack_estimator(tp);
2207 tcp_remember_ack(tp, th, skb);
2209 __tcp_ack_snd_check(sk, 0);
2210 return 0;
2212 /* Packet is in sequence, flags are trivial;
2213 * only ACK is strange or we are tough on memory.
2214 * Jump to step 5.
2216 goto step5;
2219 slow_path:
2221 * RFC1323: H1. Apply PAWS check first.
2223 if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
2224 tcp_paws_discard(tp, skb)) {
2225 if (!th->rst) {
2226 tcp_send_ack(sk);
2227 goto discard;
2229 /* Resets are accepted even if PAWS failed.
2231 ts_recent update must be made after we are sure
2232 that the packet is in window.
2237 * Standard slow path.
2240 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
2241 /* RFC793, page 37: "In all states except SYN-SENT, all reset
2242 * (RST) segments are validated by checking their SEQ-fields."
2243 * And page 69: "If an incoming segment is not acceptable,
2244 * an acknowledgment should be sent in reply (unless the RST bit
2245 * is set, if so drop the segment and return)".
2247 if (th->rst)
2248 goto discard;
2249 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
2250 SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
2251 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2252 tp->rcv_wup, tp->rcv_wnd);
2254 tcp_send_ack(sk);
2255 goto discard;
2258 if(th->rst) {
2259 tcp_reset(sk);
2260 goto discard;
2263 if (tp->saw_tstamp) {
2264 tcp_replace_ts_recent(sk, tp,
2265 TCP_SKB_CB(skb)->seq);
2268 if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2269 SOCK_DEBUG(sk, "syn in established state\n");
2270 tcp_statistics.TcpInErrs++;
2271 tcp_reset(sk);
2272 return 1;
2275 step5:
2276 if(th->ack)
2277 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
2279 /* Process urgent data. */
2280 tcp_urg(sk, th, len);
2283 /* step 7: process the segment text */
2284 int queued = tcp_data(skb, sk, len);
2286 tcp_measure_rcv_mss(sk, skb);
2288 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
2289 if(sk->state != TCP_CLOSE) {
2290 tcp_data_snd_check(sk);
2291 tcp_ack_snd_check(sk);
2294 if (!queued) {
2295 discard:
2296 kfree_skb(skb);
2300 return 0;
2304 /* This is not only more efficient than what we used to do, it eliminates
2305 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
2307 * Actually, we could avoid lots of memory writes here. The tp of the
2308 * listening socket contains all the necessary default parameters.
2310 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
2312 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
2314 if(newsk != NULL) {
2315 struct tcp_opt *newtp;
2316 #ifdef CONFIG_FILTER
2317 struct sk_filter *filter;
2318 #endif
2320 memcpy(newsk, sk, sizeof(*newsk));
2321 newsk->state = TCP_SYN_RECV;
2323 /* SANITY */
2324 newsk->pprev = NULL;
2325 newsk->prev = NULL;
2327 /* Clone the TCP header template */
2328 newsk->dport = req->rmt_port;
2330 sock_lock_init(newsk);
2332 atomic_set(&newsk->rmem_alloc, 0);
2333 skb_queue_head_init(&newsk->receive_queue);
2334 atomic_set(&newsk->wmem_alloc, 0);
2335 skb_queue_head_init(&newsk->write_queue);
2336 atomic_set(&newsk->omem_alloc, 0);
2338 newsk->done = 0;
2339 newsk->proc = 0;
2340 newsk->backlog.head = newsk->backlog.tail = NULL;
2341 skb_queue_head_init(&newsk->error_queue);
2342 newsk->write_space = tcp_write_space;
2343 #ifdef CONFIG_FILTER
2344 if ((filter = newsk->filter) != NULL)
2345 sk_filter_charge(newsk, filter);
2346 #endif
2348 /* Now setup tcp_opt */
2349 newtp = &(newsk->tp_pinfo.af_tcp);
2350 newtp->pred_flags = 0;
2351 newtp->rcv_nxt = req->rcv_isn + 1;
2352 newtp->snd_nxt = req->snt_isn + 1;
2353 newtp->snd_una = req->snt_isn + 1;
2354 newtp->srtt = 0;
2355 newtp->ato = 0;
2356 newtp->snd_wl1 = req->rcv_isn;
2357 newtp->snd_wl2 = req->snt_isn;
2359 /* RFC1323: The window in SYN & SYN/ACK segments
2360 * is never scaled.
2362 newtp->snd_wnd = ntohs(skb->h.th->window);
2364 newtp->max_window = newtp->snd_wnd;
2365 newtp->pending = 0;
2366 newtp->retransmits = 0;
2367 newtp->last_ack_sent = req->rcv_isn + 1;
2368 newtp->backoff = 0;
2369 newtp->mdev = TCP_TIMEOUT_INIT;
2371 /* So many TCP implementations out there (incorrectly) count the
2372 * initial SYN frame in their delayed-ACK and congestion control
2373 * algorithms that we must have the following bandaid to talk
2374 * efficiently to them. -DaveM
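 * (Hence the initial snd_cwnd of 2 below, rather than the usual 1.)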
2376 newtp->snd_cwnd = 2;
2378 newtp->rto = TCP_TIMEOUT_INIT;
2379 newtp->packets_out = 0;
2380 newtp->fackets_out = 0;
2381 newtp->retrans_out = 0;
2382 newtp->high_seq = 0;
2383 newtp->snd_ssthresh = 0x7fffffff;
2384 newtp->snd_cwnd_cnt = 0;
2385 newtp->dup_acks = 0;
2386 newtp->delayed_acks = 0;
2387 init_timer(&newtp->retransmit_timer);
2388 newtp->retransmit_timer.function = &tcp_retransmit_timer;
2389 newtp->retransmit_timer.data = (unsigned long) newsk;
2390 init_timer(&newtp->delack_timer);
2391 newtp->delack_timer.function = &tcp_delack_timer;
2392 newtp->delack_timer.data = (unsigned long) newsk;
2393 skb_queue_head_init(&newtp->out_of_order_queue);
2394 newtp->send_head = newtp->retrans_head = NULL;
2395 newtp->rcv_wup = req->rcv_isn + 1;
2396 newtp->write_seq = req->snt_isn + 1;
2397 newtp->copied_seq = req->rcv_isn + 1;
2399 newtp->saw_tstamp = 0;
2401 init_timer(&newtp->probe_timer);
2402 newtp->probe_timer.function = &tcp_probe_timer;
2403 newtp->probe_timer.data = (unsigned long) newsk;
2404 newtp->probes_out = 0;
2405 newtp->syn_seq = req->rcv_isn;
2406 newtp->fin_seq = req->rcv_isn;
2407 newtp->urg_data = 0;
2408 tcp_synq_init(newtp);
2409 newtp->syn_backlog = 0;
2410 if (skb->len >= 536)
2411 newtp->last_seg_size = skb->len;
2413 /* Back to base struct sock members. */
2414 newsk->err = 0;
2415 newsk->ack_backlog = 0;
2416 newsk->max_ack_backlog = SOMAXCONN;
2417 newsk->priority = 0;
2418 atomic_set(&newsk->refcnt, 1);
2419 atomic_inc(&inet_sock_nr);
2421 spin_lock_init(&newsk->timer_lock);
2422 init_timer(&newsk->timer);
2423 newsk->timer.function = &tcp_keepalive_timer;
2424 newsk->timer.data = (unsigned long) newsk;
2425 if (newsk->keepopen)
2426 tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
2427 newsk->socket = NULL;
2428 newsk->sleep = NULL;
2430 newtp->tstamp_ok = req->tstamp_ok;
2431 if((newtp->sack_ok = req->sack_ok) != 0)
2432 newtp->num_sacks = 0;
2433 newtp->window_clamp = req->window_clamp;
2434 newtp->rcv_wnd = req->rcv_wnd;
2435 newtp->wscale_ok = req->wscale_ok;
2436 if (newtp->wscale_ok) {
2437 newtp->snd_wscale = req->snd_wscale;
2438 newtp->rcv_wscale = req->rcv_wscale;
2439 } else {
2440 newtp->snd_wscale = newtp->rcv_wscale = 0;
2441 newtp->window_clamp = min(newtp->window_clamp,65535);
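/* Without window scaling the window field in the TCP header is only
 * 16 bits wide, hence the clamp to 65535 above.
 */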
2443 if (newtp->tstamp_ok) {
2444 newtp->ts_recent = req->ts_recent;
2445 newtp->ts_recent_stamp = xtime.tv_sec;
2446 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2447 } else {
2448 newtp->ts_recent_stamp = 0;
2449 newtp->tcp_header_len = sizeof(struct tcphdr);
2451 newtp->mss_clamp = req->mss;
2453 return newsk;
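/* RFC793-style acceptability test: a segment is acceptable if it starts
 * exactly at the left edge of the receive window, if any part of it
 * overlaps the open interval (s_win, e_win), or if it is a zero-length
 * segment sitting exactly at the right edge.
 */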
2456 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
2458 if (seq == s_win)
2459 return 1;
2460 if (after(end_seq, s_win) && before(seq, e_win))
2461 return 1;
2462 return (seq == e_win && seq == end_seq);
2467 * Process an incoming packet for SYN_RECV sockets represented
2468 * as an open_request.
2471 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
2472 struct open_request *req,
2473 struct open_request *prev)
2475 struct tcphdr *th = skb->h.th;
2476 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2477 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
2478 int paws_reject = 0;
2479 struct tcp_opt ttp;
2481 /* If the socket has already been created, process
2482 the packet in its context.
2484 We get here only due to a race, when packets were enqueued
2485 to the backlog of the listening socket.
2487 if (req->sk)
2488 return req->sk;
2490 ttp.saw_tstamp = 0;
2491 if (th->doff > (sizeof(struct tcphdr)>>2)) {
2493 tcp_parse_options(NULL, th, &ttp, 0);
2495 paws_reject = ttp.saw_tstamp &&
2496 (s32)(ttp.rcv_tsval - req->ts_recent) < 0;
2499 /* Check for a pure retransmitted SYN. */
2500 if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
2501 flg == TCP_FLAG_SYN &&
2502 !paws_reject) {
2504 * RFC793 draws (incorrectly! It was fixed in RFC1122)
2505 * this case in figure 6 and figure 8, but the formal
2506 * protocol description says NOTHING.
2507 * To be more exact, it says that we should send an ACK,
2508 * because this segment (at least, if it has no data)
2509 * is out of window.
2511 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
2512 * describe the SYN-RECV state. All the description
2513 * is wrong; we cannot believe it and should
2514 * rely only on common sense and implementation
2515 * experience.
2517 * Enforce the "SYN-ACK" according to figures 8 and 6
2518 * of RFC793, as fixed by RFC1122.
2520 req->class->rtx_syn_ack(sk, req);
2521 return NULL;
2524 /* The following further reproduces the "SEGMENT ARRIVES"
2525 section of RFC793 for the SYN-RECEIVED state.
2526 It is broken; however, it fails only
2527 when SYNs are crossed, which is impossible in our
2528 case.
2530 But generally, we should (the RFC lies!) accept an ACK
2531 of the SYNACK both here and in tcp_rcv_state_process().
2532 tcp_rcv_state_process() does not, hence we do not either.
2534 Note that the case is absolutely generic:
2535 we cannot optimize anything here without
2536 violating the protocol. All the checks must be made
2537 before we attempt to create a socket.
2540 /* RFC793: "first check sequence number". */
2542 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2543 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
2544 /* Out of window: send ACK and drop. */
2545 if (!(flg & TCP_FLAG_RST))
2546 req->class->send_ack(skb, req);
2547 return NULL;
2550 /* In sequence, PAWS is OK. */
2552 if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
2553 req->ts_recent = ttp.rcv_tsval;
2555 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
2556 /* Truncate SYN, it is out of window starting
2557 at req->rcv_isn+1. */
2558 flg &= ~TCP_FLAG_SYN;
2561 /* RFC793: "second check the RST bit" and
2562 * "fourth, check the SYN bit"
2564 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
2565 goto embryonic_reset;
2567 /* RFC793: "fifth check the ACK field" */
2569 if (!(flg & TCP_FLAG_ACK))
2570 return NULL;
2572 /* Invalid ACK: reset will be sent by listening socket */
2573 if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)
2574 return sk;
2576 /* OK, the ACK is valid: create the big socket and
2577 feed this segment to it. It will repeat all
2578 the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
2579 ESTABLISHED STATE. If it is dropped after the
2580 socket is created, expect trouble.
2582 sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
2583 if (sk == NULL)
2584 return NULL;
2586 tcp_dec_slow_timer(TCP_SLT_SYNACK);
2587 req->sk = sk;
2588 return sk;
2590 embryonic_reset:
2591 tcp_synq_unlink(tp, req, prev);
2592 tp->syn_backlog--;
2593 tcp_dec_slow_timer(TCP_SLT_SYNACK);
2595 net_statistics.EmbryonicRsts++;
2596 if (!(flg & TCP_FLAG_RST))
2597 req->class->send_reset(skb);
2599 req->class->destructor(req);
2600 tcp_openreq_free(req);
2601 return NULL;
2604 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
2605 struct tcphdr *th, unsigned len)
2607 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2609 tcp_parse_options(sk, th, tp, 0);
2611 #ifdef CONFIG_TCP_TW_RECYCLE
2612 if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst &&
2613 (s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
2614 xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) {
2615 /* Old duplicate segment. We remember the last
2616 ts_recent from this host in the timewait bucket.
2618 Actually, we could implement a per-host cache
2619 to truncate the timewait state after the RTO. The paranoid arguments
2620 of RFC1337 are not enough to close off this nice possibility.
2622 if (net_ratelimit())
2623 printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n");
2624 if (th->ack)
2625 return 1;
2626 goto discard;
2628 #endif
2630 if (th->ack) {
2631 /* rfc793:
2632 * "If the state is SYN-SENT then
2633 * first check the ACK bit
2634 * If the ACK bit is set
2635 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
2636 * a reset (unless the RST bit is set, if so drop
2637 * the segment and return)"
2639 * I cite this place to emphasize one essential
2640 * detail: this check is different from the one
2641 * in the established state: SND.UNA <= SEG.ACK <= SND.NXT.
2642 * SEG.ACK == SND.UNA == ISS is invalid in SYN-SENT,
2643 * because we have sent no data before the SYN.
2644 * --ANK(990513)
2646 * We do not send data with the SYN, so the RFC-correct
2647 * test reduces to:
2649 if (sk->zapped ||
2650 TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
2651 return 1;
2653 /* Now ACK is acceptable.
2655 * "If the RST bit is set
2656 * If the ACK was acceptable then signal the user "error:
2657 * connection reset", drop the segment, enter CLOSED state,
2658 * delete TCB, and return."
2661 if (th->rst) {
2662 tcp_reset(sk);
2663 goto discard;
2666 /* rfc793:
2667 * "fifth, if neither of the SYN or RST bits is set then
2668 * drop the segment and return."
2670 * See note below!
2671 * --ANK(990513)
2673 if (!th->syn)
2674 goto discard;
2676 /* rfc793:
2677 * "If the SYN bit is on ...
2678 * are acceptable then ...
2679 * (our SYN has been ACKed), change the connection
2680 * state to ESTABLISHED..."
2682 * Do you see? SYN-less ACKs in SYN-SENT state are
2683 * completely ignored.
2685 * The bug causing stalled SYN-SENT sockets
2686 * was here: tcp_ack advanced snd_una and canceled the
2687 * retransmit timer, so that a bare ACK received
2688 * in SYN-SENT state (even with an invalid ack==ISS,
2689 * because the tcp_ack check is too weak for SYN-SENT)
2690 * moved the socket to an invalid semi-SYN-SENT,
2691 * semi-ESTABLISHED state and the connection hung.
2693 * There exist buggy stacks which really send
2694 * such ACKs: e.g. 202.226.91.94 (okigate.oki.co.jp)
2695 * Actually, if this host had not tried to get something
2696 * from ftp.inr.ac.ru I'd never have found this bug 8)
2698 * --ANK (990514)
2700 * I was wrong, I apologize. A bare ACK is valid.
2701 * Actually, RFC793 requires sending such an ACK
2702 * in reply to any out-of-window packet.
2703 * It is wrong, but Linux also does it sometimes.
2704 * --ANK (990724)
2707 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2708 tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
2709 TCP_SKB_CB(skb)->ack_seq, len);
2711 /* Ok.. it's good. Set up sequence numbers and
2712 * move to established.
2714 tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2715 tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2717 /* RFC1323: The window in SYN & SYN/ACK segments is
2718 * never scaled.
2720 tp->snd_wnd = ntohs(th->window);
2721 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2722 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2723 tp->fin_seq = TCP_SKB_CB(skb)->seq;
2725 tcp_set_state(sk, TCP_ESTABLISHED);
2727 if (tp->wscale_ok == 0) {
2728 tp->snd_wscale = tp->rcv_wscale = 0;
2729 tp->window_clamp = min(tp->window_clamp,65535);
2732 if (tp->tstamp_ok) {
2733 tp->tcp_header_len =
2734 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2735 } else
2736 tp->tcp_header_len = sizeof(struct tcphdr);
2737 if (tp->saw_tstamp) {
2738 tp->ts_recent = tp->rcv_tsval;
2739 tp->ts_recent_stamp = xtime.tv_sec;
2741 tcp_sync_mss(sk, tp->pmtu_cookie);
2742 tcp_initialize_rcv_mss(sk);
2743 tcp_init_metrics(sk);
2745 if (tp->write_pending) {
2746 /* Save one ACK. Data will be ready after
2747 * several ticks, if write_pending is set.
2749 * How do we do this correctly?
2751 tp->delayed_acks++;
2752 if (tp->ato == 0)
2753 tp->ato = tp->rto;
2754 tcp_send_delayed_ack(sk, tp->rto);
2755 } else {
2756 tcp_send_ack(sk);
2759 tp->copied_seq = tp->rcv_nxt;
2761 if(!sk->dead) {
2762 wake_up_interruptible(sk->sleep);
2763 sock_wake_async(sk->socket, 0);
2765 return -1;
2768 /* No ACK in the segment */
2770 if (th->rst) {
2771 /* rfc793:
2772 * "If the RST bit is set
2774 * Otherwise (no ACK) drop the segment and return."
2777 goto discard;
2780 if (th->syn) {
2781 /* We see a SYN without an ACK. It is an attempt at a
2782 * simultaneous connect with crossed SYNs.
2784 * The previous version of the code
2785 * checked for "connecting to self"
2786 * here. That check is now done in
2787 * tcp_connect.
2789 * RED-PEN: BTW, it does not. 8)
2791 tcp_set_state(sk, TCP_SYN_RECV);
2792 if (tp->saw_tstamp) {
2793 tp->ts_recent = tp->rcv_tsval;
2794 tp->ts_recent_stamp = xtime.tv_sec;
2797 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2798 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2800 /* RFC1323: The window in SYN & SYN/ACK segments is
2801 * never scaled.
2803 tp->snd_wnd = ntohs(th->window);
2804 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2806 tcp_sync_mss(sk, tp->pmtu_cookie);
2807 tcp_initialize_rcv_mss(sk);
2809 tcp_send_synack(sk);
2810 #if 0
2811 /* Note, we could accept data and URG from this segment.
2812 * There are no obstacles to doing so.
2814 * However, if we ignore data in ACKless segments sometimes,
2815 * we have no reason to accept it at other times.
2816 * Also, it seems the code doing it in step6 of tcp_rcv_state_process
2817 * is not flawless. So, discard the packet for sanity.
2818 * Uncomment this return to process the data.
2820 return -1;
2821 #endif
2823 /* "fifth, if neither of the SYN or RST bits is set then
2824 * drop the segment and return."
2827 discard:
2828 kfree_skb(skb);
2829 return 0;
2834 * This function implements the receiving procedure of RFC 793 for
2835 * all states except ESTABLISHED and TIME_WAIT.
2836 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
2837 * address independent.
2840 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
2841 struct tcphdr *th, unsigned len)
2843 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2844 int queued = 0;
2846 tp->saw_tstamp = 0;
2848 switch (sk->state) {
2849 case TCP_CLOSE:
2850 /* When state == CLOSED, hash lookup always fails.
2852 * But, there is a back door, the backlog queue.
2853 * If we have a sequence of packets in the backlog
2854 * during __release_sock() which have a sequence such
2855 * that:
2856 * packet X causes entry to TCP_CLOSE state
2857 * ...
2858 * packet X + N has FIN bit set
2860 * We report a (luckily) harmless error in this case.
2861 * The issue is that backlog queue processing bypasses
2862 * any hash lookups (we know which socket packets are for).
2863 * The correct behavior here is what 2.0.x did, since
2864 * a TCP_CLOSE socket does not exist. Drop the frame
2865 * and send a RST back to the other end.
2868 /* 1. The socket may be moved to TIME-WAIT state.
2869 2. While this socket was locked, another socket
2870 with the same identity could be created.
2871 3. To continue?
2873 CONCLUSION: discard and only discard!
2875 The alternative would be to redo the lookup and recurse into tcp_v?_rcv
2876 (not *_do_rcv) to handle the timewait and listen states
2877 correctly.
2879 goto discard;
2881 case TCP_LISTEN:
2882 if(th->ack)
2883 return 1;
2885 if(th->syn) {
2886 if(tp->af_specific->conn_request(sk, skb) < 0)
2887 return 1;
2889 /* Now we have several options: In theory there is
2890 * nothing else in the frame. KA9Q has an option to
2891 * send data with the syn, BSD accepts data with the
2892 * syn up to the [to be] advertised window and
2893 * Solaris 2.1 gives you a protocol error. For now
2894 * we just ignore it, that fits the spec precisely
2895 * and avoids incompatibilities. It would be nice in the
2896 * future to drop through and process the data.
2898 * Now that TTCP is starting to be used we ought to
2899 * queue this data.
2900 * But, this leaves one open to an easy denial of
2901 * service attack, and SYN cookies can't defend
2902 * against this problem. So, we drop the data
2903 * in the interest of security over speed.
2905 goto discard;
2907 goto discard;
2909 case TCP_SYN_SENT:
2910 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
2911 if (queued >= 0)
2912 return queued;
2913 queued = 0;
2914 goto step6;
2917 /* Parse the tcp_options present on this header.
2918 * By this point we really only expect timestamps.
2919 * Note that this really has to be here and not later for PAWS
2920 * (RFC1323) to work.
2922 if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
2923 tcp_paws_discard(tp, skb)) {
2924 if (!th->rst) {
2925 tcp_send_ack(sk);
2926 goto discard;
2928 /* Reset is accepted even if it did not pass PAWS. */
2931 /* The silly FIN test here is necessary to see an advancing ACK in
2932 * retransmitted FIN frames properly. Consider the following sequence:
2934 * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
2935 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
2936 * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
2937 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
2939 * At this point the connection will deadlock, with host1 believing
2940 * that its FIN is never ACK'd, and thus it will retransmit its FIN
2941 * forever. The following fix is from Taral (taral@taral.net).
2943 * RED-PEN. It seems the above is not true.
2944 * If at least one end is RFC compliant, it will send an ACK for the
2945 * out-of-window FIN and, hence, move the peer to TIME-WAIT.
2946 * I comment out this line. --ANK
2948 * RED-PEN. DANGER! The tcp_sequence check also rejects SYN-ACKs
2949 * received in SYN-RECV. The problem is that the description of
2950 * segment processing in the SYN-RECV state in RFC793 is WRONG.
2951 * The correct check would accept the ACK of this SYN-ACK, see
2952 * figures 6 and 8 (fixed by RFC1122). Compare this
2953 * to the problem with FIN; they smell similar. --ANK
2956 /* step 1: check sequence number */
2957 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)
2958 #if 0
2959 && !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
2960 #endif
2962 if (!th->rst) {
2963 tcp_send_ack(sk);
2965 goto discard;
2968 /* step 2: check RST bit */
2969 if(th->rst) {
2970 tcp_reset(sk);
2971 goto discard;
2974 if (tp->saw_tstamp) {
2975 tcp_replace_ts_recent(sk, tp,
2976 TCP_SKB_CB(skb)->seq);
2979 /* step 3: check security and precedence [ignored] */
2981 /* step 4:
2983 * Check for a SYN, and ensure it matches the SYN we were
2984 * first sent. We have to handle the rather unusual (but valid)
2985 * sequence that KA9Q derived products may generate of
2987 * SYN
2988 * SYN|ACK Data
2989 * ACK (lost)
2990 * SYN|ACK Data + More Data
2991 * .. we must ACK not RST...
2993 * We keep syn_seq as the sequence space occupied by the
2994 * original syn.
2997 if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2998 tcp_reset(sk);
2999 return 1;
3002 /* step 5: check the ACK field */
3003 if (th->ack) {
3004 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
3005 TCP_SKB_CB(skb)->ack_seq, len);
3007 switch(sk->state) {
3008 case TCP_SYN_RECV:
3009 if (acceptable) {
3010 tcp_set_state(sk, TCP_ESTABLISHED);
3011 tp->copied_seq = tp->rcv_nxt;
3013 /* Note that this wakeup is only for the marginal
3014 crossed SYN case. Passively opened sockets
3015 are not woken up, because sk->sleep == NULL
3016 and sk->socket == NULL.
3018 if (!sk->dead && sk->sleep) {
3019 wake_up_interruptible(sk->sleep);
3020 sock_wake_async(sk->socket, 1);
3023 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
3024 tp->snd_wnd = ntohs(th->window) << tp->snd_wscale;
3025 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
3026 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
3028 /* tcp_ack considers this ACK a duplicate
3029 * and does not calculate the rtt. That is wrong.
3030 * Fix it, at least when we have timestamps.
3032 if (tp->saw_tstamp && !tp->srtt)
3033 tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED);
3035 tcp_init_metrics(sk);
3036 } else {
3037 SOCK_DEBUG(sk, "bad ack\n");
3038 return 1;
3040 break;
3042 case TCP_FIN_WAIT1:
3043 if (tp->snd_una == tp->write_seq) {
3044 sk->shutdown |= SEND_SHUTDOWN;
3045 tcp_set_state(sk, TCP_FIN_WAIT2);
3046 if (!sk->dead)
3047 sk->state_change(sk);
3048 else
3049 tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
3050 dst_confirm(sk->dst_cache);
3052 break;
3054 case TCP_CLOSING:
3055 if (tp->snd_una == tp->write_seq) {
3056 tcp_time_wait(sk);
3057 goto discard;
3059 break;
3061 case TCP_LAST_ACK:
3062 if (tp->snd_una == tp->write_seq) {
3063 tcp_set_state(sk,TCP_CLOSE);
3064 tcp_update_metrics(sk);
3065 tcp_done(sk);
3066 goto discard;
3068 break;
3070 } else
3071 goto discard;
3073 step6:
3074 /* step 6: check the URG bit */
3075 tcp_urg(sk, th, len);
3077 /* step 7: process the segment text */
3078 switch (sk->state) {
3079 case TCP_CLOSE_WAIT:
3080 case TCP_CLOSING:
3081 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
3082 break;
3084 case TCP_FIN_WAIT1:
3085 case TCP_FIN_WAIT2:
3086 /* RFC 793 says to queue data in these states,
3087 * RFC 1122 says we MUST send a reset.
3088 * 4.4BSD also sends a reset.
3090 if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
3091 if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
3092 tcp_reset(sk);
3093 return 1;
3097 case TCP_ESTABLISHED:
3098 queued = tcp_data(skb, sk, len);
3100 /* This must be after tcp_data() does the skb_pull() to
3101 * remove the header size from skb->len.
3103 tcp_measure_rcv_mss(sk, skb);
3104 break;
3107 /* tcp_data could move socket to TIME-WAIT */
3108 if (sk->state != TCP_CLOSE) {
3109 tcp_data_snd_check(sk);
3110 tcp_ack_snd_check(sk);
3113 if (!queued) {
3114 discard:
3115 kfree_skb(skb);
3117 return 0;