1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.173 1999/09/07 02:31:27 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
60 #include <linux/config.h>
61 #include <linux/mm.h>
62 #include <linux/sysctl.h>
63 #include <net/tcp.h>
64 #include <net/inet_common.h>
65 #include <linux/ipsec.h>
67 #ifdef CONFIG_SYSCTL
68 #define SYNC_INIT 0 /* let the user enable it */
69 #else
70 #define SYNC_INIT 1
71 #endif
73 extern int sysctl_tcp_fin_timeout;
74 extern int sysctl_tcp_keepalive_time;
76 /* These are on by default so the code paths get tested.
77 * For the final 2.2 this may be undone at our discretion. -DaveM
79 int sysctl_tcp_timestamps = 1;
80 int sysctl_tcp_window_scaling = 1;
81 int sysctl_tcp_sack = 1;
83 int sysctl_tcp_syncookies = SYNC_INIT;
84 int sysctl_tcp_stdurg;
85 int sysctl_tcp_rfc1337;
86 int sysctl_tcp_tw_recycle;
88 static int prune_queue(struct sock *sk);
90 /* There is something which you must keep in mind when you analyze the
91 * behavior of the tp->ato delayed ack timeout interval. When a
92 * connection starts up, we want to ack as quickly as possible. The
93 * problem is that "good" TCPs do slow start at the beginning of data
94 * transmission. This means that until we send the first few ACKs the
95 * sender will sit on his end and only queue most of his data, because
96 * he can only send snd_cwnd unacked packets at any given time. For
97 * each ACK we send, he increments snd_cwnd and transmits more of his
98 * queue. -DaveM
100 static void tcp_delack_estimator(struct tcp_opt *tp)
102 if(tp->ato == 0) {
103 tp->lrcvtime = tcp_time_stamp;
105 /* Help sender leave slow start quickly,
106 * and also makes sure we do not take this
107 * branch ever again for this connection.
109 tp->ato = 1;
110 tcp_enter_quickack_mode(tp);
111 } else {
112 int m = tcp_time_stamp - tp->lrcvtime;
114 tp->lrcvtime = tcp_time_stamp;
115 if(m <= 0)
116 m = 1;
117 if(m > tp->rto)
118 tp->ato = tp->rto;
119 else {
120 /* This funny shift makes sure we
121 * clear the "quick ack mode" bit.
123 tp->ato = ((tp->ato << 1) >> 2) + m;
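/* A worked example of the update above (illustrative numbers, not from
 * the original source): the 0x80000000 bit of tp->ato doubles as the
 * "quick ack mode" flag, which tcp_remember_ack() below is careful to
 * preserve.  With ato = 0x80000005 (flag set, 5 ticks) and a new
 * inter-arrival gap m = 3:
 *
 *	ato << 1		= 0x0000000a	(flag bit shifted out)
 *	(ato << 1) >> 2		= 0x00000002	(old ato / 2, flag cleared)
 *	... + m			= 0x00000005
 *
 * so the new ato is roughly half the previous estimate plus the fresh
 * measurement, and quick ack mode is switched off as a side effect.
 */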
129 * Remember to send an ACK later.
131 static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
132 struct sk_buff *skb)
134 tp->delayed_acks++;
136 /* Tiny-grams with PSH set artificially deflate our
137 * ato measurement, but with a lower bound.
139 if(th->psh && (skb->len < (tp->rcv_mss >> 1))) {
140 /* Preserve the quickack state. */
141 if((tp->ato & 0x7fffffff) > HZ/50)
142 tp->ato = ((tp->ato & 0x80000000) |
143 (HZ/50));
147 /* Called to compute a smoothed rtt estimate. The data fed to this
148 * routine either comes from timestamps, or from segments that were
149 * known _not_ to have been retransmitted [see Karn/Partridge
150 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
151 * piece by Van Jacobson.
152 * NOTE: the next three routines used to be one big routine.
153 * To save cycles in the RFC 1323 implementation it was better to break
154 * it up into three procedures. -- erics
157 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
159 long m = mrtt; /* RTT */
161 /* The following amusing code comes from Jacobson's
162 * article in SIGCOMM '88. Note that rtt and mdev
163 * are scaled versions of rtt and mean deviation.
164 * This is designed to be as fast as possible
165 * m stands for "measurement".
167 * In a 1990 paper the rto value is changed to:
168 * RTO = rtt + 4 * mdev
170 if(m == 0)
171 m = 1;
172 if (tp->srtt != 0) {
173 m -= (tp->srtt >> 3); /* m is now error in rtt est */
174 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
175 if (m < 0)
176 m = -m; /* m is now abs(error) */
177 m -= (tp->mdev >> 2); /* similar update on mdev */
178 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
179 } else {
180 /* no previous measure. */
181 tp->srtt = m<<3; /* take the measured time to be rtt */
182 tp->mdev = m<<2; /* make sure rto = 3*rtt */
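/* A minimal worked example of the fixed-point arithmetic above
 * (illustrative numbers): srtt is kept scaled by 8 and mdev scaled by 4,
 * so the additions implement srtt = 7/8*srtt + 1/8*m and
 * mdev = 3/4*mdev + 1/4*|err| with shifts only, no divisions.
 *
 *	Start: srtt = 800 (100 ticks), mdev = 40 (10 ticks), m = 120.
 *
 *	m    -= srtt >> 3;	m = 120 - 100 = 20	(error in estimate)
 *	srtt += m;		srtt = 820		(102.5 ticks)
 *	m     = -m if m < 0;	m = 20			(absolute error)
 *	m    -= mdev >> 2;	m = 20 - 10 = 10
 *	mdev += m;		mdev = 50		(12.5 ticks)
 */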
186 /* Calculate rto without backoff. This is the second half of Van Jacobson's
187 * routine referred to above.
190 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
192 tp->rto = (tp->srtt >> 3) + tp->mdev;
193 /* I am not educated enough to understand this magic.
194 * However, it smells bad. snd_cwnd>31 is common case.
196 tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
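/* In unscaled terms the first statement above is the classical Jacobson
 * timeout: srtt >> 3 is the smoothed RTT and mdev carries 4 times the
 * mean deviation (see the scaling in tcp_rtt_estimator), so effectively
 *
 *	rto = SRTT + 4 * MDEV
 *
 * The second statement is the ad-hoc correction that the comment above
 * complains about: it pads the timeout by 25% plus an extra term that
 * only matters for very small congestion windows.
 */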
200 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
201 * on packet lifetime in the internet. We need the HZ/5 lower
202 * bound to behave correctly against BSD stacks with a fixed
203 * delayed ack.
204 * FIXME: It's not entirely clear this lower bound is the best
205 * way to avoid the problem. Is it possible to drop the lower
206 * bound and still avoid trouble with BSD stacks? Perhaps
207 * some modification to the RTO calculation that takes delayed
208 * ack bias into account? This needs serious thought. -- erics
210 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
212 if (tp->rto > 120*HZ)
213 tp->rto = 120*HZ;
214 if (tp->rto < HZ/5)
215 tp->rto = HZ/5;
218 /* Save metrics learned by this TCP session.
219 This function is called only when TCP finishes successfully,
220 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
222 static void tcp_update_metrics(struct sock *sk)
224 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
225 struct dst_entry *dst = __sk_dst_get(sk);
227 if (dst) {
228 int m;
230 if (tp->backoff || !tp->srtt) {
231 /* This session failed to estimate rtt. Why?
232 * Probably, no packets returned in time.
233 * Reset our results.
235 if (!(dst->mxlock&(1<<RTAX_RTT)))
236 dst->rtt = 0;
237 return;
240 dst_confirm(dst);
242 m = dst->rtt - tp->srtt;
244 /* If the newly calculated rtt is larger than the stored one,
245 * store the new one. Otherwise, use EWMA. Remember,
246 * rtt overestimation is always better than underestimation.
248 if (!(dst->mxlock&(1<<RTAX_RTT))) {
249 if (m <= 0)
250 dst->rtt = tp->srtt;
251 else
252 dst->rtt -= (m>>3);
255 if (!(dst->mxlock&(1<<RTAX_RTTVAR))) {
256 if (m < 0)
257 m = -m;
259 /* Scale deviation to rttvar fixed point */
260 m >>= 1;
261 if (m < tp->mdev)
262 m = tp->mdev;
264 if (m >= dst->rttvar)
265 dst->rttvar = m;
266 else
267 dst->rttvar -= (dst->rttvar - m)>>2;
270 if (tp->snd_ssthresh == 0x7FFFFFFF) {
271 /* Slow start still did not finish. */
272 if (dst->ssthresh &&
273 !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
274 tp->snd_cwnd > dst->ssthresh)
275 dst->ssthresh = tp->snd_cwnd;
276 if (!(dst->mxlock&(1<<RTAX_CWND)) &&
277 tp->snd_cwnd > dst->cwnd)
278 dst->cwnd = tp->snd_cwnd;
279 } else if (tp->snd_cwnd >= tp->snd_ssthresh && !tp->high_seq) {
280 /* Cong. avoidance phase, cwnd is reliable. */
281 if (!(dst->mxlock&(1<<RTAX_SSTHRESH)))
282 dst->ssthresh = tp->snd_cwnd;
283 if (!(dst->mxlock&(1<<RTAX_CWND)))
284 dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1;
285 } else {
286 /* Else slow start did not finish, cwnd is non-sense,
287 ssthresh may be also invalid.
289 if (!(dst->mxlock&(1<<RTAX_CWND)))
290 dst->cwnd = (dst->cwnd + tp->snd_ssthresh)>>1;
291 if (dst->ssthresh &&
292 !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
293 tp->snd_ssthresh > dst->ssthresh)
294 dst->ssthresh = tp->snd_ssthresh;
299 /* Initialize metrics on socket. */
301 static void tcp_init_metrics(struct sock *sk)
303 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
304 struct dst_entry *dst = __sk_dst_get(sk);
306 if (dst == NULL)
307 goto reset;
309 dst_confirm(dst);
311 if (dst->rtt == 0)
312 goto reset;
314 if (!tp->srtt || !tp->saw_tstamp)
315 goto reset;
317 /* Initial rtt is determined from SYN,SYN-ACK.
318 * The segment is small and rtt may appear much
319 * less than the real one. Use per-dst memory
320 * to make it more realistic.
322 * A bit of theory. RTT is the time that passes after a "normal" sized packet
323 * is sent until it is ACKed. In normal circumstances sending small
324 * packets forces the peer to delay ACKs and the calculation is still correct.
325 * The algorithm is adaptive and, provided we follow the specs, it
326 * NEVER underestimates RTT. BUT! If the peer tries some clever
327 * trick such as "quick acks" for long enough to decrease the RTT
328 * to a low value, and then abruptly stops doing it and starts to delay
329 * ACKs, expect trouble.
331 if (dst->rtt > tp->srtt)
332 tp->srtt = dst->rtt;
333 if (dst->rttvar > tp->mdev)
334 tp->mdev = dst->rttvar;
335 tcp_set_rto(tp);
336 tcp_bound_rto(tp);
338 if (dst->mxlock&(1<<RTAX_CWND))
339 tp->snd_cwnd_clamp = dst->cwnd;
340 if (dst->ssthresh) {
341 tp->snd_ssthresh = dst->ssthresh;
342 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
343 tp->snd_ssthresh = tp->snd_cwnd_clamp;
345 return;
348 reset:
349 /* Play it conservative. If timestamps are not
350 * supported, TCP will fail to recalculate the correct
351 * rtt if the initial rto is too small. FORGET ALL AND RESET!
353 if (!tp->saw_tstamp && tp->srtt) {
354 tp->srtt = 0;
355 tp->mdev = TCP_TIMEOUT_INIT;
356 tp->rto = TCP_TIMEOUT_INIT;
360 #define PAWS_24DAYS (60 * 60 * 24 * 24)
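/* Why 24 days (a sketch of the reasoning, not a quote from the RFC):
 * RFC 1323 allows the peer's timestamp clock to tick as fast as once per
 * millisecond, and PAWS compares timestamps as signed 32-bit differences.
 * 2^31 ms is roughly 24.8 days, so once ts_recent is about 24 days old
 * the sign of (rcv_tsval - ts_recent) can no longer be trusted and the
 * cached value must be replaced, which is what the checks below do.
 */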
363 /* WARNING: this must not be called if tp->saw_tstamp was false. */
364 extern __inline__ void
365 tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
367 if (!after(seq, tp->last_ack_sent)) {
368 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
369 * extra check below makes sure this can only happen
370 * for pure ACK frames. -DaveM
372 * Not only that, it also occurs for expired timestamps
373 * and RSTs with bad timestamp option. --ANK
376 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 ||
377 xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) {
378 tp->ts_recent = tp->rcv_tsval;
379 tp->ts_recent_stamp = xtime.tv_sec;
384 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb)
386 return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
387 xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS
389 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
391 I cannot quietly watch as the whole idea behind PAWS
392 is destroyed 8)
394 The problem is only in reordering duplicate ACKs.
395 Hence, we can check this rare case more carefully.
397 1. Check that it is really duplicate ACK (ack==snd_una)
398 2. Give it some small "replay" window (~RTO)
400 We do not know units of foreign ts values, but make conservative
401 assumption that they are >=1ms. It solves problem
402 noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
404 && (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq ||
405 TCP_SKB_CB(skb)->ack_seq != tp->snd_una ||
406 !skb->h.th->ack ||
407 (s32)(tp->ts_recent - tp->rcv_tsval) > (tp->rto*1024)/HZ));
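/* On the last clause above: tp->rto is in jiffies, so (tp->rto*1024)/HZ
 * is approximately the RTO in milliseconds (1024 presumably standing in
 * for 1000 to keep the arithmetic cheap).  Combined with the assumption
 * above that the foreign timestamp units are >= 1ms, this tolerates a
 * duplicate ACK whose timestamp lags ts_recent by up to about one RTO.
 */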
411 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
413 u32 end_window = tp->rcv_wup + tp->rcv_wnd;
415 if (tp->rcv_wnd &&
416 after(end_seq, tp->rcv_nxt) &&
417 before(seq, end_window))
418 return 1;
419 if (seq != end_window)
420 return 0;
421 return (seq == end_seq);
424 /* This function checks to see if the tcp header is actually acceptable. */
425 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
427 if (seq == tp->rcv_nxt)
428 return (tp->rcv_wnd || (end_seq == seq));
430 return __tcp_sequence(tp, seq, end_seq);
433 /* When we get a reset we do this. */
434 static void tcp_reset(struct sock *sk)
436 sk->zapped = 1;
438 /* We want the right error as BSD sees it (and indeed as we do). */
439 switch (sk->state) {
440 case TCP_SYN_SENT:
441 sk->err = ECONNREFUSED;
442 break;
443 case TCP_CLOSE_WAIT:
444 sk->err = EPIPE;
445 break;
446 case TCP_CLOSE:
447 return;
448 default:
449 sk->err = ECONNRESET;
451 tcp_set_state(sk, TCP_CLOSE);
452 tcp_clear_xmit_timers(sk);
453 tcp_done(sk);
456 /* This tags the retransmission queue when SACKs arrive. */
457 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
459 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
460 int i = nsacks;
462 while(i--) {
463 struct sk_buff *skb = skb_peek(&sk->write_queue);
464 __u32 start_seq = ntohl(sp->start_seq);
465 __u32 end_seq = ntohl(sp->end_seq);
466 int fack_count = 0;
468 while((skb != NULL) &&
469 (skb != tp->send_head) &&
470 (skb != (struct sk_buff *)&sk->write_queue)) {
471 /* The retransmission queue is always in order, so
472 * we can short-circuit the walk early.
474 if(after(TCP_SKB_CB(skb)->seq, end_seq))
475 break;
477 /* We play conservative, we don't allow SACKS to partially
478 * tag a sequence space.
480 fack_count++;
481 if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
482 !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
483 /* If this was a retransmitted frame, account for it. */
484 if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
485 tp->retrans_out)
486 tp->retrans_out--;
487 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
489 /* RULE: All new SACKs will either decrease retrans_out
490 * or advance fackets_out.
492 if(fack_count > tp->fackets_out)
493 tp->fackets_out = fack_count;
495 skb = skb->next;
497 sp++; /* Move on to the next SACK block. */
501 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
502 * But, this can also be called on packets in the established flow when
503 * the fast version below fails.
505 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
507 unsigned char *ptr;
508 int length=(th->doff*4)-sizeof(struct tcphdr);
510 ptr = (unsigned char *)(th + 1);
511 tp->saw_tstamp = 0;
513 while(length>0) {
514 int opcode=*ptr++;
515 int opsize;
517 switch (opcode) {
518 case TCPOPT_EOL:
519 return;
520 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
521 length--;
522 continue;
523 default:
524 opsize=*ptr++;
525 if (opsize < 2) /* "silly options" */
526 return;
527 if (opsize > length)
528 break; /* don't parse partial options */
529 switch(opcode) {
530 case TCPOPT_MSS:
531 if(opsize==TCPOLEN_MSS && th->syn) {
532 u16 in_mss = ntohs(*(__u16 *)ptr);
533 if (in_mss) {
534 if (tp->user_mss && tp->user_mss < in_mss)
535 in_mss = tp->user_mss;
536 tp->mss_clamp = in_mss;
539 break;
540 case TCPOPT_WINDOW:
541 if(opsize==TCPOLEN_WINDOW && th->syn)
542 if (!no_fancy && sysctl_tcp_window_scaling) {
543 tp->wscale_ok = 1;
544 tp->snd_wscale = *(__u8 *)ptr;
545 if(tp->snd_wscale > 14) {
546 if(net_ratelimit())
547 printk("tcp_parse_options: Illegal window "
548 "scaling value %d >14 received.",
549 tp->snd_wscale);
550 tp->snd_wscale = 14;
553 break;
554 case TCPOPT_TIMESTAMP:
555 if(opsize==TCPOLEN_TIMESTAMP) {
556 if (sysctl_tcp_timestamps && !no_fancy) {
557 tp->tstamp_ok = 1;
558 tp->saw_tstamp = 1;
559 tp->rcv_tsval = ntohl(*(__u32 *)ptr);
560 tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
563 break;
564 case TCPOPT_SACK_PERM:
565 if(opsize==TCPOLEN_SACK_PERM && th->syn) {
566 if (sysctl_tcp_sack && !no_fancy) {
567 tp->sack_ok = 1;
568 tp->num_sacks = 0;
571 break;
573 case TCPOPT_SACK:
574 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
575 sysctl_tcp_sack && (sk != NULL) && !th->syn) {
576 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
578 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
579 int num_sacks = sack_bytes >> 3;
580 struct tcp_sack_block *sackp;
582 sackp = (struct tcp_sack_block *)ptr;
583 tcp_sacktag_write_queue(sk, sackp, num_sacks);
587 ptr+=opsize-2;
588 length-=opsize;
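/* For reference, the option block walked above is a sequence of
 * (kind, length, value) fields, with EOL (0) and NOP (1) being the only
 * single-byte kinds.  An illustrative 20-byte SYN option block with all
 * the fancy options enabled (layout is an example, not a quote from any
 * capture):
 *
 *	02 04 05 b4		MSS = 1460
 *	04 02			SACK permitted
 *	08 0a <tsval> <tsecr>	timestamps (10 bytes)
 *	01			NOP padding
 *	03 03 00		window scale = 0
 *
 * The opsize < 2 and opsize > length checks above exist precisely so a
 * bogus length byte in such a block cannot run the pointer off the end
 * of the header.
 */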
593 /* Fast parse options. This hopes to only see timestamps.
594 * If it is wrong it falls back on tcp_parse_options().
596 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
598 /* If we didn't send out any options ignore them all. */
599 if (tp->tcp_header_len == sizeof(struct tcphdr))
600 return 0;
601 if (th->doff == sizeof(struct tcphdr)>>2) {
602 tp->saw_tstamp = 0;
603 return 0;
604 } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
605 __u32 *ptr = (__u32 *)(th + 1);
606 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
607 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
608 tp->saw_tstamp = 1;
609 ++ptr;
610 tp->rcv_tsval = ntohl(*ptr);
611 ++ptr;
612 tp->rcv_tsecr = ntohl(*ptr);
613 return 1;
616 tcp_parse_options(sk, th, tp, 0);
617 return 1;
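/* The 32-bit constant tested above is the aligned timestamp prefix that
 * the Linux sender itself emits: two NOPs followed by the timestamp kind
 * and length,
 *
 *	(TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 *	(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP
 *	  = (1 << 24) | (1 << 16) | (8 << 8) | 10  =  0x0101080a
 *
 * so a header whose doff is exactly sizeof(struct tcphdr) + 12 bytes and
 * whose first option word matches this pattern carries only timestamps
 * and can skip the generic parser.
 */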
620 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
621 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
622 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
623 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
624 #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged the SYN. */
626 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
628 if (tp->dup_acks > 3)
629 tp->snd_cwnd = (tp->snd_ssthresh);
631 tp->dup_acks = 0;
634 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
635 * retransmit timer fires.
637 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
639 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
641 /* Note: If not_dup is set this implies we got a
642 * data carrying packet or a window update.
643 * This carries no new information about possible
644 * lost packets, so we have to ignore it for the purposes
645 * of counting duplicate acks. Ideally this does not imply we
646 * should stop our fast retransmit phase, more acks may come
647 * later without data to help us. Unfortunately this would make
648 * the code below much more complex. For now if I see such
649 * a packet I clear the fast retransmit phase.
651 if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
652 /* This is the standard reno style fast retransmit branch. */
654 /* 1. When the third duplicate ack is received, set ssthresh
655 * to one half the current congestion window, but no less
656 * than two segments. Retransmit the missing segment.
658 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
659 tp->dup_acks++;
660 if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
661 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
662 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
663 tp->snd_ssthresh = tp->snd_cwnd_clamp;
664 tp->snd_cwnd = (tp->snd_ssthresh + 3);
665 tp->high_seq = tp->snd_nxt;
666 if(!tp->fackets_out)
667 tcp_retransmit_skb(sk,
668 skb_peek(&sk->write_queue));
669 else
670 tcp_fack_retransmit(sk);
671 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
673 } else if (++tp->dup_acks > 3) {
674 /* 2. Each time another duplicate ACK arrives, increment
675 * cwnd by the segment size. [...] Transmit a packet...
677 * Packet transmission will be done on normal flow processing
678 * since we're not in "retransmit mode". We do not use
679 * duplicate ACKs to artificially inflate the congestion
680 * window when doing FACK.
682 if(!tp->fackets_out) {
683 tp->snd_cwnd++;
684 } else {
685 /* Fill any further holes which may have
686 * appeared.
688 * We may want to change this to run every
689 * further multiple-of-3 dup ack increments,
690 * to be more robust against out-of-order
691 * packet delivery. -DaveM
693 tcp_fack_retransmit(sk);
696 } else if (tp->high_seq != 0) {
697 /* In this branch we deal with clearing the Floyd style
698 * block on duplicate fast retransmits, and if requested
699 * we do Hoe style secondary fast retransmits.
701 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
702 /* Once we have acked all the packets up to high_seq
703 * we are done this fast retransmit phase.
704 * Alternatively, data arrived. In this case we
705 * have to abort the fast retransmit attempt.
706 * Note that we do want to accept a window
707 * update since this is expected with Hoe's algorithm.
709 clear_fast_retransmit(tp);
711 /* After we have cleared up to high_seq we can
712 * clear the Floyd style block.
714 if (!before(ack, tp->high_seq)) {
715 tp->high_seq = 0;
716 tp->fackets_out = 0;
718 } else if (tp->dup_acks >= 3) {
719 if (!tp->fackets_out) {
720 /* Hoe Style. We didn't ack the whole
721 * window. Take this as a cue that
722 * another packet was lost and retransmit it.
723 * Don't muck with the congestion window here.
724 * Note that we have to be careful not to
725 * act if this was a window update and it
726 * didn't ack new data, since this does
727 * not indicate a packet left the system.
728 * We can test this by just checking
729 * if ack changed from snd_una, since
730 * the only way to get here without advancing
731 * from snd_una is if this was a window update.
733 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
734 tcp_retransmit_skb(sk,
735 skb_peek(&sk->write_queue));
736 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
738 } else {
739 /* FACK style, fill any remaining holes in
740 * receiver's queue.
742 tcp_fack_retransmit(sk);
748 /* This is Jacobson's slow start and congestion avoidance.
749 * SIGCOMM '88, p. 328.
751 static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
753 if (tp->snd_cwnd <= tp->snd_ssthresh) {
754 /* In "safe" area, increase. */
755 tp->snd_cwnd++;
756 } else {
757 /* In dangerous area, increase slowly.
758 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
760 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
761 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
762 tp->snd_cwnd++;
763 tp->snd_cwnd_cnt=0;
764 } else
765 tp->snd_cwnd_cnt++;
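/* A small worked example (illustrative numbers): with snd_ssthresh = 8
 * and one ACK per segment,
 *
 *	slow start:		cwnd 1 -> 2 -> 4 -> 8	(+1 per ACK, i.e.
 *				roughly doubling each round trip)
 *	congestion avoidance:	snd_cwnd_cnt counts ACKs and cwnd grows by
 *				one only when the count reaches cwnd, i.e.
 *				cwnd 8 -> 9 -> 10 ...  about +1 per RTT
 *
 * which is the linear approximation of cwnd += 1/cwnd mentioned in the
 * comment above, bounded by snd_cwnd_clamp.
 */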
769 /* Remove acknowledged frames from the retransmission queue. */
770 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
771 __u32 *seq, __u32 *seq_rtt)
773 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
774 struct sk_buff *skb;
775 __u32 now = tcp_time_stamp;
776 int acked = 0;
778 /* If we are retransmitting, and this ACK clears up to
779 * the retransmit head, or further, then clear our state.
781 if (tp->retrans_head != NULL &&
782 !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
783 tp->retrans_head = NULL;
785 while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
786 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
787 __u8 sacked = scb->sacked;
789 /* If our packet is before the ack sequence we can
790 * discard it as it's confirmed to have arrived at
791 * the other end.
793 if (after(scb->end_seq, ack))
794 break;
796 /* Initial outgoing SYN's get put onto the write_queue
797 * just like anything else we transmit. It is not
798 * true data, and if we misinform our callers that
799 * this ACK acks real data, we will erroneously exit
800 * connection startup slow start one packet too
801 * quickly. This is severely frowned upon behavior.
803 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
804 tp->retrans_out--;
805 if(!(scb->flags & TCPCB_FLAG_SYN)) {
806 acked |= FLAG_DATA_ACKED;
807 if(sacked & TCPCB_SACKED_RETRANS)
808 acked |= FLAG_RETRANS_DATA_ACKED;
809 if(tp->fackets_out)
810 tp->fackets_out--;
811 } else {
812 acked |= FLAG_SYN_ACKED;
813 /* This is pure paranoia. */
814 tp->retrans_head = NULL;
816 tp->packets_out--;
817 *seq = scb->seq;
818 *seq_rtt = now - scb->when;
819 __skb_unlink(skb, skb->list);
820 kfree_skb(skb);
822 return acked;
825 static void tcp_ack_probe(struct sock *sk, __u32 ack)
827 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
829 /* Our probe was answered. */
830 tp->probes_out = 0;
832 /* Did the ack open a usable window? */
834 /* should always be non-null */
835 if (tp->send_head != NULL &&
836 !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
837 tp->backoff = 0;
838 tp->pending = 0;
839 tcp_clear_xmit_timer(sk, TIME_PROBE0);
840 } else {
841 tcp_reset_xmit_timer(sk, TIME_PROBE0,
842 min(tp->rto << tp->backoff, 120*HZ));
846 /* Should we open up the congestion window? */
847 static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
849 /* Data must have been acked. */
850 if ((flag & FLAG_DATA_ACKED) == 0)
851 return 0;
853 /* Some of the data acked was retransmitted somehow? */
854 if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
855 /* We advance in all cases except during
856 * non-FACK fast retransmit/recovery.
858 if (tp->fackets_out != 0 ||
859 tp->retransmits != 0)
860 return 1;
862 /* Non-FACK fast retransmit does its own
863 * congestion window management, don't get
864 * in the way.
866 return 0;
869 /* New non-retransmitted data acked, always advance. */
870 return 1;
873 /* Read draft-ietf-tcplw-high-performance before mucking
874 * with this code. (Supersedes RFC1323)
876 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
877 u32 seq, u32 ack, int flag)
879 __u32 seq_rtt;
881 /* RTTM Rule: A TSecr value received in a segment is used to
882 * update the averaged RTT measurement only if the segment
883 * acknowledges some new data, i.e., only if it advances the
884 * left edge of the send window.
886 * See draft-ietf-tcplw-high-performance-00, section 3.3.
887 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
889 if (!(flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)))
890 return;
892 seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
893 tcp_rtt_estimator(tp, seq_rtt);
894 if (tp->retransmits) {
895 if (tp->packets_out == 0) {
896 tp->retransmits = 0;
897 tp->fackets_out = 0;
898 tp->retrans_out = 0;
899 tp->backoff = 0;
900 tcp_set_rto(tp);
901 } else {
902 /* Still retransmitting, use backoff */
903 tcp_set_rto(tp);
904 tp->rto = tp->rto << tp->backoff;
906 } else {
907 tcp_set_rto(tp);
910 tcp_bound_rto(tp);
913 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
915 struct sk_buff *skb = skb_peek(&sk->write_queue);
917 /* Some data was ACK'd, if still retransmitting (due to a
918 * timeout), resend more of the retransmit queue. The
919 * congestion window is handled properly by that code.
921 if (tp->retransmits) {
922 tcp_xmit_retransmit_queue(sk);
923 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
924 } else {
925 __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
926 if ((__s32)when < 0)
927 when = 1;
928 tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
932 /* This routine deals with incoming acks, but not outgoing ones. */
933 static int tcp_ack(struct sock *sk, struct tcphdr *th,
934 u32 ack_seq, u32 ack, int len)
936 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
937 int flag = 0;
938 u32 seq = 0;
939 u32 seq_rtt = 0;
941 if(sk->zapped)
942 return(1); /* Dead, can't ack any more so why bother */
944 if (tp->pending == TIME_KEEPOPEN)
945 tp->probes_out = 0;
947 tp->rcv_tstamp = tcp_time_stamp;
949 /* If the ack is newer than sent or older than previous acks
950 * then we can probably ignore it.
952 if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
953 goto uninteresting_ack;
955 /* If the segment carries data, set FLAG_DATA. */
956 if (len != th->doff*4) {
957 flag |= FLAG_DATA;
958 tcp_delack_estimator(tp);
961 /* Update our send window. */
963 /* This is the window update code as per RFC 793
964 * snd_wl{1,2} are used to prevent unordered
965 * segments from shrinking the window
967 if (before(tp->snd_wl1, ack_seq) ||
968 (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
969 u32 nwin = ntohs(th->window) << tp->snd_wscale;
971 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
972 flag |= FLAG_WIN_UPDATE;
973 tp->snd_wnd = nwin;
975 tp->snd_wl1 = ack_seq;
976 tp->snd_wl2 = ack;
978 if (nwin > tp->max_window)
979 tp->max_window = nwin;
983 /* We passed data and got it acked, remove any soft error
984 * log. Something worked...
986 sk->err_soft = 0;
988 /* If this ack opens up a zero window, clear backoff. It was
989 * being used to time the probes, and is probably far higher than
990 * it needs to be for normal retransmission.
992 if (tp->pending == TIME_PROBE0)
993 tcp_ack_probe(sk, ack);
995 /* See if we can take anything off of the retransmit queue. */
996 flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
998 /* We must do this here, before code below clears out important
999 * state contained in tp->fackets_out and tp->retransmits. -DaveM
1001 if (should_advance_cwnd(tp, flag))
1002 tcp_cong_avoid(tp);
1004 /* If we have a timestamp, we always do rtt estimates. */
1005 if (tp->saw_tstamp) {
1006 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
1007 } else {
1008 /* If we were retransmitting, don't count the rtt estimate. */
1009 if (tp->retransmits) {
1010 if (tp->packets_out == 0) {
1011 tp->retransmits = 0;
1012 tp->fackets_out = 0;
1013 tp->retrans_out = 0;
1015 } else {
1016 /* We don't have a timestamp. Can only use
1017 * packets that are not retransmitted to determine
1018 * rtt estimates. Also, we must not reset the
1019 * backoff for rto until we get a non-retransmitted
1020 * packet. This allows us to deal with a situation
1021 * where the network delay has increased suddenly.
1022 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
1024 if (flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)) {
1025 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
1026 tp->backoff = 0;
1027 tcp_rtt_estimator(tp, seq_rtt);
1028 tcp_set_rto(tp);
1029 tcp_bound_rto(tp);
1035 if (tp->packets_out) {
1036 if (flag & FLAG_DATA_ACKED)
1037 tcp_ack_packets_out(sk, tp);
1038 } else {
1039 tcp_clear_xmit_timer(sk, TIME_RETRANS);
1042 flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
1043 if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
1044 (tp->high_seq != 0)) {
1045 tcp_fast_retrans(sk, ack, flag);
1046 } else {
1047 /* Clear any aborted fast retransmit starts. */
1048 tp->dup_acks = 0;
1050 /* It is not a brain fart, I thought a bit now. 8)
1052 * Forward progress is indicated, if:
1053 * 1. the ack acknowledges new data.
1054 * 2. or the ack is duplicate, but it is caused by new segment
1055 * arrival. This case is filtered by:
1056 * - it contains no data, syn or fin.
1057 * - it does not update window.
1058 * 3. or new SACK. It is difficult to check, so that we ignore it.
1060 * Forward progress is also indicated by the arrival of new data,
1061 * which was caused by a window open from our side. This case is more
1062 * difficult and it is handled (alas, incorrectly) in tcp_data_queue().
1063 * --ANK (990513)
1065 if (ack != tp->snd_una || (flag == 0 && !th->fin))
1066 dst_confirm(sk->dst_cache);
1068 /* Remember the highest ack received. */
1069 tp->snd_una = ack;
1070 return 1;
1072 uninteresting_ack:
1073 SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
1074 return 0;
1077 /* New-style handling of TIME_WAIT sockets. */
1079 /* Must be called only from BH context. */
1080 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
1082 struct tcp_ehash_bucket *ehead;
1083 struct tcp_bind_hashbucket *bhead;
1084 struct tcp_bind_bucket *tb;
1086 /* Unlink from established hashes. */
1087 ehead = &tcp_ehash[tw->hashent];
1088 write_lock(&ehead->lock);
1089 if (!tw->pprev) {
1090 write_unlock(&ehead->lock);
1091 return;
1093 if(tw->next)
1094 tw->next->pprev = tw->pprev;
1095 *(tw->pprev) = tw->next;
1096 tw->pprev = NULL;
1097 write_unlock(&ehead->lock);
1099 /* Disassociate with bind bucket. */
1100 bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
1101 spin_lock(&bhead->lock);
1102 if ((tb = tw->tb) != NULL) {
1103 if(tw->bind_next)
1104 tw->bind_next->bind_pprev = tw->bind_pprev;
1105 *(tw->bind_pprev) = tw->bind_next;
1106 tw->tb = NULL;
1107 if (tb->owners == NULL) {
1108 if (tb->next)
1109 tb->next->pprev = tb->pprev;
1110 *(tb->pprev) = tb->next;
1111 kmem_cache_free(tcp_bucket_cachep, tb);
1114 spin_unlock(&bhead->lock);
1116 #ifdef INET_REFCNT_DEBUG
1117 if (atomic_read(&tw->refcnt) != 1) {
1118 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
1120 #endif
1121 tcp_tw_put(tw);
1124 /* We come here as a special case from the AF specific TCP input processing,
1125 * and the SKB has no owner. Essentially handling this is very simple,
1126 * we just keep silently eating rx'd packets until none show up for the
1127 * entire timeout period. The only special cases are for BSD TIME_WAIT
1128 * reconnects and SYN/RST bits being set in the TCP header.
1132 * * Main purpose of TIME-WAIT state is to close connection gracefully,
1133 * when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN
1134 * (and, probably, a tail of data) and one or more of our ACKs are lost.
1135 * * What is TIME-WAIT timeout? It is associated with maximal packet
1136 * lifetime in the internet, which leads to the wrong conclusion that
1137 * it is set to catch "old duplicate segments" wandering out of their path.
1138 * That is not quite correct. This timeout is calculated so that it exceeds
1139 * the maximal retransmission timeout by enough to allow one (or more)
1140 * segments sent by the peer, and our ACKs, to be lost. This time may be calculated from the RTO.
1141 * * When TIME-WAIT socket receives RST, it means that another end
1142 * finally closed and we are allowed to kill TIME-WAIT too.
1143 * * Second purpose of TIME-WAIT is catching old duplicate segments.
1144 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
1145 * with these semantics, we MUST NOT kill TIME-WAIT state with RSTs.
1146 * * If we invented some more clever way to catch duplicates
1147 * (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
1149 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
1150 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
1151 * from the very beginning.
1153 enum tcp_tw_status
1154 tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
1155 struct tcphdr *th, unsigned len)
1157 struct tcp_opt tp;
1158 int paws_reject = 0;
1160 /* RFC 1122:
1161 * "When a connection is [...] on TIME-WAIT state [...]
1162 * [a TCP] MAY accept a new SYN from the remote TCP to
1163 * reopen the connection directly, if it:
1165 * (1) assigns its initial sequence number for the new
1166 * connection to be larger than the largest sequence
1167 * number it used on the previous connection incarnation,
1168 * and
1170 * (2) returns to TIME-WAIT state if the SYN turns out
1171 * to be an old duplicate".
1174 tp.saw_tstamp = 0;
1175 if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
1176 tcp_parse_options(NULL, th, &tp, 0);
1178 paws_reject = tp.saw_tstamp &&
1179 ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 &&
1180 xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS);
1183 if (!paws_reject &&
1184 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
1185 TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
1186 /* In window segment, it may be only reset or bare ack. */
1188 if (th->rst) {
1189 #ifdef CONFIG_TCP_TW_RECYCLE
1190 /* When recycling, always follow rfc1337,
1191 * but mark the bucket as ready for recycling immediately.
1193 if (sysctl_tcp_tw_recycle) {
1194 /* May kill it now. */
1195 tw->rto = 0;
1196 tw->ttd = jiffies;
1197 } else
1198 #endif
1199 /* This is TIME_WAIT assassination, in two flavors.
1200 * Oh well... nobody has a sufficient solution to this
1201 * protocol bug yet.
1203 if(sysctl_tcp_rfc1337 == 0) {
1204 tcp_tw_deschedule(tw);
1205 tcp_timewait_kill(tw);
1207 } else {
1208 tcp_tw_reschedule(tw);
1211 if (tp.saw_tstamp) {
1212 tw->ts_recent = tp.rcv_tsval;
1213 tw->ts_recent_stamp = xtime.tv_sec;
1215 tcp_tw_put(tw);
1216 return TCP_TW_SUCCESS;
1219 /* Out of window segment.
1221 All the segments are ACKed immediately.
1223 The only exception is a new SYN. We accept it if it is
1224 not an old duplicate and we are not in danger of being killed
1225 by delayed old duplicates. The RFC check (that it carries a
1226 newer sequence number) works at rates <40Mbit/sec.
1227 However, if PAWS works, it is reliable, and what is more,
1228 we may even relax the silly seq space cutoff.
1230 RED-PEN: we violate the main RFC requirement: if this SYN turns out to be an
1231 old duplicate (i.e. we receive an RST in reply to our SYN-ACK),
1232 we must return the socket to time-wait state. It is not good,
1233 but not fatal yet.
1236 if (th->syn && !th->rst && !th->ack && !paws_reject &&
1237 (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
1238 (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) {
1239 u32 isn = tw->snd_nxt + 2;
1240 if (isn == 0)
1241 isn++;
1242 TCP_SKB_CB(skb)->when = isn;
1243 return TCP_TW_SYN;
1246 if(!th->rst) {
1247 /* In this case we must reset the TIMEWAIT timer.
1249 If it is an ACKless SYN it may be either an old duplicate
1250 or a new good SYN with a random sequence number <rcv_nxt.
1251 Do not reschedule in the latter case.
1253 if (paws_reject || th->ack) {
1254 tcp_tw_reschedule(tw);
1255 #ifdef CONFIG_TCP_TW_RECYCLE
1256 tw->rto = min(120*HZ, tw->rto<<1);
1257 tw->ttd = jiffies + tw->rto;
1258 #endif
1261 /* Send ACK. Note, we do not put the bucket,
1262 * it will be released by caller.
1264 return TCP_TW_ACK;
1266 tcp_tw_put(tw);
1267 return TCP_TW_SUCCESS;
1270 /* Enter the time wait state. This is always called from BH
1271 * context. Essentially we whip up a timewait bucket, copy the
1272 * relevant info into it from the SK, and mess with hash chains
1273 * and list linkage.
1275 static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
1277 struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
1278 struct tcp_bind_hashbucket *bhead;
1279 struct sock **head, *sktw;
1281 write_lock(&ehead->lock);
1283 /* Step 1: Remove SK from established hash. */
1284 if (sk->pprev) {
1285 if(sk->next)
1286 sk->next->pprev = sk->pprev;
1287 *sk->pprev = sk->next;
1288 sk->pprev = NULL;
1291 /* Step 2: Hash TW into TIMEWAIT half of established hash table. */
1292 head = &(ehead + tcp_ehash_size)->chain;
1293 sktw = (struct sock *)tw;
1294 if((sktw->next = *head) != NULL)
1295 (*head)->pprev = &sktw->next;
1296 *head = sktw;
1297 sktw->pprev = head;
1298 atomic_inc(&tw->refcnt);
1300 write_unlock(&ehead->lock);
1302 /* Step 3: Put TW into bind hash. Original socket stays there too.
1303 Note that any socket with sk->num!=0 MUST be bound in the binding
1304 cache, even if it is closed.
1306 bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
1307 spin_lock(&bhead->lock);
1308 tw->tb = (struct tcp_bind_bucket *)sk->prev;
1309 BUG_TRAP(sk->prev!=NULL);
1310 if ((tw->bind_next = tw->tb->owners) != NULL)
1311 tw->tb->owners->bind_pprev = &tw->bind_next;
1312 tw->tb->owners = (struct sock*)tw;
1313 tw->bind_pprev = &tw->tb->owners;
1314 spin_unlock(&bhead->lock);
1316 /* Step 4: Un-charge protocol socket in-use count. */
1317 sk->prot->inuse--;
1321 * Move a socket to time-wait.
1323 void tcp_time_wait(struct sock *sk)
1325 struct tcp_tw_bucket *tw;
1327 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
1328 if(tw != NULL) {
1329 /* Give us an identity. */
1330 tw->daddr = sk->daddr;
1331 tw->rcv_saddr = sk->rcv_saddr;
1332 tw->bound_dev_if= sk->bound_dev_if;
1333 tw->num = sk->num;
1334 tw->state = TCP_TIME_WAIT;
1335 tw->sport = sk->sport;
1336 tw->dport = sk->dport;
1337 tw->family = sk->family;
1338 tw->reuse = sk->reuse;
1339 tw->hashent = sk->hashent;
1340 tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
1341 tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt;
1342 tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent;
1343 tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp;
1344 #ifdef CONFIG_TCP_TW_RECYCLE
1345 tw->rto = sk->tp_pinfo.af_tcp.rto;
1346 tw->ttd = jiffies + 2*tw->rto;
1347 #endif
1348 atomic_set(&tw->refcnt, 0);
1350 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1351 if(tw->family == PF_INET6) {
1352 memcpy(&tw->v6_daddr,
1353 &sk->net_pinfo.af_inet6.daddr,
1354 sizeof(struct in6_addr));
1355 memcpy(&tw->v6_rcv_saddr,
1356 &sk->net_pinfo.af_inet6.rcv_saddr,
1357 sizeof(struct in6_addr));
1359 #endif
1360 /* Linkage updates. */
1361 __tcp_tw_hashdance(sk, tw);
1363 /* Get the TIME_WAIT timeout firing. */
1364 tcp_tw_schedule(tw);
1366 /* CLOSE the SK. */
1367 if(sk->state == TCP_ESTABLISHED)
1368 tcp_statistics.TcpCurrEstab--;
1369 sk->state = TCP_CLOSE;
1370 } else {
1371 /* Sorry, we're out of memory, just CLOSE this
1372 * socket up. We've got bigger problems than
1373 * non-graceful socket closings.
1375 tcp_set_state(sk, TCP_CLOSE);
1378 tcp_update_metrics(sk);
1379 tcp_clear_xmit_timers(sk);
1380 tcp_done(sk);
1384 * Process the FIN bit. This now behaves as it is supposed to work
1385 * and the FIN takes effect only when it is validly part of the sequence
1386 * space, not before, while there are still holes.
1388 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1389 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1390 * TIME-WAIT)
1392 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1393 * close and we go into CLOSING (and later onto TIME-WAIT)
1395 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1398 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1400 sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1402 tcp_send_ack(sk);
1404 if (!sk->dead) {
1405 wake_up_interruptible(sk->sleep);
1406 sock_wake_async(sk->socket, 1);
1409 switch(sk->state) {
1410 case TCP_SYN_RECV:
1411 case TCP_ESTABLISHED:
1412 /* Move to CLOSE_WAIT */
1413 tcp_set_state(sk, TCP_CLOSE_WAIT);
1414 break;
1416 case TCP_CLOSE_WAIT:
1417 case TCP_CLOSING:
1418 /* Received a retransmission of the FIN, do
1419 * nothing.
1421 break;
1422 case TCP_LAST_ACK:
1423 /* RFC793: Remain in the LAST-ACK state. */
1424 break;
1426 case TCP_FIN_WAIT1:
1427 /* This case occurs when a simultaneous close
1428 * happens, we must ack the received FIN and
1429 * enter the CLOSING state.
1431 tcp_set_state(sk, TCP_CLOSING);
1432 break;
1433 case TCP_FIN_WAIT2:
1434 /* Received a FIN -- send ACK and enter TIME_WAIT. */
1435 tcp_time_wait(sk);
1436 break;
1437 default:
1438 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1439 * cases we should never reach this piece of code.
1441 printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1442 break;
1446 /* These routines update the SACK block as out-of-order packets arrive or
1447 * in-order packets close up the sequence space.
1449 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1451 int this_sack, num_sacks = tp->num_sacks;
1452 struct tcp_sack_block *swalk = &tp->selective_acks[0];
1454 /* If more than one SACK block, see if the recent change to SP eats into
1455 * or hits the sequence space of other SACK blocks, if so coalesce.
1457 if(num_sacks != 1) {
1458 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1459 if(swalk == sp)
1460 continue;
1462 /* First case, bottom of SP moves into top of the
1463 * sequence space of SWALK.
1465 if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1466 sp->start_seq = swalk->start_seq;
1467 goto coalesce;
1469 /* Second case, top of SP moves into bottom of the
1470 * sequence space of SWALK.
1472 if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1473 sp->end_seq = swalk->end_seq;
1474 goto coalesce;
1478 /* SP is the only SACK, or no coalescing cases found. */
1479 return;
1481 coalesce:
1482 /* Zap SWALK, by moving every further SACK up by one slot.
1483 * Decrease num_sacks.
1485 for(; this_sack < num_sacks-1; this_sack++, swalk++) {
1486 struct tcp_sack_block *next = (swalk + 1);
1487 swalk->start_seq = next->start_seq;
1488 swalk->end_seq = next->end_seq;
1490 tp->num_sacks--;
1493 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1495 __u32 tmp;
1497 tmp = sack1->start_seq;
1498 sack1->start_seq = sack2->start_seq;
1499 sack2->start_seq = tmp;
1501 tmp = sack1->end_seq;
1502 sack1->end_seq = sack2->end_seq;
1503 sack2->end_seq = tmp;
1506 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1508 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1509 struct tcp_sack_block *sp = &tp->selective_acks[0];
1510 int cur_sacks = tp->num_sacks;
1512 if (!cur_sacks)
1513 goto new_sack;
1515 /* Optimize for the common case, new ofo frames arrive
1516 * "in order". ;-) This also satisfies the requirements
1517 * of RFC2018 about ordering of SACKs.
1519 if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1520 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1521 tcp_sack_maybe_coalesce(tp, sp);
1522 } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1523 /* Re-ordered arrival, in this case, can be optimized
1524 * as well.
1526 sp->start_seq = TCP_SKB_CB(skb)->seq;
1527 tcp_sack_maybe_coalesce(tp, sp);
1528 } else {
1529 struct tcp_sack_block *swap = sp + 1;
1530 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1532 /* Oh well, we have to move things around.
1533 * Try to find a SACK we can tack this onto.
1536 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1537 if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1538 (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1539 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1540 swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1541 else
1542 swap->start_seq = TCP_SKB_CB(skb)->seq;
1543 tcp_sack_swap(sp, swap);
1544 tcp_sack_maybe_coalesce(tp, sp);
1545 return;
1549 /* Could not find an adjacent existing SACK, build a new one,
1550 * put it at the front, and shift everyone else down. We
1551 * always know there is at least one SACK present already here.
1553 * If the sack array is full, forget about the last one.
1555 if (cur_sacks >= max_sacks) {
1556 cur_sacks--;
1557 tp->num_sacks--;
1559 while(cur_sacks >= 1) {
1560 struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1561 struct tcp_sack_block *prev = (this - 1);
1562 this->start_seq = prev->start_seq;
1563 this->end_seq = prev->end_seq;
1564 cur_sacks--;
1567 new_sack:
1568 /* Build the new head SACK, and we're done. */
1569 sp->start_seq = TCP_SKB_CB(skb)->seq;
1570 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1571 tp->num_sacks++;
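/* Where the (tp->tstamp_ok ? 3 : 4) limit above comes from (a sketch of
 * the option-space arithmetic, not a quote from the code): a TCP header
 * has at most 40 bytes of options, and the SACK option costs 2 bytes of
 * kind/length plus 8 bytes per block.  Without timestamps that leaves
 * room for (40 - 2) / 8 = 4 blocks; with the 12-byte aligned timestamp
 * option only (40 - 12 - 2) / 8 = 3 blocks fit, which also matches the
 * RFC 2018 guidance.
 */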
1575 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1577 struct tcp_sack_block *sp = &tp->selective_acks[0];
1578 int num_sacks = tp->num_sacks;
1579 int this_sack;
1581 /* This is an in order data segment _or_ an out-of-order SKB being
1582 * moved to the receive queue, so we know this removed SKB will eat
1583 * from the front of a SACK.
1585 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1586 /* Check if the start of the sack is covered by skb. */
1587 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1588 before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1589 break;
1592 /* This should only happen if so many SACKs get built that some get
1593 * pushed out before we get here, or we eat some in sequence packets
1594 * which are before the first SACK block.
1596 if(this_sack >= num_sacks)
1597 return;
1599 sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1600 if(!before(sp->start_seq, sp->end_seq)) {
1601 /* Zap this SACK, by moving forward any other SACKS. */
1602 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1603 struct tcp_sack_block *next = (sp + 1);
1604 sp->start_seq = next->start_seq;
1605 sp->end_seq = next->end_seq;
1607 tp->num_sacks--;
1611 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1613 struct tcp_sack_block *sp = &tp->selective_acks[0];
1614 int num_sacks = tp->num_sacks;
1615 int this_sack;
1617 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1618 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1619 break;
1621 if(this_sack >= num_sacks)
1622 return;
1623 sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1626 /* This one checks to see if we can put data from the
1627 * out_of_order queue into the receive_queue.
1629 static void tcp_ofo_queue(struct sock *sk)
1631 struct sk_buff *skb;
1632 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1634 while ((skb = skb_peek(&tp->out_of_order_queue))) {
1635 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1636 break;
1638 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1639 SOCK_DEBUG(sk, "ofo packet was already received \n");
1640 __skb_unlink(skb, skb->list);
1641 kfree_skb(skb);
1642 continue;
1644 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1645 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1646 TCP_SKB_CB(skb)->end_seq);
1648 if(tp->sack_ok)
1649 tcp_sack_remove_skb(tp, skb);
1650 __skb_unlink(skb, skb->list);
1651 __skb_queue_tail(&sk->receive_queue, skb);
1652 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1653 if(skb->h.th->fin)
1654 tcp_fin(skb, sk, skb->h.th);
1658 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1660 struct sk_buff *skb1;
1661 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1663 /* Queue data for delivery to the user.
1664 * Packets in sequence go to the receive queue.
1665 * Out of sequence packets to the out_of_order_queue.
1667 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1668 /* Ok. In sequence. */
1669 queue_and_out:
1670 dst_confirm(sk->dst_cache);
1671 __skb_queue_tail(&sk->receive_queue, skb);
1672 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1673 if(skb->h.th->fin) {
1674 tcp_fin(skb, sk, skb->h.th);
1675 } else {
1676 tcp_remember_ack(tp, skb->h.th, skb);
1678 /* This may have eaten into a SACK block. */
1679 if(tp->sack_ok && tp->num_sacks)
1680 tcp_sack_remove_skb(tp, skb);
1681 tcp_ofo_queue(sk);
1683 /* Turn on fast path. */
1684 if (skb_queue_len(&tp->out_of_order_queue) == 0)
1685 tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1686 ntohl(TCP_FLAG_ACK) |
1687 tp->snd_wnd);
1688 return;
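	/* What the prediction word set above encodes (the sample value is
	 * illustrative): pred_flags mirrors the 4th 32-bit word of the TCP
	 * header we expect next, so the receive fast path can test it with
	 * a single compare.  In host order, before the htonl():
	 *
	 *	bits 28..31	data offset in 32-bit words
	 *	bit  20		ACK (ntohl(TCP_FLAG_ACK) == 0x00100000)
	 *	bits 0..15	tp->snd_wnd, the window value we expect the
	 *			peer to keep advertising
	 *
	 * e.g. a 32-byte header (timestamps) and a window of 32000 give
	 * 0x80107d00; a segment whose header word does not match is handed
	 * to the slow path instead.
	 */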
1691 /* An old packet, either a retransmit or some packet got lost. */
1692 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1693 /* A retransmit, 2nd most common case. Force an immediate ack. */
1694 SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1695 tcp_enter_quickack_mode(tp);
1696 kfree_skb(skb);
1697 return;
1700 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1701 /* Partial packet, seq < rcv_next < end_seq */
1702 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1703 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1704 TCP_SKB_CB(skb)->end_seq);
1706 goto queue_and_out;
1709 /* Ok. This is an out_of_order segment, force an ack. */
1710 tp->delayed_acks++;
1711 tcp_enter_quickack_mode(tp);
1713 /* Disable header prediction. */
1714 tp->pred_flags = 0;
1716 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1717 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1719 if (skb_peek(&tp->out_of_order_queue) == NULL) {
1720 /* Initial out of order segment, build 1 SACK. */
1721 if(tp->sack_ok) {
1722 tp->num_sacks = 1;
1723 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1724 tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1726 __skb_queue_head(&tp->out_of_order_queue,skb);
1727 } else {
1728 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1729 /* Already there. */
1730 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1731 if (skb->len >= skb1->len) {
1732 if(tp->sack_ok)
1733 tcp_sack_extend(tp, skb1, skb);
1734 __skb_append(skb1, skb);
1735 __skb_unlink(skb1, skb1->list);
1736 kfree_skb(skb1);
1737 } else {
1738 /* A duplicate, smaller than what is in the
1739 * out-of-order queue right now, toss it.
1741 kfree_skb(skb);
1743 break;
1746 if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1747 __skb_append(skb1, skb);
1748 if(tp->sack_ok)
1749 tcp_sack_new_ofo_skb(sk, skb);
1750 break;
1753 /* See if we've hit the start. If so insert. */
1754 if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1755 __skb_queue_head(&tp->out_of_order_queue,skb);
1756 if(tp->sack_ok)
1757 tcp_sack_new_ofo_skb(sk, skb);
1758 break;
1766 * This routine handles the data. If there is room in the buffer,
1767 * it will have already been moved into it. If there is no
1768 * room, then we will just have to discard the packet.
1771 static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1773 struct tcphdr *th;
1774 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1776 th = skb->h.th;
1777 skb_pull(skb, th->doff*4);
1778 skb_trim(skb, len - (th->doff*4));
1780 if (skb->len == 0 && !th->fin)
1781 return(0);
1784 * If our receive queue has grown past its limits shrink it.
1785 * Make sure to do this before moving snd_nxt, otherwise
1786 * data might be acked that we don't have enough room for.
1788 if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
1789 if (prune_queue(sk) < 0) {
1790 /* Still not enough room. That can happen when
1791 * skb->truesize differs significantly from skb->len.
1793 return 0;
1797 tcp_data_queue(sk, skb);
1799 if (before(tp->rcv_nxt, tp->copied_seq)) {
1800 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1801 tp->rcv_nxt = tp->copied_seq;
1804 /* Above, tcp_data_queue() increments delayed_acks appropriately.
1805 * Now tell the user we may have some data.
1807 if (!sk->dead) {
1808 wake_up_interruptible(sk->sleep);
1809 sock_wake_async(sk->socket,1);
1811 return(1);
1814 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1816 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1818 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1819 tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1820 /* Put more data onto the wire. */
1821 tcp_write_xmit(sk);
1822 } else if (tp->packets_out == 0 && !tp->pending) {
1823 /* Start probing the receivers window. */
1824 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1828 static __inline__ void tcp_data_snd_check(struct sock *sk)
1830 struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1832 if (skb != NULL)
1833 __tcp_data_snd_check(sk, skb);
1837 * Adapt the MSS value used to make delayed ack decision to the
1838 * real world.
1840 * The constant 536 has no particularly good meaning. In the IPv4 world
1841 * the MTU may be smaller, though that contradicts RFC1122, which
1842 * states that the MSS must be at least 536.
1843 * We use the constant so that we do not ACK every second
1844 * packet in a stream of tiny packets.
1845 * It means that super-low mtu links will be aggressively delacked.
1846 * That even seems good: if they have such a low mtu, they are weirdly
1847 * slow.
1849 * AK: BTW it may be useful to add an option to lock the rcv_mss.
1850 * This way the beowulf people wouldn't need ugly patches to get the
1851 * ack frequencies they want and it would be an elegant way to tune delack.
1853 static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1855 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1856 unsigned int len, lss;
1858 lss = tp->last_seg_size;
1859 tp->last_seg_size = 0;
1861 /* skb->len may jitter because of SACKs, even if peer
1862 * sends good full-sized frames.
1864 len = skb->len;
1865 if (len >= tp->rcv_mss) {
1866 tp->rcv_mss = len;
1867 } else {
1868 /* Otherwise, we make a more careful check, taking into account
1869 * that the SACK block is variable.
1871 * "len" is invariant segment length, including TCP header.
1873 len = skb->tail - skb->h.raw;
1874 if (len >= 536 + sizeof(struct tcphdr)) {
1875 /* Also subtract the invariant part (if the peer is RFC compliant):
1876 * the tcp header plus fixed timestamp option length.
1877 * Resulting "len" is MSS free of SACK jitter.
1879 len -= tp->tcp_header_len;
1880 if (len == lss)
1881 tp->rcv_mss = len;
1882 tp->last_seg_size = len;
1888 * Check if sending an ack is needed.
1890 static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
1892 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1894 /* This also takes care of updating the window.
1895 * This if statement needs to be simplified.
1897 * Rules for delaying an ack:
1898 * - delay time <= 0.5 HZ
1899 * - we don't have a window update to send
1900 * - must send at least every 2 full sized packets
1901 * - must send an ACK if we have any out of order data
1903 * With an extra heuristic to handle packet loss
1904 * situations, and also to help the sender leave slow
1905 * start in an expedient manner.
1908 /* Two full frames received or... */
1909 if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1910 /* We will update the window "significantly" or... */
1911 tcp_raise_window(sk) ||
1912 /* We entered "quick ACK" mode or... */
1913 tcp_in_quickack_mode(tp) ||
1914 /* We have out of order data */
1915 (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) {
1916 /* Then ack it now */
1917 tcp_send_ack(sk);
1918 } else {
1919 /* Else, send delayed ack. */
1920 tcp_send_delayed_ack(sk, HZ/2);
1924 static __inline__ void tcp_ack_snd_check(struct sock *sk)
1926 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1927 if (tp->delayed_acks == 0) {
1928 /* We sent a data segment already. */
1929 return;
1931 __tcp_ack_snd_check(sk, 1);
1936 * This routine is only called when we have urgent data
1937 * signalled. It's the 'slow' part of tcp_urg. It could be
1938 * moved inline now, as tcp_urg is only called from one
1939 * place. We handle URGent data wrong. We have to - as
1940 * BSD still doesn't use the correction from RFC961.
1941 * For 1003.1g we should support a new option TCP_STDURG to permit
1942 * either form (or just set the sysctl tcp_stdurg).
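 * Concretely: with sysctl_tcp_stdurg == 0 (the default, BSD behaviour)
 * the incoming urgent pointer is treated as pointing one byte past the
 * urgent byte, so tcp_check_urg() decrements it; with the sysctl set
 * the pointer is used as-is, pointing at the urgent byte itself.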
1945 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1947 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1948 u32 ptr = ntohs(th->urg_ptr);
1950 if (ptr && !sysctl_tcp_stdurg)
1951 ptr--;
1952 ptr += ntohl(th->seq);
1954 /* Ignore urgent data that we've already seen and read. */
1955 if (after(tp->copied_seq, ptr))
1956 return;
1958 /* Do we already have a newer (or duplicate) urgent pointer? */
1959 if (tp->urg_data && !after(ptr, tp->urg_seq))
1960 return;
1962 /* Tell the world about our new urgent pointer. */
1963 if (sk->proc != 0) {
1964 if (sk->proc > 0)
1965 kill_proc(sk->proc, SIGURG, 1);
1966 else
1967 kill_pg(-sk->proc, SIGURG, 1);
1970 /* We may be adding urgent data when the last byte read was
1971 * urgent. To do this requires some care. We cannot just ignore
1972 * tp->copied_seq since we would read the last urgent byte again
1973 * as data, nor can we alter copied_seq until this data arrives
1974 * or we break the semantics of SIOCATMARK (and thus sockatmark())
1976 if (tp->urg_seq == tp->copied_seq)
1977 tp->copied_seq++; /* Move the copied sequence on correctly */
1978 tp->urg_data = URG_NOTYET;
1979 tp->urg_seq = ptr;
1981 /* Disable header prediction. */
1982 tp->pred_flags = 0;
1985 /* This is the 'fast' part of urgent handling. */
1986 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1988 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1990 /* Check if we get a new urgent pointer - normally not. */
1991 if (th->urg)
1992 tcp_check_urg(sk,th);
1994 /* Do we wait for any urgent data? - normally not... */
1995 if (tp->urg_data == URG_NOTYET) {
1996 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
1998 /* Is the urgent pointer pointing into this packet? */
1999 if (ptr < len) {
2000 tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
2001 if (!sk->dead)
2002 sk->data_ready(sk,0);
2007 /* Clean the out_of_order queue if we can, trying to get
2008 * the socket within its memory limits again.
2010 * Return less than zero if we should start dropping frames
2011 * until the socket owning process reads some of the data
2012 * to stabilize the situation.
2014 static int prune_queue(struct sock *sk)
2016 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2017 struct sk_buff * skb;
2019 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
2021 net_statistics.PruneCalled++;
2023 /* First, purge the out_of_order queue. */
2024 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2025 if(skb != NULL) {
2026 /* Free it all. */
2027 do { net_statistics.OfoPruned += skb->len;
2028 kfree_skb(skb);
2029 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2030 } while(skb != NULL);
2032 /* Reset SACK state. A conforming SACK implementation will
2033 * do the same at a timeout-based retransmit. When a connection
2034 * is in a sad state like this, we care only about the integrity
2035 * of the connection, not performance.
2037 if(tp->sack_ok)
2038 tp->num_sacks = 0;
2041 /* If we are really being abused, tell the caller to silently
2042 * drop receive data on the floor. It will get retransmitted
2043 * and hopefully then we'll have sufficient space.
2045 * We used to try to purge the in-order packets too, but that
2046 * turns out to be deadly and fraught with races. Consider:
2048 * 1) If we acked the data, we absolutely cannot drop the
2049 * packet. This data would then never be retransmitted.
2050 * 2) It is possible, with a proper sequence of events involving
2051 * delayed acks and backlog queue handling, to have the user
2052 * read the data before it gets acked. The previous code
2053 * here got this wrong, and it led to data corruption.
2054 * 3) Too many state changes happen when the FIN arrives, so once
2055 * we've seen that we can't remove any in-order data safely.
2057 * The net result is that removing in-order receive data is too
2058 * complex for anyone's sanity. So we don't do it anymore. But
2059 * if we are really having our buffer space abused, we stop accepting
2060 * new receive data.
2062 * FIXME: it should recompute SACK state and only remove enough
2063 * buffers to get into bounds again. The current scheme loses
2064 * badly sometimes on links with large RTT, especially when
2065 * the driver has high overhead per skb.
2066 * (increasing the rcvbuf is not enough because it inflates
2067 * the window too, effectively disabling flow control) -AK
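/* The check below tolerates up to twice sk->rcvbuf of in-order data
 * before telling the caller to start dropping new segments.
 */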
2069 if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
2070 return 0;
2072 /* Massive buffer overcommit. */
2073 return -1;
2077 * TCP receive function for the ESTABLISHED state.
2079 * It is split into a fast path and a slow path. The fast path is
2080 * disabled when:
2081 * - A zero window was announced from us - zero window probing
2082 * is only handled properly in the slow path.
2083 * - Out of order segments arrived.
2084 * - Urgent data is expected.
2085 * - There is no buffer space left
2086 * - Unexpected TCP flags/window values/header lengths are received
2087 * (detected by checking the TCP header against pred_flags)
2088 * - Data is sent in both directions. Fast path only supports pure senders
2089 * or pure receivers (this means either the sequence number or the ack
2090 * value must stay constant)
2091 * - Unexpected TCP option.
2093 * When these conditions are not satisfied it drops into a standard
2094 * receive procedure patterned after RFC793 to handle all cases.
2095 * The first three cases are guaranteed by proper pred_flags setting,
2096 * the rest is checked inline. Fast processing is turned on in
2097 * tcp_data_queue when everything is OK.
2099 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
2100 struct tcphdr *th, unsigned len)
2102 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2105 * Header prediction.
2106 * The code loosely follows the one in the famous
2107 * "30 instruction TCP receive" Van Jacobson mail.
2109 * Van's trick is to deposit buffers into the socket queue
2110 * on a device interrupt, and to call the tcp_recv function
2111 * in the receiving process's context to checksum and copy
2112 * the buffer to user space. Smart...
2114 * Our current scheme is not silly either, but we take the
2115 * extra cost of the net_bh soft interrupt processing...
2116 * We also do checksum and copy, but from device to kernel.
2120 /* RED-PEN. Using static variables to pass function arguments
2121 * cannot be a good idea...
2123 tp->saw_tstamp = 0;
2125 /* pred_flags is 0xS?10 << 16 + snd_wnd
2126 * if header prediction is to be made.
2127 * 'S' will always be tp->tcp_header_len >> 2
2128 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
2129 * turn it off (when there are holes in the receive
2130 * space, for instance).
2131 * The PSH flag is ignored.
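 * For example, with a plain 20-byte header (S == 5), reserved bits
 * zero and only the ACK flag set, pred_flags would be
 * (0x5010 << 16) + snd_wnd.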
2134 if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags &&
2135 TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
2136 int tcp_header_len = th->doff*4;
2138 /* Timestamp header prediction */
2140 /* Non-standard header, e.g. SACKs -> slow path */
2141 if (tcp_header_len != tp->tcp_header_len)
2142 goto slow_path;
2144 /* Check timestamp */
2145 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
2146 __u32 *ptr = (__u32 *)(th + 1);
2148 /* No? Slow path! */
2149 if (*ptr != __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
2150 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
2151 goto slow_path;
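/* The 32-bit word tested above is the canonical aligned timestamp
 * option: NOP, NOP, kind TCPOPT_TIMESTAMP, length TCPOLEN_TIMESTAMP,
 * followed by the TSval and TSecr values read below.
 */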
2153 tp->saw_tstamp = 1;
2154 ++ptr;
2155 tp->rcv_tsval = ntohl(*ptr);
2156 ++ptr;
2157 tp->rcv_tsecr = ntohl(*ptr);
2159 /* If PAWS failed, check it more carefully in slow path */
2160 if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0)
2161 goto slow_path;
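/* The signed 32-bit difference above is the PAWS comparison done
 * modulo 2^32, so it stays correct when the peer's timestamp clock wraps.
 */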
2163 /* Predicted packet is in window by definition.
2164 seq == rcv_nxt and last_ack_sent <= rcv_nxt.
2165 Hence, check seq<=last_ack_sent reduces to:
2167 if (tp->rcv_nxt == tp->last_ack_sent) {
2168 tp->ts_recent = tp->rcv_tsval;
2169 tp->ts_recent_stamp = xtime.tv_sec;
2173 if (len <= tcp_header_len) {
2174 /* Bulk data transfer: sender */
2175 if (len == tcp_header_len) {
2176 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2177 TCP_SKB_CB(skb)->ack_seq, len);
2178 kfree_skb(skb);
2179 tcp_data_snd_check(sk);
2180 return 0;
2181 } else { /* Header too small */
2182 tcp_statistics.TcpInErrs++;
2183 goto discard;
2185 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
2186 atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
2187 /* Bulk data transfer: receiver */
2188 __skb_pull(skb,tcp_header_len);
2190 /* Is it possible to simplify this? */
2191 tcp_measure_rcv_mss(sk, skb);
2193 /* DO NOT notify forward progress here.
2194 * It saves a dozen CPU instructions in the fast path. --ANK
2195 * And where is it signaled then? -AK
2197 __skb_queue_tail(&sk->receive_queue, skb);
2198 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
2200 /* FIN bit check is not done since if FIN is set in
2201 * this frame, the pred_flags won't match up. -DaveM
2203 wake_up_interruptible(sk->sleep);
2204 sock_wake_async(sk->socket,1);
2205 tcp_delack_estimator(tp);
2207 tcp_remember_ack(tp, th, skb);
2209 __tcp_ack_snd_check(sk, 0);
2210 return 0;
2212 /* Packet is in sequence, flags are trivial;
2213 * only ACK is strange or we are tough on memory.
2214 * Jump to step 5.
2216 goto step5;
2219 slow_path:
2221 * RFC1323: H1. Apply PAWS check first.
2223 if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
2224 tcp_paws_discard(tp, skb)) {
2225 if (!th->rst) {
2226 tcp_send_ack(sk);
2227 goto discard;
2229 /* Resets are accepted even if PAWS failed.
2231 ts_recent update must be made after we are sure
2232 that the packet is in window.
2237 * Standard slow path.
2240 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
2241 /* RFC793, page 37: "In all states except SYN-SENT, all reset
2242 * (RST) segments are validated by checking their SEQ-fields."
2243 * And page 69: "If an incoming segment is not acceptable,
2244 * an acknowledgment should be sent in reply (unless the RST bit
2245 * is set, if so drop the segment and return)".
2247 if (th->rst)
2248 goto discard;
2249 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
2250 SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
2251 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2252 tp->rcv_wup, tp->rcv_wnd);
2254 tcp_send_ack(sk);
2255 goto discard;
2258 if(th->rst) {
2259 tcp_reset(sk);
2260 goto discard;
2263 if (tp->saw_tstamp) {
2264 tcp_replace_ts_recent(sk, tp,
2265 TCP_SKB_CB(skb)->seq);
2268 if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2269 SOCK_DEBUG(sk, "syn in established state\n");
2270 tcp_statistics.TcpInErrs++;
2271 tcp_reset(sk);
2272 return 1;
2275 step5:
2276 if(th->ack)
2277 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
2279 /* Process urgent data. */
2280 tcp_urg(sk, th, len);
2283 /* step 7: process the segment text */
2284 int queued = tcp_data(skb, sk, len);
2286 tcp_measure_rcv_mss(sk, skb);
2288 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
2289 if(sk->state != TCP_CLOSE) {
2290 tcp_data_snd_check(sk);
2291 tcp_ack_snd_check(sk);
2294 if (!queued) {
2295 discard:
2296 kfree_skb(skb);
2300 return 0;
2304 /* This is not only more efficient than what we used to do, it eliminates
2305 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
2307 * Actually, we could avoid lots of memory writes here. The tp of the
2308 * listening socket contains all the necessary default parameters.
2310 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
2312 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
2314 if(newsk != NULL) {
2315 struct tcp_opt *newtp;
2316 #ifdef CONFIG_FILTER
2317 struct sk_filter *filter;
2318 #endif
2320 memcpy(newsk, sk, sizeof(*newsk));
2321 newsk->state = TCP_SYN_RECV;
2323 /* SANITY */
2324 newsk->pprev = NULL;
2325 newsk->prev = NULL;
2327 /* Clone the TCP header template */
2328 newsk->dport = req->rmt_port;
2330 sock_lock_init(newsk);
2332 atomic_set(&newsk->rmem_alloc, 0);
2333 skb_queue_head_init(&newsk->receive_queue);
2334 atomic_set(&newsk->wmem_alloc, 0);
2335 skb_queue_head_init(&newsk->write_queue);
2336 atomic_set(&newsk->omem_alloc, 0);
2338 newsk->done = 0;
2339 newsk->proc = 0;
2340 newsk->backlog.head = newsk->backlog.tail = NULL;
2341 skb_queue_head_init(&newsk->error_queue);
2342 newsk->write_space = tcp_write_space;
2343 #ifdef CONFIG_FILTER
2344 if ((filter = newsk->filter) != NULL)
2345 sk_filter_charge(newsk, filter);
2346 #endif
2348 /* Now setup tcp_opt */
2349 newtp = &(newsk->tp_pinfo.af_tcp);
2350 newtp->pred_flags = 0;
2351 newtp->rcv_nxt = req->rcv_isn + 1;
2352 newtp->snd_nxt = req->snt_isn + 1;
2353 newtp->snd_una = req->snt_isn + 1;
2354 newtp->srtt = 0;
2355 newtp->ato = 0;
2356 newtp->snd_wl1 = req->rcv_isn;
2357 newtp->snd_wl2 = req->snt_isn;
2359 /* RFC1323: The window in SYN & SYN/ACK segments
2360 * is never scaled.
2362 newtp->snd_wnd = ntohs(skb->h.th->window);
2364 newtp->max_window = newtp->snd_wnd;
2365 newtp->pending = 0;
2366 newtp->retransmits = 0;
2367 newtp->last_ack_sent = req->rcv_isn + 1;
2368 newtp->backoff = 0;
2369 newtp->mdev = TCP_TIMEOUT_INIT;
2371 /* So many TCP implementations out there (incorrectly) count the
2372 * initial SYN frame in their delayed-ACK and congestion control
2373 * algorithms that we must have the following bandaid to talk
2374 * efficiently to them. -DaveM
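 * (Hence the initial snd_cwnd of 2 below, rather than the usual 1.)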
2376 newtp->snd_cwnd = 2;
2378 newtp->rto = TCP_TIMEOUT_INIT;
2379 newtp->packets_out = 0;
2380 newtp->fackets_out = 0;
2381 newtp->retrans_out = 0;
2382 newtp->high_seq = 0;
2383 newtp->snd_ssthresh = 0x7fffffff;
2384 newtp->snd_cwnd_cnt = 0;
2385 newtp->dup_acks = 0;
2386 newtp->delayed_acks = 0;
2387 init_timer(&newtp->retransmit_timer);
2388 newtp->retransmit_timer.function = &tcp_retransmit_timer;
2389 newtp->retransmit_timer.data = (unsigned long) newsk;
2390 init_timer(&newtp->delack_timer);
2391 newtp->delack_timer.function = &tcp_delack_timer;
2392 newtp->delack_timer.data = (unsigned long) newsk;
2393 skb_queue_head_init(&newtp->out_of_order_queue);
2394 newtp->send_head = newtp->retrans_head = NULL;
2395 newtp->rcv_wup = req->rcv_isn + 1;
2396 newtp->write_seq = req->snt_isn + 1;
2397 newtp->copied_seq = req->rcv_isn + 1;
2399 newtp->saw_tstamp = 0;
2401 init_timer(&newtp->probe_timer);
2402 newtp->probe_timer.function = &tcp_probe_timer;
2403 newtp->probe_timer.data = (unsigned long) newsk;
2404 newtp->probes_out = 0;
2405 newtp->syn_seq = req->rcv_isn;
2406 newtp->fin_seq = req->rcv_isn;
2407 newtp->urg_data = 0;
2408 tcp_synq_init(newtp);
2409 newtp->syn_backlog = 0;
2410 if (skb->len >= 536)
2411 newtp->last_seg_size = skb->len;
2413 /* Back to base struct sock members. */
2414 newsk->err = 0;
2415 newsk->ack_backlog = 0;
2416 newsk->max_ack_backlog = SOMAXCONN;
2417 newsk->priority = 0;
2418 atomic_set(&newsk->refcnt, 1);
2419 atomic_inc(&inet_sock_nr);
2421 spin_lock_init(&newsk->timer_lock);
2422 init_timer(&newsk->timer);
2423 newsk->timer.function = &tcp_keepalive_timer;
2424 newsk->timer.data = (unsigned long) newsk;
2425 if (newsk->keepopen)
2426 tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
2427 newsk->socket = NULL;
2428 newsk->sleep = NULL;
2430 newtp->tstamp_ok = req->tstamp_ok;
2431 if((newtp->sack_ok = req->sack_ok) != 0)
2432 newtp->num_sacks = 0;
2433 newtp->window_clamp = req->window_clamp;
2434 newtp->rcv_wnd = req->rcv_wnd;
2435 newtp->wscale_ok = req->wscale_ok;
2436 if (newtp->wscale_ok) {
2437 newtp->snd_wscale = req->snd_wscale;
2438 newtp->rcv_wscale = req->rcv_wscale;
2439 } else {
2440 newtp->snd_wscale = newtp->rcv_wscale = 0;
2441 newtp->window_clamp = min(newtp->window_clamp,65535);
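/* Without window scaling the window field in the TCP header is only
 * 16 bits wide, hence the clamp to 65535 above.
 */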
2443 if (newtp->tstamp_ok) {
2444 newtp->ts_recent = req->ts_recent;
2445 newtp->ts_recent_stamp = xtime.tv_sec;
2446 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2447 } else {
2448 newtp->ts_recent_stamp = 0;
2449 newtp->tcp_header_len = sizeof(struct tcphdr);
2451 newtp->mss_clamp = req->mss;
2453 return newsk;
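/* RFC793-style acceptability test: a segment is acceptable if it starts
 * exactly at the left edge of the receive window, if any part of it
 * overlaps the open interval (s_win, e_win), or if it is a zero-length
 * segment sitting exactly at the right edge.
 */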
2456 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
2458 if (seq == s_win)
2459 return 1;
2460 if (after(end_seq, s_win) && before(seq, e_win))
2461 return 1;
2462 return (seq == e_win && seq == end_seq);
2467 * Process an incoming packet for SYN_RECV sockets represented
2468 * as an open_request.
2471 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
2472 struct open_request *req,
2473 struct open_request *prev)
2475 struct tcphdr *th = skb->h.th;
2476 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2477 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
2478 int paws_reject = 0;
2479 struct tcp_opt ttp;
2481 /* If the socket has already been created, process
2482 the packet in its context.
2484 We get here only due to a race, when packets were enqueued
2485 to the backlog of the listening socket.
2487 if (req->sk)
2488 return req->sk;
2490 ttp.saw_tstamp = 0;
2491 if (th->doff > (sizeof(struct tcphdr)>>2)) {
2493 tcp_parse_options(NULL, th, &ttp, 0);
2495 paws_reject = ttp.saw_tstamp &&
2496 (s32)(ttp.rcv_tsval - req->ts_recent) < 0;
2499 /* Check for a pure retransmitted SYN. */
2500 if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
2501 flg == TCP_FLAG_SYN &&
2502 !paws_reject) {
2504 * RFC793 draws (incorrectly! It was fixed in RFC1122)
2505 * this case in figure 6 and figure 8, but the formal
2506 * protocol description says NOTHING.
2507 * To be more exact, it says that we should send an ACK,
2508 * because this segment (at least, if it has no data)
2509 * is out of window.
2511 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
2512 * describe the SYN-RECV state. All the description
2513 * is wrong; we cannot believe it and should
2514 * rely only on common sense and implementation
2515 * experience.
2517 * Enforce the "SYN-ACK" according to figures 8 and 6
2518 * of RFC793, as fixed by RFC1122.
2520 req->class->rtx_syn_ack(sk, req);
2521 return NULL;
2524 /* The following further reproduces the "SEGMENT ARRIVES"
2525 section of RFC793 for the SYN-RECEIVED state.
2526 It is broken; however, it fails only
2527 when SYNs are crossed, which is impossible in our
2528 case.
2530 But generally, we should (the RFC lies!) accept an ACK
2531 of the SYNACK both here and in tcp_rcv_state_process().
2532 tcp_rcv_state_process() does not, hence we do not either.
2534 Note that the case is absolutely generic:
2535 we cannot optimize anything here without
2536 violating the protocol. All the checks must be made
2537 before we attempt to create a socket.
2540 /* RFC793: "first check sequence number". */
2542 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2543 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
2544 /* Out of window: send ACK and drop. */
2545 if (!(flg & TCP_FLAG_RST))
2546 req->class->send_ack(skb, req);
2547 return NULL;
2550 /* In sequence, PAWS is OK. */
2552 if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
2553 req->ts_recent = ttp.rcv_tsval;
2555 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
2556 /* Truncate SYN, it is out of window starting
2557 at req->rcv_isn+1. */
2558 flg &= ~TCP_FLAG_SYN;
2561 /* RFC793: "second check the RST bit" and
2562 * "fourth, check the SYN bit"
2564 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
2565 goto embryonic_reset;
2567 /* RFC793: "fifth check the ACK field" */
2569 if (!(flg & TCP_FLAG_ACK))
2570 return NULL;
2572 /* Invalid ACK: reset will be sent by listening socket */
2573 if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)
2574 return sk;
2576 /* OK, the ACK is valid: create the big socket and
2577 feed this segment to it. It will repeat all
2578 the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
2579 ESTABLISHED STATE. If it is dropped after the
2580 socket is created, expect trouble.
2582 sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
2583 if (sk == NULL)
2584 return NULL;
2586 tcp_dec_slow_timer(TCP_SLT_SYNACK);
2587 req->sk = sk;
2588 return sk;
2590 embryonic_reset:
2591 tcp_synq_unlink(tp, req, prev);
2592 tp->syn_backlog--;
2593 tcp_dec_slow_timer(TCP_SLT_SYNACK);
2595 net_statistics.EmbryonicRsts++;
2596 if (!(flg & TCP_FLAG_RST))
2597 req->class->send_reset(skb);
2599 req->class->destructor(req);
2600 tcp_openreq_free(req);
2601 return NULL;
2604 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
2605 struct tcphdr *th, unsigned len)
2607 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2609 tcp_parse_options(sk, th, tp, 0);
2611 #ifdef CONFIG_TCP_TW_RECYCLE
2612 if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst &&
2613 (s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
2614 xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) {
2615 /* Old duplicate segment. We remember the last
2616 ts_recent from this host in the timewait bucket.
2618 Actually, we could implement a per-host cache
2619 to truncate the timewait state after the RTO. The paranoid arguments
2620 of RFC1337 are not enough to close off this nice possibility.
2622 if (net_ratelimit())
2623 printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n");
2624 if (th->ack)
2625 return 1;
2626 goto discard;
2628 #endif
2630 if (th->ack) {
2631 /* rfc793:
2632 * "If the state is SYN-SENT then
2633 * first check the ACK bit
2634 * If the ACK bit is set
2635 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
2636 * a reset (unless the RST bit is set, if so drop
2637 * the segment and return)"
2639 * I cite this place to emphasize one essential
2640 * detail: this check is different from the one
2641 * in the established state: SND.UNA <= SEG.ACK <= SND.NXT.
2642 * SEG.ACK == SND.UNA == ISS is invalid in SYN-SENT,
2643 * because we have sent no data before the SYN.
2644 * --ANK(990513)
2646 * We do not send data with the SYN, so the RFC-correct
2647 * test reduces to:
2649 if (sk->zapped ||
2650 TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
2651 return 1;
2653 /* Now ACK is acceptable.
2655 * "If the RST bit is set
2656 * If the ACK was acceptable then signal the user "error:
2657 * connection reset", drop the segment, enter CLOSED state,
2658 * delete TCB, and return."
2661 if (th->rst) {
2662 tcp_reset(sk);
2663 goto discard;
2666 /* rfc793:
2667 * "fifth, if neither of the SYN or RST bits is set then
2668 * drop the segment and return."
2670 * See note below!
2671 * --ANK(990513)
2673 if (!th->syn)
2674 goto discard;
2676 /* rfc793:
2677 * "If the SYN bit is on ...
2678 * are acceptable then ...
2679 * (our SYN has been ACKed), change the connection
2680 * state to ESTABLISHED..."
2682 * Do you see? SYN-less ACKs in SYN-SENT state are
2683 * completely ignored.
2685 * The bug causing stalled SYN-SENT sockets
2686 * was here: tcp_ack advanced snd_una and canceled the
2687 * retransmit timer, so that a bare ACK received
2688 * in SYN-SENT state (even with an invalid ack==ISS,
2689 * because the tcp_ack check is too weak for SYN-SENT)
2690 * moved the socket to an invalid semi-SYN-SENT,
2691 * semi-ESTABLISHED state and the connection hung.
2693 * There exist buggy stacks which really send
2694 * such ACKs: e.g. 202.226.91.94 (okigate.oki.co.jp)
2695 * Actually, if this host had not tried to get something
2696 * from ftp.inr.ac.ru I'd never have found this bug 8)
2698 * --ANK (990514)
2700 * I was wrong, I apologize. A bare ACK is valid.
2701 * Actually, RFC793 requires sending such an ACK
2702 * in reply to any out-of-window packet.
2703 * It is wrong, but Linux also does it sometimes.
2704 * --ANK (990724)
2707 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2708 tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
2709 TCP_SKB_CB(skb)->ack_seq, len);
2711 /* Ok.. it's good. Set up sequence numbers and
2712 * move to established.
2714 tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2715 tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2717 /* RFC1323: The window in SYN & SYN/ACK segments is
2718 * never scaled.
2720 tp->snd_wnd = ntohs(th->window);
2721 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2722 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2723 tp->fin_seq = TCP_SKB_CB(skb)->seq;
2725 tcp_set_state(sk, TCP_ESTABLISHED);
2727 if (tp->wscale_ok == 0) {
2728 tp->snd_wscale = tp->rcv_wscale = 0;
2729 tp->window_clamp = min(tp->window_clamp,65535);
2732 if (tp->tstamp_ok) {
2733 tp->tcp_header_len =
2734 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2735 } else
2736 tp->tcp_header_len = sizeof(struct tcphdr);
2737 if (tp->saw_tstamp) {
2738 tp->ts_recent = tp->rcv_tsval;
2739 tp->ts_recent_stamp = xtime.tv_sec;
2741 tcp_sync_mss(sk, tp->pmtu_cookie);
2742 tcp_initialize_rcv_mss(sk);
2743 tcp_init_metrics(sk);
2745 if (tp->write_pending) {
2746 /* Save one ACK. Data will be ready after
2747 * several ticks, if write_pending is set.
2749 * How do we do this correctly?
2751 tp->delayed_acks++;
2752 if (tp->ato == 0)
2753 tp->ato = tp->rto;
2754 tcp_send_delayed_ack(sk, tp->rto);
2755 } else {
2756 tcp_send_ack(sk);
2759 tp->copied_seq = tp->rcv_nxt;
2761 if(!sk->dead) {
2762 wake_up_interruptible(sk->sleep);
2763 sock_wake_async(sk->socket, 0);
2765 return -1;
2768 /* No ACK in the segment */
2770 if (th->rst) {
2771 /* rfc793:
2772 * "If the RST bit is set
2774 * Otherwise (no ACK) drop the segment and return."
2777 goto discard;
2780 if (th->syn) {
2781 /* We see a SYN without an ACK. It is an attempt at a
2782 * simultaneous connect with crossed SYNs.
2784 * The previous version of the code
2785 * checked for "connecting to self"
2786 * here. That check is now done in
2787 * tcp_connect.
2789 * RED-PEN: BTW, it does not. 8)
2791 tcp_set_state(sk, TCP_SYN_RECV);
2792 if (tp->saw_tstamp) {
2793 tp->ts_recent = tp->rcv_tsval;
2794 tp->ts_recent_stamp = xtime.tv_sec;
2797 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2798 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2800 /* RFC1323: The window in SYN & SYN/ACK segments is
2801 * never scaled.
2803 tp->snd_wnd = ntohs(th->window);
2804 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2806 tcp_sync_mss(sk, tp->pmtu_cookie);
2807 tcp_initialize_rcv_mss(sk);
2809 tcp_send_synack(sk);
2810 #if 0
2811 /* Note, we could accept data and URG from this segment.
2812 * There are no obstacles to doing so.
2814 * However, if we ignore data in ACKless segments sometimes,
2815 * we have no reason to accept it at other times.
2816 * Also, it seems the code doing it in step6 of tcp_rcv_state_process
2817 * is not flawless. So, discard the packet for sanity.
2818 * Uncomment this return to process the data.
2820 return -1;
2821 #endif
2823 /* "fifth, if neither of the SYN or RST bits is set then
2824 * drop the segment and return."
2827 discard:
2828 kfree_skb(skb);
2829 return 0;
2834 * This function implements the receiving procedure of RFC 793 for
2835 * all states except ESTABLISHED and TIME_WAIT.
2836 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
2837 * address independent.
2840 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
2841 struct tcphdr *th, unsigned len)
2843 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2844 int queued = 0;
2846 tp->saw_tstamp = 0;
2848 switch (sk->state) {
2849 case TCP_CLOSE:
2850 /* When state == CLOSED, hash lookup always fails.
2852 * But, there is a back door, the backlog queue.
2853 * If we have a sequence of packets in the backlog
2854 * during __release_sock() which have a sequence such
2855 * that:
2856 * packet X causes entry to TCP_CLOSE state
2857 * ...
2858 * packet X + N has FIN bit set
2860 * We report a (luckily) harmless error in this case.
2861 * The issue is that backlog queue processing bypasses
2862 * any hash lookups (we know which socket packets are for).
2863 * The correct behavior here is what 2.0.x did, since
2864 * a TCP_CLOSE socket does not exist. Drop the frame
2865 * and send a RST back to the other end.
2868 /* 1. The socket may be moved to TIME-WAIT state.
2869 2. While this socket was locked, another socket
2870 with the same identity could be created.
2871 3. To continue?
2873 CONCLUSION: discard and only discard!
2875 The alternative would be to redo the lookup and recurse into tcp_v?_rcv
2876 (not *_do_rcv) to handle the timewait and listen states
2877 correctly.
2879 goto discard;
2881 case TCP_LISTEN:
2882 if(th->ack)
2883 return 1;
2885 if(th->syn) {
2886 if(tp->af_specific->conn_request(sk, skb) < 0)
2887 return 1;
2889 /* Now we have several options: In theory there is
2890 * nothing else in the frame. KA9Q has an option to
2891 * send data with the syn, BSD accepts data with the
2892 * syn up to the [to be] advertised window and
2893 * Solaris 2.1 gives you a protocol error. For now
2894 * we just ignore it, that fits the spec precisely
2895 * and avoids incompatibilities. It would be nice in the
2896 * future to drop through and process the data.
2898 * Now that TTCP is starting to be used we ought to
2899 * queue this data.
2900 * But, this leaves one open to an easy denial of
2901 * service attack, and SYN cookies can't defend
2902 * against this problem. So, we drop the data
2903 * in the interest of security over speed.
2905 goto discard;
2907 goto discard;
2909 case TCP_SYN_SENT:
2910 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
2911 if (queued >= 0)
2912 return queued;
2913 queued = 0;
2914 goto step6;
2917 /* Parse the tcp_options present on this header.
2918 * By this point we really only expect timestamps.
2919 * Note that this really has to be here and not later for PAWS
2920 * (RFC1323) to work.
2922 if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
2923 tcp_paws_discard(tp, skb)) {
2924 if (!th->rst) {
2925 tcp_send_ack(sk);
2926 goto discard;
2928 /* Reset is accepted even if it did not pass PAWS. */
2931 /* The silly FIN test here is necessary to see an advancing ACK in
2932 * retransmitted FIN frames properly. Consider the following sequence:
2934 * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
2935 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
2936 * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
2937 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
2939 * At this point the connection will deadlock, with host1 believing
2940 * that its FIN is never ACK'd, and thus it will retransmit its FIN
2941 * forever. The following fix is from Taral (taral@taral.net).
2943 * RED-PEN. It seems the above is not true.
2944 * If at least one end is RFC compliant, it will send an ACK for the
2945 * out-of-window FIN and, hence, move the peer to TIME-WAIT.
2946 * I comment out this line. --ANK
2948 * RED-PEN. DANGER! The tcp_sequence check also rejects SYN-ACKs
2949 * received in SYN-RECV. The problem is that the description of
2950 * segment processing in the SYN-RECV state in RFC793 is WRONG.
2951 * The correct check would accept the ACK of this SYN-ACK, see
2952 * figures 6 and 8 (fixed by RFC1122). Compare this
2953 * to the problem with FIN; they smell similar. --ANK
2956 /* step 1: check sequence number */
2957 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)
2958 #if 0
2959 && !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
2960 #endif
2962 if (!th->rst) {
2963 tcp_send_ack(sk);
2965 goto discard;
2968 /* step 2: check RST bit */
2969 if(th->rst) {
2970 tcp_reset(sk);
2971 goto discard;
2974 if (tp->saw_tstamp) {
2975 tcp_replace_ts_recent(sk, tp,
2976 TCP_SKB_CB(skb)->seq);
2979 /* step 3: check security and precedence [ignored] */
2981 /* step 4:
2983 * Check for a SYN, and ensure it matches the SYN we were
2984 * first sent. We have to handle the rather unusual (but valid)
2985 * sequence that KA9Q derived products may generate of
2987 * SYN
2988 * SYN|ACK Data
2989 * ACK (lost)
2990 * SYN|ACK Data + More Data
2991 * .. we must ACK not RST...
2993 * We keep syn_seq as the sequence space occupied by the
2994 * original syn.
2997 if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2998 tcp_reset(sk);
2999 return 1;
3002 /* step 5: check the ACK field */
3003 if (th->ack) {
3004 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
3005 TCP_SKB_CB(skb)->ack_seq, len);
3007 switch(sk->state) {
3008 case TCP_SYN_RECV:
3009 if (acceptable) {
3010 tcp_set_state(sk, TCP_ESTABLISHED);
3011 tp->copied_seq = tp->rcv_nxt;
3013 /* Note that this wakeup is only for the marginal
3014 crossed SYN case. Passively opened sockets
3015 are not woken up, because sk->sleep == NULL
3016 and sk->socket == NULL.
3018 if (!sk->dead && sk->sleep) {
3019 wake_up_interruptible(sk->sleep);
3020 sock_wake_async(sk->socket, 1);
3023 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
3024 tp->snd_wnd = ntohs(th->window) << tp->snd_wscale;
3025 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
3026 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
3028 /* tcp_ack considers this ACK a duplicate
3029 * and does not calculate the rtt. That is wrong.
3030 * Fix it, at least when we have timestamps.
3032 if (tp->saw_tstamp && !tp->srtt)
3033 tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED);
3035 tcp_init_metrics(sk);
3036 } else {
3037 SOCK_DEBUG(sk, "bad ack\n");
3038 return 1;
3040 break;
3042 case TCP_FIN_WAIT1:
3043 if (tp->snd_una == tp->write_seq) {
3044 sk->shutdown |= SEND_SHUTDOWN;
3045 tcp_set_state(sk, TCP_FIN_WAIT2);
3046 if (!sk->dead)
3047 sk->state_change(sk);
3048 else
3049 tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
3050 dst_confirm(sk->dst_cache);
3052 break;
3054 case TCP_CLOSING:
3055 if (tp->snd_una == tp->write_seq) {
3056 tcp_time_wait(sk);
3057 goto discard;
3059 break;
3061 case TCP_LAST_ACK:
3062 if (tp->snd_una == tp->write_seq) {
3063 tcp_set_state(sk,TCP_CLOSE);
3064 tcp_update_metrics(sk);
3065 tcp_done(sk);
3066 goto discard;
3068 break;
3070 } else
3071 goto discard;
3073 step6:
3074 /* step 6: check the URG bit */
3075 tcp_urg(sk, th, len);
3077 /* step 7: process the segment text */
3078 switch (sk->state) {
3079 case TCP_CLOSE_WAIT:
3080 case TCP_CLOSING:
3081 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
3082 break;
3084 case TCP_FIN_WAIT1:
3085 case TCP_FIN_WAIT2:
3086 /* RFC 793 says to queue data in these states,
3087 * RFC 1122 says we MUST send a reset.
3088 * 4.4BSD also sends a reset.
3090 if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
3091 if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
3092 tcp_reset(sk);
3093 return 1;
3097 case TCP_ESTABLISHED:
3098 queued = tcp_data(skb, sk, len);
3100 /* This must be after tcp_data() does the skb_pull() to
3101 * remove the header size from skb->len.
3103 tcp_measure_rcv_mss(sk, skb);
3104 break;
3107 /* tcp_data could move socket to TIME-WAIT */
3108 if (sk->state != TCP_CLOSE) {
3109 tcp_data_snd_check(sk);
3110 tcp_ack_snd_check(sk);
3113 if (!queued) {
3114 discard:
3115 kfree_skb(skb);
3117 return 0;