2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.173 1999/09/07 02:31:27 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
25 * Pedro Roque : Fast Retransmit/Recovery.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
60 #include <linux/config.h>
62 #include <linux/sysctl.h>
64 #include <net/inet_common.h>
65 #include <linux/ipsec.h>
/* Tunable sysctl knobs and forward declarations for this file.
 * NOTE(review): this chunk is extraction-garbled (original line numbers are
 * fused into the text and statements are split across lines); the code
 * itself is left byte-identical.
 */
68 #define SYNC_INIT 0 /* let the user enable it */
/* Timeout knobs are defined elsewhere; only declared here. */
73 extern int sysctl_tcp_fin_timeout
;
74 extern int sysctl_tcp_keepalive_time
;
76 /* These are on by default so the code paths get tested.
77 * For the final 2.2 this may be undone at our discretion. -DaveM
79 int sysctl_tcp_timestamps
= 1;
80 int sysctl_tcp_window_scaling
= 1;
81 int sysctl_tcp_sack
= 1;
/* SYN cookies default to SYNC_INIT (0): off until the admin enables them. */
83 int sysctl_tcp_syncookies
= SYNC_INIT
;
/* The remaining knobs default to 0 (disabled). */
84 int sysctl_tcp_stdurg
;
85 int sysctl_tcp_rfc1337
;
86 int sysctl_tcp_tw_recycle
;
/* Forward declaration: receive-queue pruning helper defined later in file. */
88 static int prune_queue(struct sock
*sk
);
90 /* There is something which you must keep in mind when you analyze the
91 * behavior of the tp->ato delayed ack timeout interval. When a
92 * connection starts up, we want to ack as quickly as possible. The
93 * problem is that "good" TCP's do slow start at the beginning of data
94 * transmission. This means that until we send the first few ACK's the
95 * sender will sit on his end and only queue most of his data, because
96 * he can only send snd_cwnd unacked packets at any given time. For
97 * each ACK we send, he increments snd_cwnd and transmits more of his
/* Maintain the delayed-ACK timeout estimate (tp->ato) from inter-arrival
 * times of received segments.
 * NOTE(review): source appears truncated here -- the branch structure
 * (first-packet vs. subsequent-packet paths) is missing lines; code left
 * byte-identical.
 */
100 static void tcp_delack_estimator(struct tcp_opt
*tp
)
/* Record the receive time of this segment. */
103 tp
->lrcvtime
= tcp_time_stamp
;
105 /* Help sender leave slow start quickly,
106 * and also makes sure we do not take this
107 * branch ever again for this connection.
110 tcp_enter_quickack_mode(tp
);
/* m = time since the previous received segment. */
112 int m
= tcp_time_stamp
- tp
->lrcvtime
;
114 tp
->lrcvtime
= tcp_time_stamp
;
120 /* This funny shift makes sure we
121 * clear the "quick ack mode" bit.
123 tp
->ato
= ((tp
->ato
<< 1) >> 2) + m
;
129 * Remember to send an ACK later.
/* Schedule a delayed ACK for a received segment.
 * NOTE(review): the parameter list is cut off here (presumably a trailing
 * "struct sk_buff *skb" -- the body reads skb->len) and body lines are
 * missing; code left byte-identical.
 */
131 static __inline__
void tcp_remember_ack(struct tcp_opt
*tp
, struct tcphdr
*th
,
136 /* Tiny-grams with PSH set artificially deflate our
137 * ato measurement, but with a lower bound.
139 if(th
->psh
&& (skb
->len
< (tp
->rcv_mss
>> 1))) {
140 /* Preserve the quickack state. */
141 if((tp
->ato
& 0x7fffffff) > HZ
/50)
/* Keep the quick-ack flag (top bit) while flooring ato. */
142 tp
->ato
= ((tp
->ato
& 0x80000000) |
147 /* Called to compute a smoothed rtt estimate. The data fed to this
148 * routine either comes from timestamps, or from segments that were
149 * known _not_ to have been retransmitted [see Karn/Partridge
150 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
151 * piece by Van Jacobson.
152 * NOTE: the next three routines used to be one big routine.
153 * To save cycles in the RFC 1323 implementation it was better to break
154 * it up into three procedures. -- erics
/* NOTE(review): source appears truncated -- the srtt!=0 / first-measurement
 * branch lines are missing between the EWMA update and the "no previous
 * measure" initialization below; code left byte-identical.
 */
157 static __inline__
void tcp_rtt_estimator(struct tcp_opt
*tp
, __u32 mrtt
)
159 long m
= mrtt
; /* RTT */
161 /* The following amusing code comes from Jacobson's
162 * article in SIGCOMM '88. Note that rtt and mdev
163 * are scaled versions of rtt and mean deviation.
164 * This is designed to be as fast as possible
165 * m stands for "measurement".
167 * On a 1990 paper the rto value is changed to:
168 * RTO = rtt + 4 * mdev
173 m
-= (tp
->srtt
>> 3); /* m is now error in rtt est */
174 tp
->srtt
+= m
; /* rtt = 7/8 rtt + 1/8 new */
176 m
= -m
; /* m is now abs(error) */
177 m
-= (tp
->mdev
>> 2); /* similar update on mdev */
178 tp
->mdev
+= m
; /* mdev = 3/4 mdev + 1/4 new */
180 /* no previous measure. */
181 tp
->srtt
= m
<<3; /* take the measured time to be rtt */
182 tp
->mdev
= m
<<2; /* make sure rto = 3*rtt */
186 /* Calculate rto without backoff. This is the second half of Van Jacobson's
187 * routine referred to above.
190 static __inline__
void tcp_set_rto(struct tcp_opt
*tp
)
192 tp
->rto
= (tp
->srtt
>> 3) + tp
->mdev
;
193 /* I am not enough educated to understand this magic.
194 * However, it smells bad. snd_cwnd>31 is common case.
196 tp
->rto
+= (tp
->rto
>> 2) + (tp
->rto
>> (tp
->snd_cwnd
-1));
200 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
201 * on packet lifetime in the internet. We need the HZ/5 lower
202 * bound to behave correctly against BSD stacks with a fixed
204 * FIXME: It's not entirely clear this lower bound is the best
205 * way to avoid the problem. Is it possible to drop the lower
206 * bound and still avoid trouble with BSD stacks? Perhaps
207 * some modification to the RTO calculation that takes delayed
208 * ack bias into account? This needs serious thought. -- erics
/* Clamp tp->rto into [HZ/5, 120*HZ].
 * NOTE(review): only the upper-bound test survives in this chunk; the
 * clamping assignments and lower-bound branch are missing lines.
 */
210 static __inline__
void tcp_bound_rto(struct tcp_opt
*tp
)
212 if (tp
->rto
> 120*HZ
)
218 /* Save metrics learned by this TCP session.
219 This function is called only, when TCP finishes successfully
220 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
/* Copies per-connection estimates (rtt, rttvar, ssthresh, cwnd) back into
 * the destination cache entry, unless the corresponding RTAX_* metric is
 * locked in dst->mxlock.
 * NOTE(review): heavily truncated -- dst NULL check, EWMA assignments and
 * several condition heads are missing lines; code left byte-identical.
 */
222 static void tcp_update_metrics(struct sock
*sk
)
224 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
225 struct dst_entry
*dst
= __sk_dst_get(sk
);
/* No usable RTT estimate for this session (backoff in effect or srtt==0). */
230 if (tp
->backoff
|| !tp
->srtt
) {
231 /* This session failed to estimate rtt. Why?
232 * Probably, no packets returned in time.
235 if (!(dst
->mxlock
&(1<<RTAX_RTT
)))
/* m = difference between cached and freshly measured rtt. */
242 m
= dst
->rtt
- tp
->srtt
;
244 /* If newly calculated rtt larger than stored one,
245 * store new one. Otherwise, use EWMA. Remember,
246 * rtt overestimation is always better than underestimation.
248 if (!(dst
->mxlock
&(1<<RTAX_RTT
))) {
255 if (!(dst
->mxlock
&(1<<RTAX_RTTVAR
))) {
259 /* Scale deviation to rttvar fixed point */
264 if (m
>= dst
->rttvar
)
267 dst
->rttvar
-= (dst
->rttvar
- m
)>>2;
/* 0x7FFFFFFF ssthresh means slow start never ended on this connection. */
270 if (tp
->snd_ssthresh
== 0x7FFFFFFF) {
271 /* Slow start still did not finish. */
273 !(dst
->mxlock
&(1<<RTAX_SSTHRESH
)) &&
274 tp
->snd_cwnd
> dst
->ssthresh
)
275 dst
->ssthresh
= tp
->snd_cwnd
;
276 if (!(dst
->mxlock
&(1<<RTAX_CWND
)) &&
277 tp
->snd_cwnd
> dst
->cwnd
)
278 dst
->cwnd
= tp
->snd_cwnd
;
279 } else if (tp
->snd_cwnd
>= tp
->snd_ssthresh
&& !tp
->high_seq
) {
280 /* Cong. avoidance phase, cwnd is reliable. */
281 if (!(dst
->mxlock
&(1<<RTAX_SSTHRESH
)))
282 dst
->ssthresh
= tp
->snd_cwnd
;
283 if (!(dst
->mxlock
&(1<<RTAX_CWND
)))
/* Average the cached cwnd with this session's final cwnd. */
284 dst
->cwnd
= (dst
->cwnd
+ tp
->snd_cwnd
)>>1;
286 /* Else slow start did not finish, cwnd is non-sense,
287 ssthresh may be also invalid.
289 if (!(dst
->mxlock
&(1<<RTAX_CWND
)))
290 dst
->cwnd
= (dst
->cwnd
+ tp
->snd_ssthresh
)>>1;
292 !(dst
->mxlock
&(1<<RTAX_SSTHRESH
)) &&
293 tp
->snd_ssthresh
> dst
->ssthresh
)
294 dst
->ssthresh
= tp
->snd_ssthresh
;
299 /* Initialize metrics on socket. */
/* Seed this connection's rtt/mdev/ssthresh/cwnd-clamp from the cached
 * destination metrics, then derive the initial rto.
 * NOTE(review): truncated -- dst NULL handling, the rtt assignment under
 * the first comparison, and the rto derivation calls are missing lines;
 * code left byte-identical.
 */
301 static void tcp_init_metrics(struct sock
*sk
)
303 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
304 struct dst_entry
*dst
= __sk_dst_get(sk
);
/* Only override the session estimate when it is absent or timestamp-free. */
314 if (!tp
->srtt
|| !tp
->saw_tstamp
)
317 /* Initial rtt is determined from SYN,SYN-ACK.
318 * The segment is small and rtt may appear much
319 * less than real one. Use per-dst memory
320 * to make it more realistic.
322 * A bit of theory. RTT is time passed after "normal" sized packet
323 * is sent until it is ACKed. In normal circumstances sending small
324 * packets force peer to delay ACKs and calculation is correct too.
325 * The algorithm is adaptive and, provided we follow specs, it
326 * NEVER underestimate RTT. BUT! If peer tries to make some clever
327 * tricks sort of "quick acks" for time long enough to decrease RTT
328 * to low value, and then abruptly stops to do it and starts to delay
329 * ACKs, wait for troubles.
331 if (dst
->rtt
> tp
->srtt
)
333 if (dst
->rttvar
> tp
->mdev
)
334 tp
->mdev
= dst
->rttvar
;
/* A locked CWND metric caps this connection's congestion window. */
338 if (dst
->mxlock
&(1<<RTAX_CWND
))
339 tp
->snd_cwnd_clamp
= dst
->cwnd
;
341 tp
->snd_ssthresh
= dst
->ssthresh
;
342 if (tp
->snd_ssthresh
> tp
->snd_cwnd_clamp
)
343 tp
->snd_ssthresh
= tp
->snd_cwnd_clamp
;
349 /* Play conservative. If timestamps are not
350 * supported, TCP will fail to recalculate correct
351 * rtt, if initial rto is too small. FORGET ALL AND RESET!
353 if (!tp
->saw_tstamp
&& tp
->srtt
) {
355 tp
->mdev
= TCP_TIMEOUT_INIT
;
356 tp
->rto
= TCP_TIMEOUT_INIT
;
/* PAWS validity horizon: 24 days in seconds (RFC 1323 timestamp aging). */
360 #define PAWS_24DAYS (60 * 60 * 24 * 24)
363 /* WARNING: this must not be called if tp->saw_tstamp was false. */
/* Update ts_recent/ts_recent_stamp from a segment's timestamp option when
 * the segment does not advance past the last ACK we sent.
 * NOTE(review): truncated -- surrounding braces and any trailing statements
 * are missing lines; code left byte-identical.
 */
364 extern __inline__
void
365 tcp_replace_ts_recent(struct sock
*sk
, struct tcp_opt
*tp
, u32 seq
)
367 if (!after(seq
, tp
->last_ack_sent
)) {
368 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
369 * extra check below makes sure this can only happen
370 * for pure ACK frames. -DaveM
372 * Not only, also it occurs for expired timestamps
373 * and RSTs with bad timestamp option. --ANK
/* Accept the new timestamp if it is not older than the remembered one,
 * or if the remembered one has aged past the 24-day PAWS horizon. */
376 if((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) >= 0 ||
377 xtime
.tv_sec
>= tp
->ts_recent_stamp
+ PAWS_24DAYS
) {
378 tp
->ts_recent
= tp
->rcv_tsval
;
379 tp
->ts_recent_stamp
= xtime
.tv_sec
;
/* PAWS check: returns nonzero when the segment's timestamp is older than
 * ts_recent (within the 24-day horizon) and the segment is not a plausible
 * duplicate ACK -- i.e. the segment should be discarded.
 */
384 extern __inline__
int tcp_paws_discard(struct tcp_opt
*tp
, struct sk_buff
*skb
)
386 return ((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) < 0 &&
387 xtime
.tv_sec
< tp
->ts_recent_stamp
+ PAWS_24DAYS
389 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
391 I cannot see quietly as all the idea behind PAWS
394 The problem is only in reordering duplicate ACKs.
395 Hence, we can check this rare case more carefully.
397 1. Check that it is really duplicate ACK (ack==snd_una)
398 2. Give it some small "replay" window (~RTO)
400 We do not know units of foreign ts values, but make conservative
401 assumption that they are >=1ms. It solves problem
402 noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
404 && (TCP_SKB_CB(skb
)->seq
!= TCP_SKB_CB(skb
)->end_seq
||
405 TCP_SKB_CB(skb
)->ack_seq
!= tp
->snd_una
||
407 (s32
)(tp
->ts_recent
- tp
->rcv_tsval
) > (tp
->rto
*1024)/HZ
));
/* Slow-path receive-window check: a segment is acceptable when it overlaps
 * [rcv_nxt, rcv_wup + rcv_wnd), with special handling of the window edge.
 * NOTE(review): truncated -- the condition head for the after()/before()
 * test and the intermediate return statements are missing lines; code left
 * byte-identical.
 */
411 static int __tcp_sequence(struct tcp_opt
*tp
, u32 seq
, u32 end_seq
)
/* Right edge of the advertised receive window. */
413 u32 end_window
= tp
->rcv_wup
+ tp
->rcv_wnd
;
416 after(end_seq
, tp
->rcv_nxt
) &&
417 before(seq
, end_window
))
419 if (seq
!= end_window
)
/* At the window edge only a zero-length segment is acceptable. */
421 return (seq
== end_seq
);
424 /* This functions checks to see if the tcp header is actually acceptable. */
425 extern __inline__
int tcp_sequence(struct tcp_opt
*tp
, u32 seq
, u32 end_seq
)
427 if (seq
== tp
->rcv_nxt
)
428 return (tp
->rcv_wnd
|| (end_seq
== seq
));
430 return __tcp_sequence(tp
, seq
, end_seq
);
433 /* When we get a reset we do this. */
/* Handle an incoming RST: pick the BSD-compatible errno for the socket,
 * move it to CLOSE and stop all transmit timers.
 * NOTE(review): truncated -- the state dispatch selecting between
 * ECONNREFUSED and ECONNRESET (and any other error cases) is missing
 * lines; code left byte-identical.
 */
434 static void tcp_reset(struct sock
*sk
)
438 /* We want the right error as BSD sees it (and indeed as we do). */
441 sk
->err
= ECONNREFUSED
;
449 sk
->err
= ECONNRESET
;
451 tcp_set_state(sk
, TCP_CLOSE
);
452 tcp_clear_xmit_timers(sk
);
456 /* This tags the retransmission queue when SACKs arrive. */
/* Walks the write queue for each SACK block and marks fully-covered
 * frames TCPCB_SACKED_ACKED, updating fackets_out/retrans_out accounting.
 * NOTE(review): truncated -- the outer per-SACK loop head, fack_count
 * bookkeeping and retrans_out decrement are missing lines; code left
 * byte-identical.
 */
457 static void tcp_sacktag_write_queue(struct sock
*sk
, struct tcp_sack_block
*sp
, int nsacks
)
459 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
463 struct sk_buff
*skb
= skb_peek(&sk
->write_queue
);
/* SACK block boundaries arrive in network byte order. */
464 __u32 start_seq
= ntohl(sp
->start_seq
);
465 __u32 end_seq
= ntohl(sp
->end_seq
);
/* Walk only the retransmit portion of the queue (stop at send_head). */
468 while((skb
!= NULL
) &&
469 (skb
!= tp
->send_head
) &&
470 (skb
!= (struct sk_buff
*)&sk
->write_queue
)) {
471 /* The retransmission queue is always in order, so
472 * we can short-circuit the walk early.
474 if(after(TCP_SKB_CB(skb
)->seq
, end_seq
))
477 /* We play conservative, we don't allow SACKS to partially
478 * tag a sequence space.
481 if(!after(start_seq
, TCP_SKB_CB(skb
)->seq
) &&
482 !before(end_seq
, TCP_SKB_CB(skb
)->end_seq
)) {
483 /* If this was a retransmitted frame, account for it. */
484 if((TCP_SKB_CB(skb
)->sacked
& TCPCB_SACKED_RETRANS
) &&
487 TCP_SKB_CB(skb
)->sacked
|= TCPCB_SACKED_ACKED
;
489 /* RULE: All new SACKs will either decrease retrans_out
490 * or advance fackets_out.
492 if(fack_count
> tp
->fackets_out
)
493 tp
->fackets_out
= fack_count
;
497 sp
++; /* Move on to the next SACK block. */
501 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
502 * But, this can also be called on packets in the established flow when
503 * the fast version below fails.
/* Full TCP option parser: walks the option space after the fixed header
 * and records MSS, window scale, timestamps, SACK-permitted and SACK
 * blocks into tp.  no_fancy disables the RFC1323/SACK extensions.
 * NOTE(review): truncated -- the switch head, per-option length stepping
 * and several assignments are missing lines; code left byte-identical.
 */
505 void tcp_parse_options(struct sock
*sk
, struct tcphdr
*th
, struct tcp_opt
*tp
, int no_fancy
)
/* Bytes of options = header length minus the fixed 20-byte header. */
508 int length
=(th
->doff
*4)-sizeof(struct tcphdr
);
510 ptr
= (unsigned char *)(th
+ 1);
520 case TCPOPT_NOP
: /* Ref: RFC 793 section 3.1 */
525 if (opsize
< 2) /* "silly options" */
528 break; /* don't parse partial options */
/* MSS is only honored on SYN segments. */
531 if(opsize
==TCPOLEN_MSS
&& th
->syn
) {
532 u16 in_mss
= ntohs(*(__u16
*)ptr
);
/* Never accept an MSS larger than what the user clamped. */
534 if (tp
->user_mss
&& tp
->user_mss
< in_mss
)
535 in_mss
= tp
->user_mss
;
536 tp
->mss_clamp
= in_mss
;
541 if(opsize
==TCPOLEN_WINDOW
&& th
->syn
)
542 if (!no_fancy
&& sysctl_tcp_window_scaling
) {
544 tp
->snd_wscale
= *(__u8
*)ptr
;
/* RFC 1323 caps the shift count at 14. */
545 if(tp
->snd_wscale
> 14) {
547 printk("tcp_parse_options: Illegal window "
548 "scaling value %d >14 received.",
554 case TCPOPT_TIMESTAMP
:
555 if(opsize
==TCPOLEN_TIMESTAMP
) {
556 if (sysctl_tcp_timestamps
&& !no_fancy
) {
559 tp
->rcv_tsval
= ntohl(*(__u32
*)ptr
);
560 tp
->rcv_tsecr
= ntohl(*(__u32
*)(ptr
+4));
564 case TCPOPT_SACK_PERM
:
565 if(opsize
==TCPOLEN_SACK_PERM
&& th
->syn
) {
566 if (sysctl_tcp_sack
&& !no_fancy
) {
/* Actual SACK blocks: only on non-SYN segments with a valid socket. */
574 if((opsize
>= (TCPOLEN_SACK_BASE
+ TCPOLEN_SACK_PERBLOCK
)) &&
575 sysctl_tcp_sack
&& (sk
!= NULL
) && !th
->syn
) {
576 int sack_bytes
= opsize
- TCPOLEN_SACK_BASE
;
/* Payload must be a whole number of 8-byte SACK blocks. */
578 if(!(sack_bytes
% TCPOLEN_SACK_PERBLOCK
)) {
579 int num_sacks
= sack_bytes
>> 3;
580 struct tcp_sack_block
*sackp
;
582 sackp
= (struct tcp_sack_block
*)ptr
;
583 tcp_sacktag_write_queue(sk
, sackp
, num_sacks
);
593 /* Fast parse options. This hopes to only see timestamps.
594 * If it is wrong it falls back on tcp_parse_options().
/* Fast-path option parse: recognizes the aligned timestamp option
 * (NOP NOP TIMESTAMP len) in a single word compare; anything else falls
 * back to the full parser.
 * NOTE(review): truncated -- return statements and the ptr increments
 * between the tsval/tsecr reads are missing lines; code left
 * byte-identical.
 */
596 static __inline__
int tcp_fast_parse_options(struct sock
*sk
, struct tcphdr
*th
, struct tcp_opt
*tp
)
598 /* If we didn't send out any options ignore them all. */
599 if (tp
->tcp_header_len
== sizeof(struct tcphdr
))
601 if (th
->doff
== sizeof(struct tcphdr
)>>2) {
/* Header is exactly fixed-size + aligned timestamp option. */
604 } else if (th
->doff
== (sizeof(struct tcphdr
)>>2)+(TCPOLEN_TSTAMP_ALIGNED
>>2)) {
605 __u32
*ptr
= (__u32
*)(th
+ 1);
606 if (*ptr
== __constant_ntohl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16)
607 | (TCPOPT_TIMESTAMP
<< 8) | TCPOLEN_TIMESTAMP
)) {
610 tp
->rcv_tsval
= ntohl(*ptr
);
612 tp
->rcv_tsecr
= ntohl(*ptr
);
/* Not the fast pattern: use the general parser. */
616 tcp_parse_options(sk
, th
, tp
, 0);
/* Flag bits returned up through the ACK-processing path. */
620 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
621 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
622 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
623 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
624 #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged new data. */
/* Leave fast-retransmit state: deflate cwnd back to ssthresh when the
 * window was inflated by >3 duplicate ACKs.
 * NOTE(review): truncated -- the trailing dup_acks reset is missing
 * lines; code left byte-identical.
 */
626 static __inline__
void clear_fast_retransmit(struct tcp_opt
*tp
)
628 if (tp
->dup_acks
> 3)
629 tp
->snd_cwnd
= (tp
->snd_ssthresh
);
634 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
635 * retransmit timer fires.
/* Reno/FACK/Hoe fast retransmit state machine, driven by each incoming
 * ACK.  not_dup carries FLAG_* bits: zero means a pure duplicate ACK.
 * NOTE(review): heavily truncated -- dup_acks increments, several
 * condition heads and the FACK branches are missing lines; code left
 * byte-identical.
 */
637 static void tcp_fast_retrans(struct sock
*sk
, u32 ack
, int not_dup
)
639 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
641 /* Note: If not_dup is set this implies we got a
642 * data carrying packet or a window update.
643 * This carries no new information about possible
644 * lost packets, so we have to ignore it for the purposes
645 * of counting duplicate acks. Ideally this does not imply we
646 * should stop our fast retransmit phase, more acks may come
647 * later without data to help us. Unfortunately this would make
648 * the code below much more complex. For now if I see such
649 * a packet I clear the fast retransmit phase.
/* A true duplicate ACK: same ack, data outstanding, no data/window info. */
651 if (ack
== tp
->snd_una
&& tp
->packets_out
&& (not_dup
== 0)) {
652 /* This is the standard reno style fast retransmit branch. */
654 /* 1. When the third duplicate ack is received, set ssthresh
655 * to one half the current congestion window, but no less
656 * than two segments. Retransmit the missing segment.
658 if (tp
->high_seq
== 0 || after(ack
, tp
->high_seq
)) {
660 if ((tp
->fackets_out
> 3) || (tp
->dup_acks
== 3)) {
661 tp
->snd_ssthresh
= tcp_recalc_ssthresh(tp
);
662 if (tp
->snd_ssthresh
> tp
->snd_cwnd_clamp
)
663 tp
->snd_ssthresh
= tp
->snd_cwnd_clamp
;
/* Inflate cwnd by the three dup-ACKed segments. */
664 tp
->snd_cwnd
= (tp
->snd_ssthresh
+ 3);
665 tp
->high_seq
= tp
->snd_nxt
;
667 tcp_retransmit_skb(sk
,
668 skb_peek(&sk
->write_queue
));
670 tcp_fack_retransmit(sk
);
671 tcp_reset_xmit_timer(sk
, TIME_RETRANS
, tp
->rto
);
673 } else if (++tp
->dup_acks
> 3) {
674 /* 2. Each time another duplicate ACK arrives, increment
675 * cwnd by the segment size. [...] Transmit a packet...
677 * Packet transmission will be done on normal flow processing
678 * since we're not in "retransmit mode". We do not use
679 * duplicate ACKs to artificially inflate the congestion
680 * window when doing FACK.
682 if(!tp
->fackets_out
) {
685 /* Fill any further holes which may have
688 * We may want to change this to run every
689 * further multiple-of-3 dup ack increments,
690 * to be more robust against out-of-order
691 * packet delivery. -DaveM
693 tcp_fack_retransmit(sk
);
696 } else if (tp
->high_seq
!= 0) {
697 /* In this branch we deal with clearing the Floyd style
698 * block on duplicate fast retransmits, and if requested
699 * we do Hoe style secondary fast retransmits.
701 if (!before(ack
, tp
->high_seq
) || (not_dup
& FLAG_DATA
) != 0) {
702 /* Once we have acked all the packets up to high_seq
703 * we are done this fast retransmit phase.
704 * Alternatively data arrived. In this case we
705 * Have to abort the fast retransmit attempt.
706 * Note that we do want to accept a window
707 * update since this is expected with Hoe's algorithm.
709 clear_fast_retransmit(tp
);
711 /* After we have cleared up to high_seq we can
712 * clear the Floyd style block.
714 if (!before(ack
, tp
->high_seq
)) {
718 } else if (tp
->dup_acks
>= 3) {
719 if (!tp
->fackets_out
) {
720 /* Hoe Style. We didn't ack the whole
721 * window. Take this as a cue that
722 * another packet was lost and retransmit it.
723 * Don't muck with the congestion window here.
724 * Note that we have to be careful not to
725 * act if this was a window update and it
726 * didn't ack new data, since this does
727 * not indicate a packet left the system.
728 * We can test this by just checking
729 * if ack changed from snd_una, since
730 * the only way to get here without advancing
731 * from snd_una is if this was a window update.
733 if (ack
!= tp
->snd_una
&& before(ack
, tp
->high_seq
)) {
734 tcp_retransmit_skb(sk
,
735 skb_peek(&sk
->write_queue
));
736 tcp_reset_xmit_timer(sk
, TIME_RETRANS
, tp
->rto
);
739 /* FACK style, fill any remaining holes in
742 tcp_fack_retransmit(sk
);
748 /* This is Jacobson's slow start and congestion avoidance.
749 * SIGCOMM '88, p. 328.
/* Open the congestion window: exponentially below ssthresh (slow start),
 * linearly above it (congestion avoidance via snd_cwnd_cnt).
 * NOTE(review): truncated -- the actual cwnd increments and the cwnd_cnt
 * reset are missing lines; code left byte-identical.
 */
751 static __inline__
void tcp_cong_avoid(struct tcp_opt
*tp
)
753 if (tp
->snd_cwnd
<= tp
->snd_ssthresh
) {
754 /* In "safe" area, increase. */
757 /* In dangerous area, increase slowly.
758 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
760 if (tp
->snd_cwnd_cnt
>= tp
->snd_cwnd
) {
761 if (tp
->snd_cwnd
< tp
->snd_cwnd_clamp
)
769 /* Remove acknowledged frames from the retransmission queue. */
/* Unlinks fully-ACKed skbs from the head of the write queue, returning
 * FLAG_* bits describing what was acknowledged and reporting seq/seq_rtt
 * for RTT sampling.
 * NOTE(review): truncated -- packets_out/retrans_out decrements, the
 * kfree_skb call and the return are missing lines; code left
 * byte-identical.
 */
770 static int tcp_clean_rtx_queue(struct sock
*sk
, __u32 ack
,
771 __u32
*seq
, __u32
*seq_rtt
)
773 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
775 __u32 now
= tcp_time_stamp
;
778 /* If we are retransmitting, and this ACK clears up to
779 * the retransmit head, or further, then clear our state.
781 if (tp
->retrans_head
!= NULL
&&
782 !before(ack
, TCP_SKB_CB(tp
->retrans_head
)->end_seq
))
783 tp
->retrans_head
= NULL
;
/* Walk acked frames at the queue head, stopping at unsent data. */
785 while((skb
=skb_peek(&sk
->write_queue
)) && (skb
!= tp
->send_head
)) {
786 struct tcp_skb_cb
*scb
= TCP_SKB_CB(skb
);
787 __u8 sacked
= scb
->sacked
;
789 /* If our packet is before the ack sequence we can
790 * discard it as it's confirmed to have arrived at
793 if (after(scb
->end_seq
, ack
))
796 /* Initial outgoing SYN's get put onto the write_queue
797 * just like anything else we transmit. It is not
798 * true data, and if we misinform our callers that
799 * this ACK acks real data, we will erroneously exit
800 * connection startup slow start one packet too
801 * quickly. This is severely frowned upon behavior.
803 if((sacked
& TCPCB_SACKED_RETRANS
) && tp
->retrans_out
)
805 if(!(scb
->flags
& TCPCB_FLAG_SYN
)) {
806 acked
|= FLAG_DATA_ACKED
;
807 if(sacked
& TCPCB_SACKED_RETRANS
)
808 acked
|= FLAG_RETRANS_DATA_ACKED
;
812 acked
|= FLAG_SYN_ACKED
;
813 /* This is pure paranoia. */
814 tp
->retrans_head
= NULL
;
/* RTT sample from the send timestamp of the last cleaned frame. */
818 *seq_rtt
= now
- scb
->when
;
819 __skb_unlink(skb
, skb
->list
);
/* Handle an ACK that answers a zero-window probe: if the window opened
 * enough to send the head frame, stop probing; otherwise back off and
 * rearm the probe timer.
 * NOTE(review): truncated -- backoff reset and enclosing else are missing
 * lines; code left byte-identical.
 */
825 static void tcp_ack_probe(struct sock
*sk
, __u32 ack
)
827 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
829 /* Our probe was answered. */
832 /* Was it a usable window open? */
834 /* should always be non-null */
835 if (tp
->send_head
!= NULL
&&
836 !before (ack
+ tp
->snd_wnd
, TCP_SKB_CB(tp
->send_head
)->end_seq
)) {
839 tcp_clear_xmit_timer(sk
, TIME_PROBE0
);
841 tcp_reset_xmit_timer(sk
, TIME_PROBE0
,
/* Exponentially backed-off probe interval, capped at 120s. */
842 min(tp
->rto
<< tp
->backoff
, 120*HZ
));
846 /* Should we open up the congestion window? */
/* Predicate: advance cwnd only for ACKs of new data, and not during
 * non-FACK fast retransmit (which manages cwnd itself).
 * NOTE(review): truncated -- the return statements for each branch are
 * missing lines; code left byte-identical.
 */
847 static __inline__
int should_advance_cwnd(struct tcp_opt
*tp
, int flag
)
849 /* Data must have been acked. */
850 if ((flag
& FLAG_DATA_ACKED
) == 0)
853 /* Some of the data acked was retransmitted somehow? */
854 if ((flag
& FLAG_RETRANS_DATA_ACKED
) != 0) {
855 /* We advance in all cases except during
856 * non-FACK fast retransmit/recovery.
858 if (tp
->fackets_out
!= 0 ||
859 tp
->retransmits
!= 0)
862 /* Non-FACK fast retransmit does it's own
863 * congestion window management, don't get
869 /* New non-retransmitted data acked, always advance. */
873 /* Read draft-ietf-tcplw-high-performance before mucking
874 * with this code. (Supersedes RFC1323)
/* RTT measurement from the echoed timestamp (TSecr) of an ACK that
 * advances the left edge of the send window.
 * NOTE(review): truncated -- the retransmit-state clearing and the
 * rto/bound recalculation calls are missing lines; code left
 * byte-identical.
 */
876 static void tcp_ack_saw_tstamp(struct sock
*sk
, struct tcp_opt
*tp
,
877 u32 seq
, u32 ack
, int flag
)
881 /* RTTM Rule: A TSecr value received in a segment is used to
882 * update the averaged RTT measurement only if the segment
883 * acknowledges some new data, i.e., only if it advances the
884 * left edge of the send window.
886 * See draft-ietf-tcplw-high-performance-00, section 3.3.
887 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
889 if (!(flag
& (FLAG_DATA_ACKED
|FLAG_SYN_ACKED
)))
892 seq_rtt
= tcp_time_stamp
- tp
->rcv_tsecr
;
893 tcp_rtt_estimator(tp
, seq_rtt
);
894 if (tp
->retransmits
) {
895 if (tp
->packets_out
== 0) {
902 /* Still retransmitting, use backoff */
904 tp
->rto
= tp
->rto
<< tp
->backoff
;
/* After data was ACKed: either continue a timeout-driven retransmit run,
 * or rearm the retransmit timer for the new queue head.
 * NOTE(review): truncated -- the else structure and bounds on "when" are
 * missing lines; code left byte-identical.
 */
913 static __inline__
void tcp_ack_packets_out(struct sock
*sk
, struct tcp_opt
*tp
)
915 struct sk_buff
*skb
= skb_peek(&sk
->write_queue
);
917 /* Some data was ACK'd, if still retransmitting (due to a
918 * timeout), resend more of the retransmit queue. The
919 * congestion window is handled properly by that code.
921 if (tp
->retransmits
) {
922 tcp_xmit_retransmit_queue(sk
);
923 tcp_reset_xmit_timer(sk
, TIME_RETRANS
, tp
->rto
);
/* Remaining time for the (new) oldest outstanding frame. */
925 __u32 when
= tp
->rto
- (tcp_time_stamp
- TCP_SKB_CB(skb
)->when
);
928 tcp_reset_xmit_timer(sk
, TIME_RETRANS
, when
);
932 /* This routine deals with incoming acks, but not outgoing ones. */
/* Central incoming-ACK handler: validates the ack, updates the send
 * window, cleans the retransmit queue, feeds RTT estimators, advances
 * cwnd and drives the fast-retransmit machinery.
 * NOTE(review): heavily truncated -- the dead-socket check, window
 * assignment, snd_una update and several braces are missing lines; code
 * left byte-identical.
 */
933 static int tcp_ack(struct sock
*sk
, struct tcphdr
*th
,
934 u32 ack_seq
, u32 ack
, int len
)
936 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
942 return(1); /* Dead, can't ack any more so why bother */
944 if (tp
->pending
== TIME_KEEPOPEN
)
947 tp
->rcv_tstamp
= tcp_time_stamp
;
949 /* If the ack is newer than sent or older than previous acks
950 * then we can probably ignore it.
952 if (after(ack
, tp
->snd_nxt
) || before(ack
, tp
->snd_una
))
953 goto uninteresting_ack
;
955 /* If there is data set flag 1 */
956 if (len
!= th
->doff
*4) {
958 tcp_delack_estimator(tp
);
961 /* Update our send window. */
963 /* This is the window update code as per RFC 793
964 * snd_wl{1,2} are used to prevent unordered
965 * segments from shrinking the window
967 if (before(tp
->snd_wl1
, ack_seq
) ||
968 (tp
->snd_wl1
== ack_seq
&& !after(tp
->snd_wl2
, ack
))) {
969 u32 nwin
= ntohs(th
->window
) << tp
->snd_wscale
;
971 if ((tp
->snd_wl2
!= ack
) || (nwin
> tp
->snd_wnd
)) {
972 flag
|= FLAG_WIN_UPDATE
;
975 tp
->snd_wl1
= ack_seq
;
978 if (nwin
> tp
->max_window
)
979 tp
->max_window
= nwin
;
983 /* We passed data and got it acked, remove any soft error
984 * log. Something worked...
988 /* If this ack opens up a zero window, clear backoff. It was
989 * being used to time the probes, and is probably far higher than
990 * it needs to be for normal retransmission.
992 if (tp
->pending
== TIME_PROBE0
)
993 tcp_ack_probe(sk
, ack
);
995 /* See if we can take anything off of the retransmit queue. */
996 flag
|= tcp_clean_rtx_queue(sk
, ack
, &seq
, &seq_rtt
);
998 /* We must do this here, before code below clears out important
999 * state contained in tp->fackets_out and tp->retransmits. -DaveM
1001 if (should_advance_cwnd(tp
, flag
))
1004 /* If we have a timestamp, we always do rtt estimates. */
1005 if (tp
->saw_tstamp
) {
1006 tcp_ack_saw_tstamp(sk
, tp
, seq
, ack
, flag
);
1008 /* If we were retransmitting don't count rtt estimate. */
1009 if (tp
->retransmits
) {
1010 if (tp
->packets_out
== 0) {
1011 tp
->retransmits
= 0;
1012 tp
->fackets_out
= 0;
1013 tp
->retrans_out
= 0;
1016 /* We don't have a timestamp. Can only use
1017 * packets that are not retransmitted to determine
1018 * rtt estimates. Also, we must not reset the
1019 * backoff for rto until we get a non-retransmitted
1020 * packet. This allows us to deal with a situation
1021 * where the network delay has increased suddenly.
1022 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
1024 if (flag
& (FLAG_DATA_ACKED
|FLAG_SYN_ACKED
)) {
1025 if(!(flag
& FLAG_RETRANS_DATA_ACKED
)) {
1027 tcp_rtt_estimator(tp
, seq_rtt
);
1035 if (tp
->packets_out
) {
1036 if (flag
& FLAG_DATA_ACKED
)
1037 tcp_ack_packets_out(sk
, tp
);
1039 tcp_clear_xmit_timer(sk
, TIME_RETRANS
);
/* Only duplicate-ACK-relevant bits feed the fast retransmit engine. */
1042 flag
&= (FLAG_DATA
| FLAG_WIN_UPDATE
);
1043 if ((ack
== tp
->snd_una
&& tp
->packets_out
&& flag
== 0) ||
1044 (tp
->high_seq
!= 0)) {
1045 tcp_fast_retrans(sk
, ack
, flag
);
1047 /* Clear any aborted fast retransmit starts. */
1050 /* It is not a brain fart, I thought a bit now. 8)
1052 * Forward progress is indicated, if:
1053 * 1. the ack acknowledges new data.
1054 * 2. or the ack is duplicate, but it is caused by new segment
1055 * arrival. This case is filtered by:
1056 * - it contains no data, syn or fin.
1057 * - it does not update window.
1058 * 3. or new SACK. It is difficult to check, so that we ignore it.
1060 * Forward progress is also indicated by arrival new data,
1061 * which was caused by window open from our side. This case is more
1062 * difficult and it is made (alas, incorrectly) in tcp_data_queue().
1065 if (ack
!= tp
->snd_una
|| (flag
== 0 && !th
->fin
))
1066 dst_confirm(sk
->dst_cache
);
1068 /* Remember the highest ack received. */
1073 SOCK_DEBUG(sk
, "Ack ignored %u %u\n", ack
, tp
->snd_nxt
);
1077 /* New-style handling of TIME_WAIT sockets. */
1079 /* Must be called only from BH context. */
/* Tear down a TIME-WAIT bucket: unlink it from the established hash and
 * from its bind bucket, freeing the bind bucket if it became empty.
 * NOTE(review): truncated -- unlink condition heads and the final
 * refcount release are missing lines; code left byte-identical.
 */
1080 void tcp_timewait_kill(struct tcp_tw_bucket
*tw
)
1082 struct tcp_ehash_bucket
*ehead
;
1083 struct tcp_bind_hashbucket
*bhead
;
1084 struct tcp_bind_bucket
*tb
;
1086 /* Unlink from established hashes. */
1087 ehead
= &tcp_ehash
[tw
->hashent
];
1088 write_lock(&ehead
->lock
);
1090 write_unlock(&ehead
->lock
);
/* Classic doubly-linked pprev unlink. */
1094 tw
->next
->pprev
= tw
->pprev
;
1095 *(tw
->pprev
) = tw
->next
;
1097 write_unlock(&ehead
->lock
);
1099 /* Disassociate with bind bucket. */
1100 bhead
= &tcp_bhash
[tcp_bhashfn(tw
->num
)];
1101 spin_lock(&bhead
->lock
);
1102 if ((tb
= tw
->tb
) != NULL
) {
1104 tw
->bind_next
->bind_pprev
= tw
->bind_pprev
;
1105 *(tw
->bind_pprev
) = tw
->bind_next
;
/* Free the bind bucket once its owner list drains. */
1107 if (tb
->owners
== NULL
) {
1109 tb
->next
->pprev
= tb
->pprev
;
1110 *(tb
->pprev
) = tb
->next
;
1111 kmem_cache_free(tcp_bucket_cachep
, tb
);
1114 spin_unlock(&bhead
->lock
);
1116 #ifdef INET_REFCNT_DEBUG
1117 if (atomic_read(&tw
->refcnt
) != 1) {
1118 printk(KERN_DEBUG
"tw_bucket %p refcnt=%d\n", tw
, atomic_read(&tw
->refcnt
));
1124 /* We come here as a special case from the AF specific TCP input processing,
1125 * and the SKB has no owner. Essentially handling this is very simple,
1126 * we just keep silently eating rx'd packets until none show up for the
1127 * entire timeout period. The only special cases are for BSD TIME_WAIT
1128 * reconnects and SYN/RST bits being set in the TCP header.
1132 * * Main purpose of TIME-WAIT state is to close connection gracefully,
1133 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
1134 * (and, probably, tail of data) and one or more our ACKs are lost.
1135 * * What is TIME-WAIT timeout? It is associated with maximal packet
1136 * lifetime in the internet, which results in wrong conclusion, that
1137 * it is set to catch "old duplicate segments" wandering out of their path.
1138 * It is not quite correct. This timeout is calculated so that it exceeds
1139 * maximal retransmission timeout enough to allow to lose one (or more)
1140 * segments sent by peer and our ACKs. This time may be calculated from RTO.
1141 * * When TIME-WAIT socket receives RST, it means that another end
1142 * finally closed and we are allowed to kill TIME-WAIT too.
1143 * * Second purpose of TIME-WAIT is catching old duplicate segments.
1144 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
1145 * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
1146 * * If we invented some more clever way to catch duplicates
1147 * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
1149 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
1150 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
1151 * from the very beginning.
/*
 * Process a segment that arrived for a connection sitting in TIME-WAIT.
 * Returns a TCP_TW_* verdict telling the caller what to do with the
 * segment (accept a new SYN, ACK it, or nothing).  The bucket itself is
 * released by the caller, never here.
 */
1154 tcp_timewait_state_process(struct tcp_tw_bucket
*tw
, struct sk_buff
*skb
,
1155 struct tcphdr
*th
, unsigned len
)
1158 int paws_reject
= 0;
1161 * "When a connection is [...] on TIME-WAIT state [...]
1162 * [a TCP] MAY accept a new SYN from the remote TCP to
1163 * reopen the connection directly, if it:
1165 * (1) assigns its initial sequence number for the new
1166 * connection to be larger than the largest sequence
1167 * number it used on the previous connection incarnation,
1170 * (2) returns to TIME-WAIT state if the SYN turns out
1171 * to be an old duplicate".
/* PAWS (RFC1323): a timestamp older than ts_recent within the 24-day
 * wraparound window marks the segment as a delayed old duplicate. */
1175 if (th
->doff
> (sizeof(struct tcphdr
)>>2) && tw
->ts_recent_stamp
) {
1176 tcp_parse_options(NULL
, th
, &tp
, 0);
1178 paws_reject
= tp
.saw_tstamp
&&
1179 ((s32
)(tp
.rcv_tsval
- tw
->ts_recent
) < 0 &&
1180 xtime
.tv_sec
< tw
->ts_recent_stamp
+ PAWS_24DAYS
);
1184 (TCP_SKB_CB(skb
)->seq
== TCP_SKB_CB(skb
)->end_seq
&&
1185 TCP_SKB_CB(skb
)->seq
== tw
->rcv_nxt
)) {
1186 /* In window segment, it may be only reset or bare ack. */
1189 #ifdef CONFIG_TCP_TW_RECYCLE
1190 /* When recycling, always follow rfc1337,
1191 * but mark bucket as ready to recycling immediately.
1193 if (sysctl_tcp_tw_recycle
) {
1194 /* May kill it now. */
1199 /* This is TIME_WAIT assassination, in two flavors.
1200 * Oh well... nobody has a sufficient solution to this
/* sysctl_tcp_rfc1337 == 0: allow an in-window RST to kill TIME-WAIT
 * (the RFC1337 hazard); otherwise just re-arm the timer and ignore. */
1203 if(sysctl_tcp_rfc1337
== 0) {
1204 tcp_tw_deschedule(tw
);
1205 tcp_timewait_kill(tw
);
1208 tcp_tw_reschedule(tw
);
/* Remember peer's newest timestamp for future PAWS checks. */
1211 if (tp
.saw_tstamp
) {
1212 tw
->ts_recent
= tp
.rcv_tsval
;
1213 tw
->ts_recent_stamp
= xtime
.tv_sec
;
1216 return TCP_TW_SUCCESS
;
1219 /* Out of window segment.
1221 All the segments are ACKed immediately.
1223 The only exception is new SYN. We accept it, if it is
1224 not old duplicate and we are not in danger to be killed
1225 by delayed old duplicates. RFC check is that it has
1226 newer sequence number works at rates <40Mbit/sec.
1227 However, if paws works, it is reliable AND even more,
1228 we even may relax silly seq space cutoff.
1230 RED-PEN: we violate main RFC requirement, if this SYN will appear
1231 old duplicate (i.e. we receive RST in reply to SYN-ACK),
1232 we must return socket to time-wait state. It is not good,
/* Pure SYN (no RST/ACK) that is provably new: let the caller reopen
 * the connection with an ISN above everything we used before. */
1236 if (th
->syn
&& !th
->rst
&& !th
->ack
&& !paws_reject
&&
1237 (after(TCP_SKB_CB(skb
)->seq
, tw
->rcv_nxt
) ||
1238 (tp
.saw_tstamp
&& tw
->ts_recent
!= tp
.rcv_tsval
))) {
1239 u32 isn
= tw
->snd_nxt
+ 2;
1242 TCP_SKB_CB(skb
)->when
= isn
;
1247 /* In this case we must reset the TIMEWAIT timer.
1249 If it is ACKless SYN it may be both old duplicate
1250 and new good SYN with random sequence number <rcv_nxt.
1251 Do not reschedule in the last case.
1253 if (paws_reject
|| th
->ack
) {
1254 tcp_tw_reschedule(tw
);
1255 #ifdef CONFIG_TCP_TW_RECYCLE
/* Exponential backoff on the recycle timeout, capped at 120s. */
1256 tw
->rto
= min(120*HZ
, tw
->rto
<<1);
1257 tw
->ttd
= jiffies
+ tw
->rto
;
1261 /* Send ACK. Note, we do not put the bucket,
1262 * it will be released by caller.
1267 return TCP_TW_SUCCESS
;
1270 /* Enter the time wait state. This is always called from BH
1271 * context. Essentially we whip up a timewait bucket, copy the
1272 * relevant info into it from the SK, and mess with hash chains
1275 static void __tcp_tw_hashdance(struct sock
*sk
, struct tcp_tw_bucket
*tw
)
1277 struct tcp_ehash_bucket
*ehead
= &tcp_ehash
[sk
->hashent
];
1278 struct tcp_bind_hashbucket
*bhead
;
1279 struct sock
**head
, *sktw
;
/* The ehash bucket lock covers both the unlink of sk and the insert
 * of tw, so lookups never see the connection in neither table. */
1281 write_lock(&ehead
->lock
);
1283 /* Step 1: Remove SK from established hash. */
1286 sk
->next
->pprev
= sk
->pprev
;
1287 *sk
->pprev
= sk
->next
;
1291 /* Step 2: Hash TW into TIMEWAIT half of established hash table. */
1292 head
= &(ehead
+ tcp_ehash_size
)->chain
;
1293 sktw
= (struct sock
*)tw
;
1294 if((sktw
->next
= *head
) != NULL
)
1295 (*head
)->pprev
= &sktw
->next
;
/* The hash chain now holds a reference on the bucket. */
1298 atomic_inc(&tw
->refcnt
);
1300 write_unlock(&ehead
->lock
);
1302 /* Step 3: Put TW into bind hash. Original socket stays there too.
1303 Note, that any socket with sk->num!=0 MUST be bound in binding
1304 cache, even if it is closed.
1306 bhead
= &tcp_bhash
[tcp_bhashfn(sk
->num
)];
1307 spin_lock(&bhead
->lock
);
/* sk->prev stores the bind bucket pointer for bound sockets. */
1308 tw
->tb
= (struct tcp_bind_bucket
*)sk
->prev
;
1309 BUG_TRAP(sk
->prev
!=NULL
);
1310 if ((tw
->bind_next
= tw
->tb
->owners
) != NULL
)
1311 tw
->tb
->owners
->bind_pprev
= &tw
->bind_next
;
1312 tw
->tb
->owners
= (struct sock
*)tw
;
1313 tw
->bind_pprev
= &tw
->tb
->owners
;
1314 spin_unlock(&bhead
->lock
);
1316 /* Step 4: Un-charge protocol socket in-use count. */
1321 * Move a socket to time-wait.
/*
 * Allocate a compact tcp_tw_bucket, copy the connection identity and
 * sequence/timestamp state into it, splice it into the hash tables in
 * place of sk, and start the TIME-WAIT timer.  On allocation failure
 * the socket is simply closed non-gracefully.
 */
1323 void tcp_time_wait(struct sock
*sk
)
1325 struct tcp_tw_bucket
*tw
;
/* NOTE(review): the NULL check on tw appears to have been lost in this
 * extract; the out-of-memory branch further below implies one exists
 * between the alloc and the first dereference -- confirm against the
 * full file. */
1327 tw
= kmem_cache_alloc(tcp_timewait_cachep
, SLAB_ATOMIC
);
1329 /* Give us an identity. */
1330 tw
->daddr
= sk
->daddr
;
1331 tw
->rcv_saddr
= sk
->rcv_saddr
;
1332 tw
->bound_dev_if
= sk
->bound_dev_if
;
1334 tw
->state
= TCP_TIME_WAIT
;
1335 tw
->sport
= sk
->sport
;
1336 tw
->dport
= sk
->dport
;
1337 tw
->family
= sk
->family
;
1338 tw
->reuse
= sk
->reuse
;
1339 tw
->hashent
= sk
->hashent
;
/* Sequence numbers and timestamp state needed to validate/ACK late
 * segments and to decide whether a new SYN may reopen the port. */
1340 tw
->rcv_nxt
= sk
->tp_pinfo
.af_tcp
.rcv_nxt
;
1341 tw
->snd_nxt
= sk
->tp_pinfo
.af_tcp
.snd_nxt
;
1342 tw
->ts_recent
= sk
->tp_pinfo
.af_tcp
.ts_recent
;
1343 tw
->ts_recent_stamp
= sk
->tp_pinfo
.af_tcp
.ts_recent_stamp
;
1344 #ifdef CONFIG_TCP_TW_RECYCLE
1345 tw
->rto
= sk
->tp_pinfo
.af_tcp
.rto
;
1346 tw
->ttd
= jiffies
+ 2*tw
->rto
;
1348 atomic_set(&tw
->refcnt
, 0);
1350 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1351 if(tw
->family
== PF_INET6
) {
1352 memcpy(&tw
->v6_daddr
,
1353 &sk
->net_pinfo
.af_inet6
.daddr
,
1354 sizeof(struct in6_addr
));
1355 memcpy(&tw
->v6_rcv_saddr
,
1356 &sk
->net_pinfo
.af_inet6
.rcv_saddr
,
1357 sizeof(struct in6_addr
));
1360 /* Linkage updates. */
1361 __tcp_tw_hashdance(sk
, tw
);
1363 /* Get the TIME_WAIT timeout firing. */
1364 tcp_tw_schedule(tw
);
1367 if(sk
->state
== TCP_ESTABLISHED
)
1368 tcp_statistics
.TcpCurrEstab
--;
1369 sk
->state
= TCP_CLOSE
;
1371 /* Sorry, we're out of memory, just CLOSE this
1372 * socket up. We've got bigger problems than
1373 * non-graceful socket closings.
1375 tcp_set_state(sk
, TCP_CLOSE
);
/* Either way, record route metrics and stop all TCP timers on sk. */
1378 tcp_update_metrics(sk
);
1379 tcp_clear_xmit_timers(sk
);
1384 * Process the FIN bit. This now behaves as it is supposed to work
1385 * and the FIN takes effect when it is validly part of sequence
1386 * space. Not before when we get holes.
1388 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1389 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1392 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1393 * close and we go into CLOSING (and later onto TIME-WAIT)
1395 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1398 static void tcp_fin(struct sk_buff
*skb
, struct sock
*sk
, struct tcphdr
*th
)
/* Record where the FIN sits in sequence space. */
1400 sk
->tp_pinfo
.af_tcp
.fin_seq
= TCP_SKB_CB(skb
)->end_seq
;
/* Wake sleeping readers and async waiters so they observe EOF. */
1405 wake_up_interruptible(sk
->sleep
);
1406 sock_wake_async(sk
->socket
, 1);
1411 case TCP_ESTABLISHED
:
1412 /* Move to CLOSE_WAIT */
1413 tcp_set_state(sk
, TCP_CLOSE_WAIT
);
1416 case TCP_CLOSE_WAIT
:
1418 /* Received a retransmission of the FIN, do
1423 /* RFC793: Remain in the LAST-ACK state. */
1427 /* This case occurs when a simultaneous close
1428 * happens, we must ack the received FIN and
1429 * enter the CLOSING state.
1431 tcp_set_state(sk
, TCP_CLOSING
);
1434 /* Received a FIN -- send ACK and enter TIME_WAIT. */
1438 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1439 * cases we should never reach this piece of code.
1441 printk("tcp_fin: Impossible, sk->state=%d\n", sk
->state
);
1446 /* These routines update the SACK block as out-of-order packets arrive or
1447 * in-order packets close up the sequence space.
/*
 * After SP was grown or shrunk, merge it with any other SACK block
 * whose sequence space it now touches, then close the hole left in
 * the selective_acks[] array by shifting later blocks up one slot.
 */
1449 static void tcp_sack_maybe_coalesce(struct tcp_opt
*tp
, struct tcp_sack_block
*sp
)
1451 int this_sack
, num_sacks
= tp
->num_sacks
;
1452 struct tcp_sack_block
*swalk
= &tp
->selective_acks
[0];
1454 /* If more than one SACK block, see if the recent change to SP eats into
1455 * or hits the sequence space of other SACK blocks, if so coalesce.
1457 if(num_sacks
!= 1) {
1458 for(this_sack
= 0; this_sack
< num_sacks
; this_sack
++, swalk
++) {
1462 /* First case, bottom of SP moves into top of the
1463 * sequence space of SWALK.
1465 if(between(sp
->start_seq
, swalk
->start_seq
, swalk
->end_seq
)) {
1466 sp
->start_seq
= swalk
->start_seq
;
1469 /* Second case, top of SP moves into bottom of the
1470 * sequence space of SWALK.
1472 if(between(sp
->end_seq
, swalk
->start_seq
, swalk
->end_seq
)) {
1473 sp
->end_seq
= swalk
->end_seq
;
1478 /* SP is the only SACK, or no coalescing cases found. */
1482 /* Zap SWALK, by moving every further SACK up by one slot.
1483 * Decrease num_sacks.
1485 for(; this_sack
< num_sacks
-1; this_sack
++, swalk
++) {
1486 struct tcp_sack_block
*next
= (swalk
+ 1);
1487 swalk
->start_seq
= next
->start_seq
;
1488 swalk
->end_seq
= next
->end_seq
;
1493 static __inline__
void tcp_sack_swap(struct tcp_sack_block
*sack1
, struct tcp_sack_block
*sack2
)
1497 tmp
= sack1
->start_seq
;
1498 sack1
->start_seq
= sack2
->start_seq
;
1499 sack2
->start_seq
= tmp
;
1501 tmp
= sack1
->end_seq
;
1502 sack1
->end_seq
= sack2
->end_seq
;
1503 sack2
->end_seq
= tmp
;
/*
 * Account a newly queued out-of-order skb in the SACK block array:
 * extend an adjacent block when possible, otherwise build a fresh
 * head block, dropping the oldest one if the array is full.
 */
1506 static void tcp_sack_new_ofo_skb(struct sock
*sk
, struct sk_buff
*skb
)
1508 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1509 struct tcp_sack_block
*sp
= &tp
->selective_acks
[0];
1510 int cur_sacks
= tp
->num_sacks
;
1515 /* Optimize for the common case, new ofo frames arrive
1516 * "in order". ;-) This also satisfies the requirements
1517 * of RFC2018 about ordering of SACKs.
1519 if(sp
->end_seq
== TCP_SKB_CB(skb
)->seq
) {
1520 sp
->end_seq
= TCP_SKB_CB(skb
)->end_seq
;
1521 tcp_sack_maybe_coalesce(tp
, sp
);
1522 } else if(sp
->start_seq
== TCP_SKB_CB(skb
)->end_seq
) {
1523 /* Re-ordered arrival, in this case, can be optimized
1526 sp
->start_seq
= TCP_SKB_CB(skb
)->seq
;
1527 tcp_sack_maybe_coalesce(tp
, sp
);
1529 struct tcp_sack_block
*swap
= sp
+ 1;
/* With timestamps enabled only 3 SACK blocks fit in the options. */
1530 int this_sack
, max_sacks
= (tp
->tstamp_ok
? 3 : 4);
1532 /* Oh well, we have to move things around.
1533 * Try to find a SACK we can tack this onto.
1536 for(this_sack
= 1; this_sack
< cur_sacks
; this_sack
++, swap
++) {
1537 if((swap
->end_seq
== TCP_SKB_CB(skb
)->seq
) ||
1538 (swap
->start_seq
== TCP_SKB_CB(skb
)->end_seq
)) {
1539 if(swap
->end_seq
== TCP_SKB_CB(skb
)->seq
)
1540 swap
->end_seq
= TCP_SKB_CB(skb
)->end_seq
;
1542 swap
->start_seq
= TCP_SKB_CB(skb
)->seq
;
/* Promote the grown block to the head slot (RFC2018 ordering). */
1543 tcp_sack_swap(sp
, swap
);
1544 tcp_sack_maybe_coalesce(tp
, sp
);
1549 /* Could not find an adjacent existing SACK, build a new one,
1550 * put it at the front, and shift everyone else down. We
1551 * always know there is at least one SACK present already here.
1553 * If the sack array is full, forget about the last one.
1555 if (cur_sacks
>= max_sacks
) {
1559 while(cur_sacks
>= 1) {
1560 struct tcp_sack_block
*this = &tp
->selective_acks
[cur_sacks
];
1561 struct tcp_sack_block
*prev
= (this - 1);
1562 this->start_seq
= prev
->start_seq
;
1563 this->end_seq
= prev
->end_seq
;
1568 /* Build the new head SACK, and we're done. */
1569 sp
->start_seq
= TCP_SKB_CB(skb
)->seq
;
1570 sp
->end_seq
= TCP_SKB_CB(skb
)->end_seq
;
/*
 * An skb is leaving the out-of-order world (delivered in sequence or
 * moved to the receive queue); trim its range off the front of the
 * SACK block that covers it, deleting the block if it becomes empty.
 */
1575 static void tcp_sack_remove_skb(struct tcp_opt
*tp
, struct sk_buff
*skb
)
1577 struct tcp_sack_block
*sp
= &tp
->selective_acks
[0];
1578 int num_sacks
= tp
->num_sacks
;
1581 /* This is an in order data segment _or_ an out-of-order SKB being
1582 * moved to the receive queue, so we know this removed SKB will eat
1583 * from the front of a SACK.
1585 for(this_sack
= 0; this_sack
< num_sacks
; this_sack
++, sp
++) {
1586 /* Check if the start of the sack is covered by skb. */
1587 if(!before(sp
->start_seq
, TCP_SKB_CB(skb
)->seq
) &&
1588 before(sp
->start_seq
, TCP_SKB_CB(skb
)->end_seq
))
1592 /* This should only happen if so many SACKs get built that some get
1593 * pushed out before we get here, or we eat some in sequence packets
1594 * which are before the first SACK block.
1596 if(this_sack
>= num_sacks
)
/* Advance the block past the consumed skb. */
1599 sp
->start_seq
= TCP_SKB_CB(skb
)->end_seq
;
1600 if(!before(sp
->start_seq
, sp
->end_seq
)) {
1601 /* Zap this SACK, by moving forward any other SACKS. */
1602 for(this_sack
+= 1; this_sack
< num_sacks
; this_sack
++, sp
++) {
1603 struct tcp_sack_block
*next
= (sp
+ 1);
1604 sp
->start_seq
= next
->start_seq
;
1605 sp
->end_seq
= next
->end_seq
;
/*
 * new_skb replaced old_skb in the out-of-order queue (same start, new
 * end); grow the matching SACK block's end_seq accordingly.  Silently
 * does nothing if no block ends where old_skb ended.
 */
1611 static void tcp_sack_extend(struct tcp_opt
*tp
, struct sk_buff
*old_skb
, struct sk_buff
*new_skb
)
1613 struct tcp_sack_block
*sp
= &tp
->selective_acks
[0];
1614 int num_sacks
= tp
->num_sacks
;
1617 for(this_sack
= 0; this_sack
< num_sacks
; this_sack
++, sp
++) {
1618 if(sp
->end_seq
== TCP_SKB_CB(old_skb
)->end_seq
)
1621 if(this_sack
>= num_sacks
)
1623 sp
->end_seq
= TCP_SKB_CB(new_skb
)->end_seq
;
1626 /* This one checks to see if we can put data from the
1627 * out_of_order queue into the receive_queue.
1629 static void tcp_ofo_queue(struct sock
*sk
)
1631 struct sk_buff
*skb
;
1632 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
/* Drain the head of the ofo queue while it is contiguous with rcv_nxt. */
1634 while ((skb
= skb_peek(&tp
->out_of_order_queue
))) {
/* Still a hole before this segment -- stop. */
1635 if (after(TCP_SKB_CB(skb
)->seq
, tp
->rcv_nxt
))
/* Entirely below rcv_nxt: a duplicate, drop it. */
1638 if (!after(TCP_SKB_CB(skb
)->end_seq
, tp
->rcv_nxt
)) {
1639 SOCK_DEBUG(sk
, "ofo packet was already received \n");
1640 __skb_unlink(skb
, skb
->list
);
1644 SOCK_DEBUG(sk
, "ofo requeuing : rcv_next %X seq %X - %X\n",
1645 tp
->rcv_nxt
, TCP_SKB_CB(skb
)->seq
,
1646 TCP_SKB_CB(skb
)->end_seq
);
/* Move it to the receive queue and advance rcv_nxt past it. */
1649 tcp_sack_remove_skb(tp
, skb
);
1650 __skb_unlink(skb
, skb
->list
);
1651 __skb_queue_tail(&sk
->receive_queue
, skb
);
1652 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->end_seq
;
1654 tcp_fin(skb
, sk
, skb
->h
.th
);
/*
 * Sort an incoming data segment onto the right queue: in-sequence data
 * goes to sk->receive_queue (possibly pulling more from the ofo queue
 * behind it), duplicates are ACKed immediately, and everything else is
 * inserted, sorted by sequence number, into out_of_order_queue with
 * matching SACK bookkeeping.
 */
1658 static void tcp_data_queue(struct sock
*sk
, struct sk_buff
*skb
)
1660 struct sk_buff
*skb1
;
1661 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1663 /* Queue data for delivery to the user.
1664 * Packets in sequence go to the receive queue.
1665 * Out of sequence packets to the out_of_order_queue.
1667 if (TCP_SKB_CB(skb
)->seq
== tp
->rcv_nxt
) {
1668 /* Ok. In sequence. */
1670 dst_confirm(sk
->dst_cache
);
1671 __skb_queue_tail(&sk
->receive_queue
, skb
);
1672 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->end_seq
;
1673 if(skb
->h
.th
->fin
) {
1674 tcp_fin(skb
, sk
, skb
->h
.th
);
1676 tcp_remember_ack(tp
, skb
->h
.th
, skb
);
1678 /* This may have eaten into a SACK block. */
1679 if(tp
->sack_ok
&& tp
->num_sacks
)
1680 tcp_sack_remove_skb(tp
, skb
);
1683 /* Turn on fast path. */
/* Header prediction is only safe once no holes remain. */
1684 if (skb_queue_len(&tp
->out_of_order_queue
) == 0)
1685 tp
->pred_flags
= htonl(((tp
->tcp_header_len
>> 2) << 28) |
1686 ntohl(TCP_FLAG_ACK
) |
1691 /* An old packet, either a retransmit or some packet got lost. */
1692 if (!after(TCP_SKB_CB(skb
)->end_seq
, tp
->rcv_nxt
)) {
1693 /* A retransmit, 2nd most common case. Force an immediate ack. */
1694 SOCK_DEBUG(sk
, "retransmit received: seq %X\n", TCP_SKB_CB(skb
)->seq
);
1695 tcp_enter_quickack_mode(tp
);
1700 if (before(TCP_SKB_CB(skb
)->seq
, tp
->rcv_nxt
)) {
1701 /* Partial packet, seq < rcv_next < end_seq */
1702 SOCK_DEBUG(sk
, "partial packet: rcv_next %X seq %X - %X\n",
1703 tp
->rcv_nxt
, TCP_SKB_CB(skb
)->seq
,
1704 TCP_SKB_CB(skb
)->end_seq
);
1709 /* Ok. This is an out_of_order segment, force an ack. */
1711 tcp_enter_quickack_mode(tp
);
1713 /* Disable header prediction. */
1716 SOCK_DEBUG(sk
, "out of order segment: rcv_next %X seq %X - %X\n",
1717 tp
->rcv_nxt
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
);
1719 if (skb_peek(&tp
->out_of_order_queue
) == NULL
) {
1720 /* Initial out of order segment, build 1 SACK. */
1723 tp
->selective_acks
[0].start_seq
= TCP_SKB_CB(skb
)->seq
;
1724 tp
->selective_acks
[0].end_seq
= TCP_SKB_CB(skb
)->end_seq
;
1726 __skb_queue_head(&tp
->out_of_order_queue
,skb
);
/* Walk the ofo queue from the tail to find the insertion point. */
1728 for(skb1
=tp
->out_of_order_queue
.prev
; ; skb1
= skb1
->prev
) {
1729 /* Already there. */
1730 if (TCP_SKB_CB(skb
)->seq
== TCP_SKB_CB(skb1
)->seq
) {
/* Same start, at least as long: replace the queued skb. */
1731 if (skb
->len
>= skb1
->len
) {
1733 tcp_sack_extend(tp
, skb1
, skb
);
1734 __skb_append(skb1
, skb
);
1735 __skb_unlink(skb1
, skb1
->list
);
1738 /* A duplicate, smaller than what is in the
1739 * out-of-order queue right now, toss it.
1746 if (after(TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb1
)->seq
)) {
1747 __skb_append(skb1
, skb
);
1749 tcp_sack_new_ofo_skb(sk
, skb
);
1753 /* See if we've hit the start. If so insert. */
1754 if (skb1
== skb_peek(&tp
->out_of_order_queue
)) {
1755 __skb_queue_head(&tp
->out_of_order_queue
,skb
);
1757 tcp_sack_new_ofo_skb(sk
, skb
);
1766 * This routine handles the data. If there is room in the buffer,
1767 * it will have already been moved into it. If there is no
1768 * room, then we will just have to discard the packet.
1771 static int tcp_data(struct sk_buff
*skb
, struct sock
*sk
, unsigned int len
)
1774 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
/* Strip the TCP header so only payload remains in the skb. */
1777 skb_pull(skb
, th
->doff
*4);
1778 skb_trim(skb
, len
- (th
->doff
*4));
/* Pure ACK without FIN carries nothing for the receive path. */
1780 if (skb
->len
== 0 && !th
->fin
)
1784 * If our receive queue has grown past its limits shrink it.
1785 * Make sure to do this before moving snd_nxt, otherwise
1786 * data might be acked for that we don't have enough room.
1788 if (atomic_read(&sk
->rmem_alloc
) > sk
->rcvbuf
) {
1789 if (prune_queue(sk
) < 0) {
1790 /* Still not enough room. That can happen when
1791 * skb->true_size differs significantly from skb->len.
1797 tcp_data_queue(sk
, skb
);
/* Sanity check: never ACK data the user has not room for. */
1799 if (before(tp
->rcv_nxt
, tp
->copied_seq
)) {
1800 printk(KERN_DEBUG
"*** tcp.c:tcp_data bug acked < copied\n");
1801 tp
->rcv_nxt
= tp
->copied_seq
;
1804 /* Above, tcp_data_queue() increments delayed_acks appropriately.
1805 * Now tell the user we may have some data.
1808 wake_up_interruptible(sk
->sleep
);
1809 sock_wake_async(sk
->socket
,1);
/*
 * Decide whether pending output (starting at skb) may be sent now:
 * it must fit in the peer's window and in the congestion window.
 * If nothing is in flight and nothing fits, arm the zero-window probe
 * timer instead.
 */
1814 static void __tcp_data_snd_check(struct sock
*sk
, struct sk_buff
*skb
)
1816 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1818 if (!after(TCP_SKB_CB(skb
)->end_seq
, tp
->snd_una
+ tp
->snd_wnd
) &&
1819 tcp_packets_in_flight(tp
) < tp
->snd_cwnd
) {
1820 /* Put more data onto the wire. */
1822 } else if (tp
->packets_out
== 0 && !tp
->pending
) {
1823 /* Start probing the receivers window. */
1824 tcp_reset_xmit_timer(sk
, TIME_PROBE0
, tp
->rto
);
/* Convenience wrapper: run the send check against the head of the
 * unsent queue, if any. */
1828 static __inline__
void tcp_data_snd_check(struct sock
*sk
)
1830 struct sk_buff
*skb
= sk
->tp_pinfo
.af_tcp
.send_head
;
1833 __tcp_data_snd_check(sk
, skb
);
1837 * Adapt the MSS value used to make delayed ack decision to the
1840 * The constant 536 hasn't any good meaning. In IPv4 world
1841 * MTU may be smaller, though it contradicts to RFC1122, which
1842 * states that MSS must be at least 536.
1843 * We use the constant to do not ACK each second
1844 * packet in a stream of tiny size packets.
1845 * It means that super-low mtu links will be aggressively delacked.
1846 * Seems, it is even good. If they have so low mtu, they are weirdly
1849 * AK: BTW it may be useful to add an option to lock the rcv_mss.
1850 * this way the beowulf people wouldn't need ugly patches to get the
1851 * ack frequencies they want and it would be an elegant way to tune delack.
1853 static __inline__
void tcp_measure_rcv_mss(struct sock
*sk
, struct sk_buff
*skb
)
1855 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1856 unsigned int len
, lss
;
1858 lss
= tp
->last_seg_size
;
1859 tp
->last_seg_size
= 0;
1861 /* skb->len may jitter because of SACKs, even if peer
1862 * sends good full-sized frames.
1865 if (len
>= tp
->rcv_mss
) {
1868 /* Otherwise, we make more careful check taking into account,
1869 * that SACKs block is variable.
1871 * "len" is invariant segment length, including TCP header.
1873 len
= skb
->tail
- skb
->h
.raw
;
1874 if (len
>= 536 + sizeof(struct tcphdr
)) {
1875 /* Subtract also invariant (if peer is RFC compliant),
1876 * tcp header plus fixed timestamp option length.
1877 * Resulting "len" is MSS free of SACK jitter.
1879 len
-= tp
->tcp_header_len
;
1882 tp
->last_seg_size
= len
;
1888 * Check if sending an ack is needed.
1890 static __inline__
void __tcp_ack_snd_check(struct sock
*sk
, int ofo_possible
)
1892 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1894 /* This also takes care of updating the window.
1895 * This if statement needs to be simplified.
1897 * Rules for delaying an ack:
1898 * - delay time <= 0.5 HZ
1899 * - we don't have a window update to send
1900 * - must send at least every 2 full sized packets
1901 * - must send an ACK if we have any out of order data
1903 * With an extra heuristic to handle loss of packet
1904 * situations and also helping the sender leave slow
1905 * start in an expedient manner.
1908 /* Two full frames received or... */
1909 if (((tp
->rcv_nxt
- tp
->rcv_wup
) >= tp
->rcv_mss
* MAX_DELAY_ACK
) ||
1910 /* We will update the window "significantly" or... */
1911 tcp_raise_window(sk
) ||
1912 /* We entered "quick ACK" mode or... */
1913 tcp_in_quickack_mode(tp
) ||
1914 /* We have out of order data */
1915 (ofo_possible
&& (skb_peek(&tp
->out_of_order_queue
) != NULL
))) {
1916 /* Then ack it now */
1919 /* Else, send delayed ack. */
1920 tcp_send_delayed_ack(sk
, HZ
/2);
/* ACK-sending check for callers that may have out-of-order data
 * queued; skips entirely when an ACK already rode out on data. */
1924 static __inline__
void tcp_ack_snd_check(struct sock
*sk
)
1926 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1927 if (tp
->delayed_acks
== 0) {
1928 /* We sent a data segment already. */
1931 __tcp_ack_snd_check(sk
, 1);
1936 * This routine is only called when we have urgent data
1937 * signalled. It's the 'slow' part of tcp_urg. It could be
1938 * moved inline now as tcp_urg is only called from one
1939 * place. We handle URGent data wrong. We have to - as
1940 * BSD still doesn't use the correction from RFC961.
1941 * For 1003.1g we should support a new option TCP_STDURG to permit
1942 * either form (or just set the sysctl tcp_stdurg).
1945 static void tcp_check_urg(struct sock
* sk
, struct tcphdr
* th
)
1947 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1948 u32 ptr
= ntohs(th
->urg_ptr
);
/* BSD-compatible interpretation unless tcp_stdurg is set. */
1950 if (ptr
&& !sysctl_tcp_stdurg
)
/* Convert the relative urgent pointer to an absolute sequence number. */
1952 ptr
+= ntohl(th
->seq
);
1954 /* Ignore urgent data that we've already seen and read. */
1955 if (after(tp
->copied_seq
, ptr
))
1958 /* Do we already have a newer (or duplicate) urgent pointer? */
1959 if (tp
->urg_data
&& !after(ptr
, tp
->urg_seq
))
1962 /* Tell the world about our new urgent pointer. */
1963 if (sk
->proc
!= 0) {
1965 kill_proc(sk
->proc
, SIGURG
, 1);
1967 kill_pg(-sk
->proc
, SIGURG
, 1);
1970 /* We may be adding urgent data when the last byte read was
1971 * urgent. To do this requires some care. We cannot just ignore
1972 * tp->copied_seq since we would read the last urgent byte again
1973 * as data, nor can we alter copied_seq until this data arrives
1974 * or we break the semantics of SIOCATMARK (and thus sockatmark())
1976 if (tp
->urg_seq
== tp
->copied_seq
)
1977 tp
->copied_seq
++; /* Move the copied sequence on correctly */
1978 tp
->urg_data
= URG_NOTYET
;
1981 /* Disable header prediction. */
1985 /* This is the 'fast' part of urgent handling. */
1986 static inline void tcp_urg(struct sock
*sk
, struct tcphdr
*th
, unsigned long len
)
1988 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1990 /* Check if we get a new urgent pointer - normally not. */
1992 tcp_check_urg(sk
,th
);
1994 /* Do we wait for any urgent data? - normally not... */
1995 if (tp
->urg_data
== URG_NOTYET
) {
/* Offset of the urgent byte from the start of this segment's header. */
1996 u32 ptr
= tp
->urg_seq
- ntohl(th
->seq
) + (th
->doff
*4);
1998 /* Is the urgent pointer pointing into this packet? */
/* Capture the out-of-band byte and wake the reader. */
2000 tp
->urg_data
= URG_VALID
| *(ptr
+ (unsigned char *) th
);
2002 sk
->data_ready(sk
,0);
2007 /* Clean the out_of_order queue if we can, trying to get
2008 * the socket within its memory limits again.
2010 * Return less than zero if we should start dropping frames
2011 * until the socket owning process reads some of the data
2012 * to stabilize the situation.
2014 static int prune_queue(struct sock
*sk
)
2016 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
2017 struct sk_buff
* skb
;
2019 SOCK_DEBUG(sk
, "prune_queue: c=%x\n", tp
->copied_seq
);
2021 net_statistics
.PruneCalled
++;
2023 /* First, purge the out_of_order queue. */
2024 skb
= __skb_dequeue_tail(&tp
->out_of_order_queue
);
2027 do { net_statistics
.OfoPruned
+= skb
->len
;
2029 skb
= __skb_dequeue_tail(&tp
->out_of_order_queue
);
2030 } while(skb
!= NULL
);
2032 /* Reset SACK state. A conforming SACK implementation will
2033 * do the same at a timeout based retransmit. When a connection
2034 * is in a sad state like this, we care only about integrity
2035 * of the connection not performance.
2041 /* If we are really being abused, tell the caller to silently
2042 * drop receive data on the floor. It will get retransmitted
2043 * and hopefully then we'll have sufficient space.
2045 * We used to try to purge the in-order packets too, but that
2046 * turns out to be deadly and fraught with races. Consider:
2048 * 1) If we acked the data, we absolutely cannot drop the
2049 * packet. This data would then never be retransmitted.
2050 * 2) It is possible, with a proper sequence of events involving
2051 * delayed acks and backlog queue handling, to have the user
2052 * read the data before it gets acked. The previous code
2053 * here got this wrong, and it lead to data corruption.
2054 * 3) Too much state changes happen when the FIN arrives, so once
2055 * we've seen that we can't remove any in-order data safely.
2057 * The net result is that removing in-order receive data is too
2058 * complex for anyone's sanity. So we don't do it anymore. But
2059 * if we are really having our buffer space abused we stop accepting
2062 * FIXME: it should recompute SACK state and only remove enough
2063 * buffers to get into bounds again. The current scheme loses
2064 * badly sometimes on links with large RTT, especially when
2065 * the driver has high overhead per skb.
2066 * (increasing the rcvbuf is not enough because it inflates the
2067 * the window too, disabling flow control effectively) -AK
/* Tolerate up to twice rcvbuf before refusing new data outright. */
2069 if(atomic_read(&sk
->rmem_alloc
) < (sk
->rcvbuf
<< 1))
2072 /* Massive buffer overcommit. */
2077 * TCP receive function for the ESTABLISHED state.
2079 * It is split into a fast path and a slow path. The fast path is
2081 * - A zero window was announced from us - zero window probing
2082 * is only handled properly in the slow path.
2083 * - Out of order segments arrived.
2084 * - Urgent data is expected.
2085 * - There is no buffer space left
2086 * - Unexpected TCP flags/window values/header lengths are received
2087 * (detected by checking the TCP header against pred_flags)
2088 * - Data is sent in both directions. Fast path only supports pure senders
2089 * or pure receivers (this means either the sequence number or the ack
2090 * value must stay constant)
2091 * - Unexpected TCP option.
2093 * When these conditions are not satisfied it drops into a standard
2094 * receive procedure patterned after RFC793 to handle all cases.
2095 * The first three cases are guaranteed by proper pred_flags setting,
2096 * the rest is checked inline. Fast processing is turned on in
2097 * tcp_data_queue when everything is OK.
2099 int tcp_rcv_established(struct sock
*sk
, struct sk_buff
*skb
,
2100 struct tcphdr
*th
, unsigned len
)
2102 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2105 * Header prediction.
2106 * The code loosely follows the one in the famous
2107 * "30 instruction TCP receive" Van Jacobson mail.
2109 * Van's trick is to deposit buffers into socket queue
2110 * on a device interrupt, to call tcp_recv function
2111 * on the receive process context and checksum and copy
2112 * the buffer to user space. smart...
2114 * Our current scheme is not silly either but we take the
2115 * extra cost of the net_bh soft interrupt processing...
2116 * We do checksum and copy also but from device to kernel.
2120 /* RED-PEN. Using static variables to pass function arguments
2121 * cannot be good idea...
2125 /* pred_flags is 0xS?10 << 16 + snd_wnd
2126 * if header_predition is to be made
2127 * 'S' will always be tp->tcp_header_len >> 2
2128 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
2129 * turn it off (when there are holes in the receive
2130 * space for instance)
2131 * PSH flag is ignored.
/* Fast path entry: flags/window match the prediction and the segment
 * is exactly the next one expected. */
2134 if ((tcp_flag_word(th
) & ~(TCP_RESERVED_BITS
|TCP_FLAG_PSH
)) == tp
->pred_flags
&&
2135 TCP_SKB_CB(skb
)->seq
== tp
->rcv_nxt
) {
2136 int tcp_header_len
= th
->doff
*4;
2138 /* Timestamp header prediction */
2140 /* Non-standard header f.e. SACKs -> slow path */
2141 if (tcp_header_len
!= tp
->tcp_header_len
)
2144 /* Check timestamp */
2145 if (tcp_header_len
== sizeof(struct tcphdr
) + TCPOLEN_TSTAMP_ALIGNED
) {
2146 __u32
*ptr
= (__u32
*)(th
+ 1);
2148 /* No? Slow path! */
2149 if (*ptr
!= __constant_ntohl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16)
2150 | (TCPOPT_TIMESTAMP
<< 8) | TCPOLEN_TIMESTAMP
))
2155 tp
->rcv_tsval
= ntohl(*ptr
);
2157 tp
->rcv_tsecr
= ntohl(*ptr
);
2159 /* If PAWS failed, check it more carefully in slow path */
2160 if ((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) < 0)
2163 /* Predicted packet is in window by definition.
2164 seq == rcv_nxt and last_ack_sent <= rcv_nxt.
2165 Hence, check seq<=last_ack_sent reduces to:
2167 if (tp
->rcv_nxt
== tp
->last_ack_sent
) {
2168 tp
->ts_recent
= tp
->rcv_tsval
;
2169 tp
->ts_recent_stamp
= xtime
.tv_sec
;
2173 if (len
<= tcp_header_len
) {
2174 /* Bulk data transfer: sender */
2175 if (len
== tcp_header_len
) {
2176 tcp_ack(sk
, th
, TCP_SKB_CB(skb
)->seq
,
2177 TCP_SKB_CB(skb
)->ack_seq
, len
);
2179 tcp_data_snd_check(sk
);
2181 } else { /* Header too small */
2182 tcp_statistics
.TcpInErrs
++;
2185 } else if (TCP_SKB_CB(skb
)->ack_seq
== tp
->snd_una
&&
2186 atomic_read(&sk
->rmem_alloc
) <= sk
->rcvbuf
) {
2187 /* Bulk data transfer: receiver */
2188 __skb_pull(skb
,tcp_header_len
);
2190 /* Is it possible to simplify this? */
2191 tcp_measure_rcv_mss(sk
, skb
);
2193 /* DO NOT notify forward progress here.
2194 * It saves dozen of CPU instructions in fast path. --ANK
2195 * And where is it signaled then ? -AK
2197 __skb_queue_tail(&sk
->receive_queue
, skb
);
2198 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->end_seq
;
2200 /* FIN bit check is not done since if FIN is set in
2201 * this frame, the pred_flags won't match up. -DaveM
2203 wake_up_interruptible(sk
->sleep
);
2204 sock_wake_async(sk
->socket
,1);
2205 tcp_delack_estimator(tp
);
2207 tcp_remember_ack(tp
, th
, skb
);
2209 __tcp_ack_snd_check(sk
, 0);
2212 /* Packet is in sequence, flags are trivial;
2213 * only ACK is strange or we are tough on memory.
2221 * RFC1323: H1. Apply PAWS check first.
2223 if (tcp_fast_parse_options(sk
, th
, tp
) && tp
->saw_tstamp
&&
2224 tcp_paws_discard(tp
, skb
)) {
2229 /* Resets are accepted even if PAWS failed.
2231 ts_recent update must be made after we are sure
2232 that the packet is in window.
2237 * Standard slow path.
2240 if (!tcp_sequence(tp
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
)) {
2241 /* RFC793, page 37: "In all states except SYN-SENT, all reset
2242 * (RST) segments are validated by checking their SEQ-fields."
2243 * And page 69: "If an incoming segment is not acceptable,
2244 * an acknowledgment should be sent in reply (unless the RST bit
2245 * is set, if so drop the segment and return)".
2249 if (after(TCP_SKB_CB(skb
)->seq
, tp
->rcv_nxt
)) {
2250 SOCK_DEBUG(sk
, "seq:%d end:%d wup:%d wnd:%d\n",
2251 TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
,
2252 tp
->rcv_wup
, tp
->rcv_wnd
);
2263 if (tp
->saw_tstamp
) {
2264 tcp_replace_ts_recent(sk
, tp
,
2265 TCP_SKB_CB(skb
)->seq
);
/* SYN in window that is not our own old SYN: protocol error. */
2268 if(th
->syn
&& TCP_SKB_CB(skb
)->seq
!= tp
->syn_seq
) {
2269 SOCK_DEBUG(sk
, "syn in established state\n");
2270 tcp_statistics
.TcpInErrs
++;
2277 tcp_ack(sk
, th
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->ack_seq
, len
);
2279 /* Process urgent data. */
2280 tcp_urg(sk
, th
, len
);
2283 /* step 7: process the segment text */
2284 int queued
= tcp_data(skb
, sk
, len
);
2286 tcp_measure_rcv_mss(sk
, skb
);
2288 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
2289 if(sk
->state
!= TCP_CLOSE
) {
2290 tcp_data_snd_check(sk
);
2291 tcp_ack_snd_check(sk
);
2304 /* This is not only more efficient than what we used to do, it eliminates
2305 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
2307 * Actually, we could lots of memory writes here. tp of listening
2308 * socket contains all necessary default parameters.
/*
 * tcp_create_openreq_child: build the child socket for a connection
 * graduating from an open_request (embryonic connection) to SYN_RECV.
 * The child starts as a byte copy of the listening socket; every field
 * that must not be shared (queues, timers, counters, TCP state) is then
 * re-initialized from the open_request.
 *
 * NOTE(review): this excerpt is incomplete -- the allocation-failure
 * guard ("if (newsk != NULL) {"), the "#endif" / "} else {" lines, the
 * closing braces and the final "return newsk;" are not visible here, so
 * the text below does not brace-balance.  Only comments were added;
 * all code tokens are exactly as in the excerpt.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
	/* GFP_ATOMIC: we run in receive/softirq context and may not sleep. */
	struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
	struct tcp_opt *newtp;
#ifdef CONFIG_FILTER
	struct sk_filter *filter;

	/* Start from a byte copy of the listener, then fix up everything
	 * that must be private to the child.
	 */
	memcpy(newsk, sk, sizeof(*newsk));
	newsk->state = TCP_SYN_RECV;

	/* Child is not yet on any hash chain. */
	newsk->pprev = NULL;

	/* Clone the TCP header template */
	newsk->dport = req->rmt_port;

	sock_lock_init(newsk);

	/* Fresh buffer accounting and private queues: the memcpy above
	 * copied the listener's counters and queue heads, which must not
	 * be inherited.
	 */
	atomic_set(&newsk->rmem_alloc, 0);
	skb_queue_head_init(&newsk->receive_queue);
	atomic_set(&newsk->wmem_alloc, 0);
	skb_queue_head_init(&newsk->write_queue);
	atomic_set(&newsk->omem_alloc, 0);

	newsk->backlog.head = newsk->backlog.tail = NULL;
	skb_queue_head_init(&newsk->error_queue);
	newsk->write_space = tcp_write_space;

#ifdef CONFIG_FILTER
	/* The copied socket-filter reference needs its own charge. */
	if ((filter = newsk->filter) != NULL)
		sk_filter_charge(newsk, filter);

	/* Now setup tcp_opt */
	newtp = &(newsk->tp_pinfo.af_tcp);
	newtp->pred_flags = 0;	/* no header prediction until ESTABLISHED */

	/* Sequence spaces: the peer's SYN (rcv_isn) and our SYN (snt_isn)
	 * have each consumed one sequence number.
	 */
	newtp->rcv_nxt = req->rcv_isn + 1;
	newtp->snd_nxt = req->snt_isn + 1;
	newtp->snd_una = req->snt_isn + 1;

	newtp->snd_wl1 = req->rcv_isn;
	newtp->snd_wl2 = req->snt_isn;

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	newtp->snd_wnd = ntohs(skb->h.th->window);
	newtp->max_window = newtp->snd_wnd;

	newtp->retransmits = 0;
	newtp->last_ack_sent = req->rcv_isn + 1;
	newtp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	newtp->snd_cwnd = 2;

	newtp->rto = TCP_TIMEOUT_INIT;
	newtp->packets_out = 0;
	newtp->fackets_out = 0;
	newtp->retrans_out = 0;
	newtp->high_seq = 0;
	newtp->snd_ssthresh = 0x7fffffff;	/* "infinite" ssthresh until first loss */
	newtp->snd_cwnd_cnt = 0;
	newtp->dup_acks = 0;
	newtp->delayed_acks = 0;

	/* Private timers; .data carries the child socket pointer. */
	init_timer(&newtp->retransmit_timer);
	newtp->retransmit_timer.function = &tcp_retransmit_timer;
	newtp->retransmit_timer.data = (unsigned long) newsk;
	init_timer(&newtp->delack_timer);
	newtp->delack_timer.function = &tcp_delack_timer;
	newtp->delack_timer.data = (unsigned long) newsk;

	skb_queue_head_init(&newtp->out_of_order_queue);
	newtp->send_head = newtp->retrans_head = NULL;
	newtp->rcv_wup = req->rcv_isn + 1;
	newtp->write_seq = req->snt_isn + 1;
	newtp->copied_seq = req->rcv_isn + 1;

	newtp->saw_tstamp = 0;

	init_timer(&newtp->probe_timer);
	newtp->probe_timer.function = &tcp_probe_timer;
	newtp->probe_timer.data = (unsigned long) newsk;
	newtp->probes_out = 0;
	newtp->syn_seq = req->rcv_isn;
	newtp->fin_seq = req->rcv_isn;
	newtp->urg_data = 0;
	tcp_synq_init(newtp);	/* the child gets an empty SYN queue of its own */
	newtp->syn_backlog = 0;
	/* Seed receive-MSS measurement from the SYN's size when plausible. */
	if (skb->len >= 536)
		newtp->last_seg_size = skb->len;

	/* Back to base struct sock members. */
	newsk->ack_backlog = 0;
	newsk->max_ack_backlog = SOMAXCONN;
	newsk->priority = 0;
	atomic_set(&newsk->refcnt, 1);
	atomic_inc(&inet_sock_nr);

	/* NOTE(review): this initializes the LISTENER's timer_lock, not
	 * the child's -- looks like it should be &newsk->timer_lock;
	 * verify against upstream before changing.
	 */
	spin_lock_init(&sk->timer_lock);
	init_timer(&newsk->timer);
	newsk->timer.function = &tcp_keepalive_timer;
	newsk->timer.data = (unsigned long) newsk;
	if (newsk->keepopen)
		tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
	/* Not attached to any fd / wait queue yet; accept() will do that. */
	newsk->socket = NULL;
	newsk->sleep = NULL;

	/* Options negotiated on the SYN, recorded in the open_request. */
	newtp->tstamp_ok = req->tstamp_ok;
	if((newtp->sack_ok = req->sack_ok) != 0)
		newtp->num_sacks = 0;
	newtp->window_clamp = req->window_clamp;
	newtp->rcv_wnd = req->rcv_wnd;
	newtp->wscale_ok = req->wscale_ok;
	if (newtp->wscale_ok) {
		newtp->snd_wscale = req->snd_wscale;
		newtp->rcv_wscale = req->rcv_wscale;
	/* (else branch -- the "} else {" line is not visible in this excerpt:
	 * no scaling, clamp window to the 16-bit header field maximum)
	 */
		newtp->snd_wscale = newtp->rcv_wscale = 0;
		newtp->window_clamp = min(newtp->window_clamp,65535);
	if (newtp->tstamp_ok) {
		newtp->ts_recent = req->ts_recent;
		newtp->ts_recent_stamp = xtime.tv_sec;
		/* Timestamp option costs TCPOLEN_TSTAMP_ALIGNED bytes per segment. */
		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
	/* (else branch -- "} else {" not visible in this excerpt) */
		newtp->ts_recent_stamp = 0;
		newtp->tcp_header_len = sizeof(struct tcphdr);
	newtp->mss_clamp = req->mss;
2456 static __inline__
int tcp_in_window(u32 seq
, u32 end_seq
, u32 s_win
, u32 e_win
)
2460 if (after(end_seq
, s_win
) && before(seq
, e_win
))
2462 return (seq
== e_win
&& seq
== end_seq
);
2467 * Process an incoming packet for SYN_RECV sockets represented
2468 * as an open_request.
/*
 * tcp_check_req: process a segment arriving for an embryonic (SYN_RECV)
 * connection still represented by an open_request hanging off a
 * listening socket.  A valid handshake-completing ACK creates and
 * returns the child socket; retransmitted SYNs are answered with a
 * retransmitted SYN-ACK; out-of-window segments are ACKed and dropped;
 * RST/bad-SYN segments tear the open_request down (embryonic reset).
 *
 * NOTE(review): this excerpt is incomplete -- the declaration of "ttp"
 * (a local struct tcp_opt used for option parsing), several closing
 * braces, return statements and the "embryonic_reset:" label are not
 * visible, so the text below does not brace-balance.  Only comments
 * were added; code tokens are untouched.
 */
struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
			   struct open_request *req,
			   struct open_request *prev)
	struct tcphdr *th = skb->h.th;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	/* Only the three flags that matter for SYN-RECV processing. */
	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	int paws_reject = 0;

	/* If socket has already been created, process the packet in its
	 * context.  We fall here only due to race, when packets were
	 * enqueued to backlog of listening socket.
	 */

	/* Parse TCP options only when the header is long enough to
	 * actually carry some (doff beyond the bare header).
	 */
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
		tcp_parse_options(NULL, th, &ttp, 0);

		/* PAWS: reject when the segment's timestamp went
		 * backwards relative to what this open_request last saw.
		 */
		paws_reject = ttp.saw_tstamp &&
			(s32)(ttp.rcv_tsval - req->ts_recent) < 0;

	/* Check for pure retransmited SYN. */
	if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 * describe SYN-RECV state.  All the description is
		 * wrong; we cannot believe it and should rely only on
		 * common sense and implementation experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122: retransmit our SYN-ACK.
		 */
		req->class->rtx_syn_ack(sk, req);

	/* What follows reproduces the "SEGMENT ARRIVES" section for
	 * state SYN-RECEIVED of RFC793.  All checks must be made before
	 * any attempt to create the full socket.
	 */

	/* RFC793: "first check sequence number". */
	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST))
			req->class->send_ack(skb, req);

	/* In sequence, PAWS is OK: advance the remembered timestamp. */
	if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
		req->ts_recent = ttp.rcv_tsval;

	if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at req->rcv_isn+1. */
		flg &= ~TCP_FLAG_SYN;

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
		goto embryonic_reset;

	/* RFC793: "fifth check the ACK field" */
	if (!(flg & TCP_FLAG_ACK))

	/* Invalid ACK: reset will be sent by listening socket */
	if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)

	/* OK, ACK is valid, create big socket and feed this segment to
	 * it.  It will repeat all the tests.  THIS SEGMENT MUST MOVE
	 * SOCKET TO ESTABLISHED STATE.  If it will be dropped after
	 * socket is created, wait for troubles.
	 */
	sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);

	tcp_dec_slow_timer(TCP_SLT_SYNACK);

	/* (embryonic_reset path -- label line not visible in excerpt:
	 * tear down the open_request, optionally answering with RST)
	 */
	tcp_synq_unlink(tp, req, prev);
	tcp_dec_slow_timer(TCP_SLT_SYNACK);

	net_statistics.EmbryonicRsts++;
	/* Do not send a reset in reply to a reset. */
	if (!(flg & TCP_FLAG_RST))
		req->class->send_reset(skb);

	req->class->destructor(req);
	tcp_openreq_free(req);
/*
 * tcp_rcv_synsent_state_process: handle an incoming segment while this
 * socket is in SYN-SENT (RFC793 "SEGMENT ARRIVES", SYN-SENT case).
 * A valid SYN|ACK moves the socket to ESTABLISHED; a bare SYN
 * (simultaneous open) moves it to SYN_RECV and answers with a SYN-ACK;
 * everything else is dropped.
 *
 * NOTE(review): this excerpt is incomplete -- several enclosing
 * conditionals (e.g. "if (th->ack)", "if (th->syn)"), braces,
 * "} else {" lines and returns are not visible, so the text below
 * does not brace-balance.  Only comments were added; code tokens are
 * exactly as in the excerpt.
 */
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
					 struct tcphdr *th, unsigned len)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tcp_parse_options(sk, th, tp, 0);

#ifdef CONFIG_TCP_TW_RECYCLE
	/* PAWS against ts_recent remembered from a recycled TIME-WAIT
	 * bucket: catches old duplicates from the previous incarnation
	 * of this connection.
	 */
	if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst &&
	    (s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
	    xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) {
		/* Old duplicate segment.  We remember last ts_recent
		 * from this host in the timewait bucket.
		 *
		 * Actually, we could implement a per-host cache to
		 * truncate timewait state after RTO.  Paranoidal
		 * arguments of rfc1337 are not enough to close this
		 * nice possibility.
		 */
		if (net_ratelimit())
			printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n");

	/* RFC793: "If the state is SYN-SENT then first check the ACK
	 * bit: If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset
	 * (unless the RST bit is set, if so drop the segment and
	 * return)".
	 *
	 * This check differs from the ESTABLISHED-state one
	 * (SND.UNA <= SEG.ACK <= SND.NXT): SEG.ACK == SND.UNA == ISS is
	 * invalid in SYN-SENT because no data precedes our SYN.  Since
	 * we send no data with the SYN, the RFC-correct acceptable ACK
	 * is exactly SND.NXT.  (The enclosing "if (th->ack)" and the
	 * reset path are not visible in this excerpt.)
	 */
		TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)

	/* Now ACK is acceptable.
	 *
	 * "If the RST bit is set
	 *    If the ACK was acceptable then signal the user "error:
	 *    connection reset", drop the segment, enter CLOSED state,
	 *    delete TCB, and return."
	 *
	 * "fifth, if neither of the SYN or RST bits is set then drop
	 * the segment and return."
	 *
	 * "If the SYN bit is on ... are acceptable then ... (our SYN
	 * has been ACKed), change the connection state to
	 * ESTABLISHED..."  I.e. SYN-less ACKs in SYN-SENT would be
	 * completely ignored -- but bare ACK is in fact valid; RFC793
	 * requires sending such an ACK in reply to any out-of-window
	 * packet.  (History: tcp_ack once advanced snd_una and canceled
	 * the retransmit timer on a bare ACK here, wedging the socket
	 * in a semi-SYN-SENT state; some stacks really send such ACKs.)
	 */
		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
		tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
			TCP_SKB_CB(skb)->ack_seq, len);

		/* Ok.. it's good.  Set up sequence numbers and
		 * move to established.
		 */
		tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
		tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;

		/* RFC1323: The window in SYN & SYN/ACK segments is
		 * never scaled.
		 * NOTE(review): htons() here performs a network->host
		 * conversion only because htons and ntohs are the same
		 * byte swap; ntohs would be the semantically correct
		 * call (compare tcp_create_openreq_child).  Verify
		 * against upstream before changing.
		 */
		tp->snd_wnd = htons(th->window);
		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
		tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
		tp->fin_seq = TCP_SKB_CB(skb)->seq;

		tcp_set_state(sk, TCP_ESTABLISHED);

		/* No window scaling negotiated: clamp to the 16-bit
		 * header window field maximum.
		 */
		if (tp->wscale_ok == 0) {
			tp->snd_wscale = tp->rcv_wscale = 0;
			tp->window_clamp = min(tp->window_clamp,65535);

		/* Header length depends on whether timestamps were
		 * negotiated.
		 */
		if (tp->tstamp_ok) {
			tp->tcp_header_len =
				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		/* ("} else {" not visible in this excerpt) */
			tp->tcp_header_len = sizeof(struct tcphdr);
		if (tp->saw_tstamp) {
			tp->ts_recent = tp->rcv_tsval;
			tp->ts_recent_stamp = xtime.tv_sec;
		tcp_sync_mss(sk, tp->pmtu_cookie);
		tcp_initialize_rcv_mss(sk);
		tcp_init_metrics(sk);

		if (tp->write_pending) {
			/* Save one ACK.  Data will be ready after
			 * several ticks, if write_pending is set.
			 *
			 * How to make this correctly?
			 */
			tcp_send_delayed_ack(sk, tp->rto);

		tp->copied_seq = tp->rcv_nxt;

		/* Wake the process blocked in connect(). */
		wake_up_interruptible(sk->sleep);
		sock_wake_async(sk->socket, 0);

	/* No ACK in the segment */

	/* RFC793: "If the RST bit is set
	 *	Otherwise (no ACK) drop the segment and return."
	 */

	/* We see SYN without ACK.  It is attempt of simultaneous
	 * connect with crossed SYNs.
	 *
	 * The previous version of the code checked for "connecting to
	 * self" here; that check is done elsewhere now.
	 * RED-PEN: BTW, it does not. 8)
	 */
		tcp_set_state(sk, TCP_SYN_RECV);
		if (tp->saw_tstamp) {
			tp->ts_recent = tp->rcv_tsval;
			tp->ts_recent_stamp = xtime.tv_sec;

		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

		/* RFC1323: The window in SYN & SYN/ACK segments is
		 * never scaled (same htons/ntohs NOTE(review) as above).
		 */
		tp->snd_wnd = htons(th->window);
		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;

		tcp_sync_mss(sk, tp->pmtu_cookie);
		tcp_initialize_rcv_mss(sk);

		/* Answer the crossed SYN with our SYN-ACK. */
		tcp_send_synack(sk);

	/* Note, we could accept data and URG from this segment.
	 * There are no obstacles to make this.
	 *
	 * However, if we ignore data in ACKless segments sometimes,
	 * we have no reasons to accept it sometimes.
	 * Also, seems the code doing it in step6 of
	 * tcp_rcv_state_process is not flawless.  So, discard packet
	 * for sanity.  Uncomment this return to process the data.
	 */

	/* "fifth, if neither of the SYN or RST bits is set then
	 * drop the segment and return."
	 */
2834 * This function implements the receiving procedure of RFC 793 for
2835 * all states except ESTABLISHED and TIME_WAIT.
2836 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
2837 * address independent.
2840 int tcp_rcv_state_process(struct sock
*sk
, struct sk_buff
*skb
,
2841 struct tcphdr
*th
, unsigned len
)
2843 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2848 switch (sk
->state
) {
2850 /* When state == CLOSED, hash lookup always fails.
2852 * But, there is a back door, the backlog queue.
2853 * If we have a sequence of packets in the backlog
2854 * during __release_sock() which have a sequence such
2856 * packet X causes entry to TCP_CLOSE state
2858 * packet X + N has FIN bit set
2860 * We report a (luckily) harmless error in this case.
2861 * The issue is that backlog queue processing bypasses
2862 * any hash lookups (we know which socket packets are for).
2863 * The correct behavior here is what 2.0.x did, since
2864 * a TCP_CLOSE socket does not exist. Drop the frame
2865 * and send a RST back to the other end.
2868 /* 1. The socket may be moved to TIME-WAIT state.
2869 2. While this socket was locked, another socket
2870 with the same identity could be created.
2873 CONCLUSION: discard and only discard!
2875 Alternative would be relookup and recurse into tcp_v?_rcv
2876 (not *_do_rcv) to work with timewait and listen states
2886 if(tp
->af_specific
->conn_request(sk
, skb
) < 0)
2889 /* Now we have several options: In theory there is
2890 * nothing else in the frame. KA9Q has an option to
2891 * send data with the syn, BSD accepts data with the
2892 * syn up to the [to be] advertised window and
2893 * Solaris 2.1 gives you a protocol error. For now
2894 * we just ignore it, that fits the spec precisely
2895 * and avoids incompatibilities. It would be nice in
2896 * future to drop through and process the data.
2898 * Now that TTCP is starting to be used we ought to
2900 * But, this leaves one open to an easy denial of
2901 * service attack, and SYN cookies can't defend
2902 * against this problem. So, we drop the data
2903 * in the interest of security over speed.
2910 queued
= tcp_rcv_synsent_state_process(sk
, skb
, th
, len
);
2917 /* Parse the tcp_options present on this header.
2918 * By this point we really only expect timestamps.
2919 * Note that this really has to be here and not later for PAWS
2920 * (RFC1323) to work.
2922 if (tcp_fast_parse_options(sk
, th
, tp
) && tp
->saw_tstamp
&&
2923 tcp_paws_discard(tp
, skb
)) {
2928 /* Reset is accepted even if it did not pass PAWS. */
2931 /* The silly FIN test here is necessary to see an advancing ACK in
2932 * retransmitted FIN frames properly. Consider the following sequence:
2934 * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
2935 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
2936 * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
2937 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
2939 * At this point the connection will deadlock with host1 believing
2940 * that his FIN is never ACK'd, and thus it will retransmit it's FIN
2941 * forever. The following fix is from Taral (taral@taral.net).
2943 * RED-PEN. Seems, the above is not true.
2944 * If at least one end is RFC compliant, it will send ACK to
2945 * out of window FIN and, hence, move peer to TIME-WAIT.
2946 * I comment out this line. --ANK
2948 * RED-PEN. DANGER! tcp_sequence check rejects also SYN-ACKs
2949 * received in SYN-RECV. The problem is that description of
2950 * segment processing in SYN-RECV state in RFC793 is WRONG.
2951 * Correct check would accept ACK from this SYN-ACK, see
2952 * figures 6 and 8 (fixed by RFC1122). Compare this
2953 * to problem with FIN, they smell similarly. --ANK
2956 /* step 1: check sequence number */
2957 if (!tcp_sequence(tp
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
)
2959 && !(th
->fin
&& TCP_SKB_CB(skb
)->end_seq
== tp
->rcv_nxt
)
2968 /* step 2: check RST bit */
2974 if (tp
->saw_tstamp
) {
2975 tcp_replace_ts_recent(sk
, tp
,
2976 TCP_SKB_CB(skb
)->seq
);
2979 /* step 3: check security and precedence [ignored] */
2983 * Check for a SYN, and ensure it matches the SYN we were
2984 * first sent. We have to handle the rather unusual (but valid)
2985 * sequence that KA9Q derived products may generate of
2990 * SYN|ACK Data + More Data
2991 * .. we must ACK not RST...
2993 * We keep syn_seq as the sequence space occupied by the
2997 if (th
->syn
&& TCP_SKB_CB(skb
)->seq
!= tp
->syn_seq
) {
3002 /* step 5: check the ACK field */
3004 int acceptable
= tcp_ack(sk
, th
, TCP_SKB_CB(skb
)->seq
,
3005 TCP_SKB_CB(skb
)->ack_seq
, len
);
3010 tcp_set_state(sk
, TCP_ESTABLISHED
);
3011 tp
->copied_seq
= tp
->rcv_nxt
;
3013 /* Note, that this wakeup is only for marginal
3014 crossed SYN case. Passively open sockets
3015 are not waked up, because sk->sleep == NULL
3016 and sk->socket == NULL.
3018 if (!sk
->dead
&& sk
->sleep
) {
3019 wake_up_interruptible(sk
->sleep
);
3020 sock_wake_async(sk
->socket
, 1);
3023 tp
->snd_una
= TCP_SKB_CB(skb
)->ack_seq
;
3024 tp
->snd_wnd
= htons(th
->window
) << tp
->snd_wscale
;
3025 tp
->snd_wl1
= TCP_SKB_CB(skb
)->seq
;
3026 tp
->snd_wl2
= TCP_SKB_CB(skb
)->ack_seq
;
3028 /* tcp_ack considers this ACK as duplicate
3029 * and does not calculate rtt. It is wrong.
3030 * Fix it at least with timestamps.
3032 if (tp
->saw_tstamp
&& !tp
->srtt
)
3033 tcp_ack_saw_tstamp(sk
, tp
, 0, 0, FLAG_SYN_ACKED
);
3035 tcp_init_metrics(sk
);
3037 SOCK_DEBUG(sk
, "bad ack\n");
3043 if (tp
->snd_una
== tp
->write_seq
) {
3044 sk
->shutdown
|= SEND_SHUTDOWN
;
3045 tcp_set_state(sk
, TCP_FIN_WAIT2
);
3047 sk
->state_change(sk
);
3049 tcp_reset_keepalive_timer(sk
, sysctl_tcp_fin_timeout
);
3050 dst_confirm(sk
->dst_cache
);
3055 if (tp
->snd_una
== tp
->write_seq
) {
3062 if (tp
->snd_una
== tp
->write_seq
) {
3063 tcp_set_state(sk
,TCP_CLOSE
);
3064 tcp_update_metrics(sk
);
3074 /* step 6: check the URG bit */
3075 tcp_urg(sk
, th
, len
);
3077 /* step 7: process the segment text */
3078 switch (sk
->state
) {
3079 case TCP_CLOSE_WAIT
:
3081 if (!before(TCP_SKB_CB(skb
)->seq
, tp
->fin_seq
))
3086 /* RFC 793 says to queue data in these states,
3087 * RFC 1122 says we MUST send a reset.
3088 * BSD 4.4 also does reset.
3090 if ((sk
->shutdown
& RCV_SHUTDOWN
) && sk
->dead
) {
3091 if (after(TCP_SKB_CB(skb
)->end_seq
- th
->fin
, tp
->rcv_nxt
)) {
3097 case TCP_ESTABLISHED
:
3098 queued
= tcp_data(skb
, sk
, len
);
3100 /* This must be after tcp_data() does the skb_pull() to
3101 * remove the header size from skb->len.
3103 tcp_measure_rcv_mss(sk
, skb
);
3107 /* tcp_data could move socket to TIME-WAIT */
3108 if (sk
->state
!= TCP_CLOSE
) {
3109 tcp_data_snd_check(sk
);
3110 tcp_ack_snd_check(sk
);