2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.193 2000/04/20 14:41:16 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
25 * Pedro Roque : Fast Retransmit/Recovery.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
60 #include <linux/config.h>
62 #include <linux/sysctl.h>
64 #include <net/inet_common.h>
65 #include <linux/ipsec.h>
68 #define SYNC_INIT 0 /* let the user enable it */
73 /* These are on by default so the code paths get tested.
74 * For the final 2.2 this may be undone at our discretion. -DaveM
76 int sysctl_tcp_timestamps
= 1;
77 int sysctl_tcp_window_scaling
= 1;
78 int sysctl_tcp_sack
= 1;
80 int sysctl_tcp_syncookies
= SYNC_INIT
;
81 int sysctl_tcp_stdurg
;
82 int sysctl_tcp_rfc1337
;
83 int sysctl_tcp_tw_recycle
= 1;
84 int sysctl_tcp_abort_on_overflow
= 0;
85 int sysctl_tcp_max_orphans
= NR_FILE
;
86 int sysctl_tcp_max_tw_buckets
= NR_FILE
*2;
88 static int prune_queue(struct sock
*sk
);
91 * Adapt the MSS value used to make delayed ack decision to the
94 * The constant 536 hasn't any good meaning. In IPv4 world
95 * MTU may be smaller, though it contradicts to RFC1122, which
96 * states that MSS must be at least 536.
97 * We use the constant to do not ACK each second
98 * packet in a stream of tiny size packets.
99 * It means that super-low mtu links will be aggressively delacked.
100 * Seems, it is even good. If they have so low mtu, they are weirdly
103 * AK: BTW it may be useful to add an option to lock the rcv_mss.
104 * this way the beowulf people wouldn't need ugly patches to get the
105 * ack frequencies they want and it would be an elegant way to tune delack.
107 static __inline__
void tcp_measure_rcv_mss(struct tcp_opt
*tp
, struct sk_buff
*skb
)
109 unsigned int len
, lss
;
111 lss
= tp
->ack
.last_seg_size
;
112 tp
->ack
.last_seg_size
= 0;
114 /* skb->len may jitter because of SACKs, even if peer
115 * sends good full-sized frames.
118 if (len
>= tp
->ack
.rcv_mss
) {
119 tp
->ack
.rcv_mss
= len
;
121 /* Otherwise, we make more careful check taking into account,
122 * that SACKs block is variable.
124 * "len" is invariant segment length, including TCP header.
126 len
= skb
->tail
- skb
->h
.raw
;
127 if (len
>= TCP_MIN_RCVMSS
+ sizeof(struct tcphdr
)) {
128 /* Subtract also invariant (if peer is RFC compliant),
129 * tcp header plus fixed timestamp option length.
130 * Resulting "len" is MSS free of SACK jitter.
132 len
-= tp
->tcp_header_len
;
134 tp
->ack
.rcv_mss
= len
;
135 tp
->ack
.last_seg_size
= len
;
141 static __inline__
void tcp_enter_quickack_mode(struct tcp_opt
*tp
)
143 unsigned quickacks
= tcp_receive_window(tp
)/(2*tp
->ack
.rcv_mss
);
145 tp
->ack
.quick
= max(min(quickacks
, 127), 1);
147 if (!tp
->tstamp_ok
&& tp
->ack
.quick
>2) {
148 /* Quick ACKs are _dangerous_, if RTTM is not used.
149 * See comment in tcp_init_metrics(). We still help
150 * them to overcome the most difficult, initial
151 * phase of slow start.
157 /* Send ACKs quickly, if "quick" count is not exhausted
158 * and the session is not interactive.
/* True when ACKs should go out immediately: quick-ACK credit is left
 * and the session is not interactive (pingpong off).
 * NOTE(review): listing truncated -- the body braces (original lines
 * 162/164) are missing from this extract.
 */
161 static __inline__
int tcp_in_quickack_mode(struct tcp_opt
*tp
)
163 return (tp
->ack
.quick
&& !tp
->ack
.pingpong
);
166 /* There is something which you must keep in mind when you analyze the
167 * behavior of the tp->ato delayed ack timeout interval. When a
168 * connection starts up, we want to ack as quickly as possible. The
169 * problem is that "good" TCP's do slow start at the beginning of data
170 * transmission. The means that until we send the first few ACK's the
171 * sender will sit on his end and only queue most of his data, because
172 * he can only send snd_cwnd unacked packets at any given time. For
173 * each ACK we send, he increments snd_cwnd and transmits more of his
/* Per-data-segment receive bookkeeping: updates the measured peer MSS,
 * maintains the delayed-ACK state (quick-ACK mode, pingpong flag, the
 * ato interval) and records the receive timestamp in ack.lrcvtime.
 * NOTE(review): this extract is a corrupted numbered listing -- several
 * original lines (e.g. 177-179, 181-184, 186-187, 190-191, 194,
 * 197-198, 200, 202, 207-208, 211, 213-216, 219-220, 222-223) are
 * missing, so the exact branch structure cannot be confirmed here.
 */
176 static void tcp_event_data_recv(struct tcp_opt
*tp
, struct sk_buff
*skb
)
180 tcp_measure_rcv_mss(tp
, skb
);
185 now
= tcp_time_stamp
;
188 /* The _first_ data packet received, initialize
189 * delayed ACK engine.
192 /* Help sender leave slow start quickly. */
193 tcp_enter_quickack_mode(tp
);
195 /* Pingpong is off, session is not interactive by default */
196 tp
->ack
.pingpong
= 0;
199 tp
->ack
.ato
= TCP_ATO_MIN
;
201 int m
= now
- tp
->ack
.lrcvtime
;
203 if (m
> TCP_ATO_MAX
/2) {
204 /* Do not touch ATO, if interval is out of bounds.
205 * It will be deflated by delack timer, if our peer
206 * really sends too rarely.
209 /* Too long gap. Apparently sender failed to
210 * restart window, so that we send ACKs quickly.
212 tcp_enter_quickack_mode(tp
);
217 if (m
<= tp
->ack
.ato
)
218 tp
->ack
.ato
= (tp
->ack
.ato
>> 1) + m
;
221 tp
->ack
.lrcvtime
= now
;
224 /* Called to compute a smoothed rtt estimate. The data fed to this
225 * routine either comes from timestamps, or from segments that were
226 * known _not_ to have been retransmitted [see Karn/Partridge
227 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
228 * piece by Van Jacobson.
229 * NOTE: the next three routines used to be one big routine.
230 * To save cycles in the RFC 1323 implementation it was better to break
231 * it up into three procedures. -- erics
234 static __inline__
void tcp_rtt_estimator(struct tcp_opt
*tp
, __u32 mrtt
)
236 long m
= mrtt
; /* RTT */
238 /* The following amusing code comes from Jacobson's
239 * article in SIGCOMM '88. Note that rtt and mdev
240 * are scaled versions of rtt and mean deviation.
241 * This is designed to be as fast as possible
242 * m stands for "measurement".
244 * On a 1990 paper the rto value is changed to:
245 * RTO = rtt + 4 * mdev
250 m
-= (tp
->srtt
>> 3); /* m is now error in rtt est */
251 tp
->srtt
+= m
; /* rtt = 7/8 rtt + 1/8 new */
253 m
= -m
; /* m is now abs(error) */
254 m
-= (tp
->mdev
>> 2); /* similar update on mdev */
255 tp
->mdev
+= m
; /* mdev = 3/4 mdev + 1/4 new */
257 /* no previous measure. */
258 tp
->srtt
= m
<<3; /* take the measured time to be rtt */
259 tp
->mdev
= m
<<2; /* make sure rto = 3*rtt */
263 /* Calculate rto without backoff. This is the second half of Van Jacobson's
264 * routine referred to above.
/* Compute the retransmission timeout from the smoothed estimates:
 * rto = srtt/8 + mdev (second half of Van Jacobson's SIGCOMM '88
 * routine, per the comment block above), then inflated by a
 * cwnd-dependent term the original author already flags as suspect.
 * NOTE(review): listing truncated -- original lines 268, 272 and the
 * closing brace are missing from this extract.
 */
267 static __inline__
void tcp_set_rto(struct tcp_opt
*tp
)
269 tp
->rto
= (tp
->srtt
>> 3) + tp
->mdev
;
270 /* I am not enough educated to understand this magic.
271 * However, it smells bad. snd_cwnd>31 is common case.
273 tp
->rto
+= (tp
->rto
>> 2) + (tp
->rto
>> (tp
->snd_cwnd
-1));
277 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
278 * on packet lifetime in the internet. We need the HZ/5 lower
279 * bound to behave correctly against BSD stacks with a fixed
281 * FIXME: It's not entirely clear this lower bound is the best
282 * way to avoid the problem. Is it possible to drop the lower
283 * bound and still avoid trouble with BSD stacks? Perhaps
284 * some modification to the RTO calculation that takes delayed
285 * ack bias into account? This needs serious thought. -- erics
/* Clamp the computed rto into [TCP_RTO_MIN, TCP_RTO_MAX]; see the
 * rationale in the comment block above (BSD-interop lower bound,
 * packet-lifetime upper bound).
 * NOTE(review): listing truncated -- the braces (original lines
 * 288/293) are missing from this extract.
 */
287 static __inline__
void tcp_bound_rto(struct tcp_opt
*tp
)
289 if (tp
->rto
< TCP_RTO_MIN
)
290 tp
->rto
= TCP_RTO_MIN
;
291 else if (tp
->rto
> TCP_RTO_MAX
)
292 tp
->rto
= TCP_RTO_MAX
;
295 /* Save metrics learned by this TCP session.
296 This function is called only when TCP finishes successfully,
297 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
299 static void tcp_update_metrics(struct sock
*sk
)
301 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
302 struct dst_entry
*dst
= __sk_dst_get(sk
);
306 if (dst
&& (dst
->flags
&DST_HOST
)) {
309 if (tp
->backoff
|| !tp
->srtt
) {
310 /* This session failed to estimate rtt. Why?
311 * Probably, no packets returned in time.
314 if (!(dst
->mxlock
&(1<<RTAX_RTT
)))
319 m
= dst
->rtt
- tp
->srtt
;
321 /* If newly calculated rtt larger than stored one,
322 * store new one. Otherwise, use EWMA. Remember,
323 * rtt overestimation is always better than underestimation.
325 if (!(dst
->mxlock
&(1<<RTAX_RTT
))) {
332 if (!(dst
->mxlock
&(1<<RTAX_RTTVAR
))) {
336 /* Scale deviation to rttvar fixed point */
341 if (m
>= dst
->rttvar
)
344 dst
->rttvar
-= (dst
->rttvar
- m
)>>2;
347 if (tp
->snd_ssthresh
== 0x7FFFFFFF) {
348 /* Slow start still did not finish. */
350 !(dst
->mxlock
&(1<<RTAX_SSTHRESH
)) &&
351 tp
->snd_cwnd
> dst
->ssthresh
)
352 dst
->ssthresh
= tp
->snd_cwnd
;
353 if (!(dst
->mxlock
&(1<<RTAX_CWND
)) &&
354 tp
->snd_cwnd
> dst
->cwnd
)
355 dst
->cwnd
= tp
->snd_cwnd
;
356 } else if (tp
->snd_cwnd
>= tp
->snd_ssthresh
&& !tp
->high_seq
) {
357 /* Cong. avoidance phase, cwnd is reliable. */
358 if (!(dst
->mxlock
&(1<<RTAX_SSTHRESH
)))
359 dst
->ssthresh
= tp
->snd_cwnd
;
360 if (!(dst
->mxlock
&(1<<RTAX_CWND
)))
361 dst
->cwnd
= (dst
->cwnd
+ tp
->snd_cwnd
)>>1;
363 /* Else slow start did not finish, cwnd is non-sense,
364 ssthresh may be also invalid.
366 if (!(dst
->mxlock
&(1<<RTAX_CWND
)))
367 dst
->cwnd
= (dst
->cwnd
+ tp
->snd_ssthresh
)>>1;
369 !(dst
->mxlock
&(1<<RTAX_SSTHRESH
)) &&
370 tp
->snd_ssthresh
> dst
->ssthresh
)
371 dst
->ssthresh
= tp
->snd_ssthresh
;
376 /* Initialize metrics on socket. */
378 static void tcp_init_metrics(struct sock
*sk
)
380 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
381 struct dst_entry
*dst
= __sk_dst_get(sk
);
388 if (dst
->mxlock
&(1<<RTAX_CWND
))
389 tp
->snd_cwnd_clamp
= dst
->cwnd
;
391 tp
->snd_ssthresh
= dst
->ssthresh
;
392 if (tp
->snd_ssthresh
> tp
->snd_cwnd_clamp
)
393 tp
->snd_ssthresh
= tp
->snd_cwnd_clamp
;
399 if (!tp
->srtt
&& dst
->rtt
< (TCP_TIMEOUT_INIT
<<3))
402 /* Initial rtt is determined from SYN,SYN-ACK.
403 * The segment is small and rtt may appear much
404 * less than real one. Use per-dst memory
405 * to make it more realistic.
407 * A bit of theory. RTT is time passed after "normal" sized packet
408 * is sent until it is ACKed. In normal curcumstances sending small
409 * packets force peer to delay ACKs and calculation is correct too.
410 * The algorithm is adaptive and, provided we follow specs, it
411 * NEVER underestimate RTT. BUT! If peer tries to make some clever
412 * tricks sort of "quick acks" for time long enough to decrease RTT
413 * to low value, and then abruptly stops to do it and starts to delay
414 * ACKs, wait for troubles.
416 if (dst
->rtt
> tp
->srtt
)
418 if (dst
->rttvar
> tp
->mdev
)
419 tp
->mdev
= dst
->rttvar
;
422 if (tp
->rto
< TCP_TIMEOUT_INIT
&& !tp
->saw_tstamp
)
424 tp
->snd_cwnd
= tcp_init_cwnd(tp
);
429 /* Play conservative. If timestamps are not
430 * supported, TCP will fail to recalculate correct
431 * rtt, if initial rto is too small. FORGET ALL AND RESET!
433 if (!tp
->saw_tstamp
&& tp
->srtt
) {
435 tp
->mdev
= TCP_TIMEOUT_INIT
;
436 tp
->rto
= TCP_TIMEOUT_INIT
;
440 /* WARNING: this must not be called if tp->saw_tstamp was false. */
441 extern __inline__
void
442 tcp_replace_ts_recent(struct sock
*sk
, struct tcp_opt
*tp
, u32 seq
)
444 if (!after(seq
, tp
->rcv_wup
)) {
445 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
446 * extra check below makes sure this can only happen
447 * for pure ACK frames. -DaveM
449 * Not only, also it occurs for expired timestamps
450 * and RSTs with bad timestamp option. --ANK
453 if((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) >= 0 ||
454 xtime
.tv_sec
>= tp
->ts_recent_stamp
+ TCP_PAWS_24DAYS
) {
455 tp
->ts_recent
= tp
->rcv_tsval
;
456 tp
->ts_recent_stamp
= xtime
.tv_sec
;
461 extern __inline__
int tcp_paws_discard(struct tcp_opt
*tp
, struct sk_buff
*skb
)
463 return ((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) < 0 &&
464 xtime
.tv_sec
< tp
->ts_recent_stamp
+ TCP_PAWS_24DAYS
466 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
468 I cannot see quitely as all the idea behind PAWS
471 The problem is only in reordering duplicate ACKs.
472 Hence, we can check this rare case more carefully.
474 1. Check that it is really duplicate ACK (ack==snd_una)
475 2. Give it some small "replay" window (~RTO)
477 We do not know units of foreign ts values, but make conservative
478 assumption that they are >=1ms. It solves problem
479 noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
481 && (TCP_SKB_CB(skb
)->seq
!= TCP_SKB_CB(skb
)->end_seq
||
482 TCP_SKB_CB(skb
)->ack_seq
!= tp
->snd_una
||
484 (s32
)(tp
->ts_recent
- tp
->rcv_tsval
) > (tp
->rto
*1024)/HZ
));
/* Slow-path acceptability check of a segment [seq, end_seq) against
 * the receive window ending at end_window = rcv_wup + rcv_wnd.
 * NOTE(review): listing truncated -- original lines 489, 493, 495-497,
 * 500, 502 and 504-505 (incl. #else/#endif and part of the main
 * condition/returns) are missing, so the full predicate cannot be
 * read from this extract.
 */
488 static int __tcp_sequence(struct tcp_opt
*tp
, u32 seq
, u32 end_seq
)
490 u32 end_window
= tp
->rcv_wup
+ tp
->rcv_wnd
;
491 #ifdef TCP_FORMAL_WINDOW
492 u32 rcv_wnd
= tcp_receive_window(tp
);
494 u32 rcv_wnd
= tp
->rcv_wnd
;
498 after(end_seq
, tp
->rcv_nxt
) &&
499 before(seq
, end_window
))
501 if (seq
!= end_window
)
503 return (seq
== end_seq
);
506 /* This function checks to see if the tcp header is actually acceptable. */
/* Fast path for the common in-order case (seq == rcv_nxt); defers to
 * __tcp_sequence() otherwise.
 * NOTE(review): listing truncated -- original lines 508, 511, 513, 516
 * and 518 (incl. #else/#endif and the closing brace) are missing from
 * this extract.
 */
507 extern __inline__
int tcp_sequence(struct tcp_opt
*tp
, u32 seq
, u32 end_seq
)
509 #ifdef TCP_FORMAL_WINDOW
510 u32 rcv_wnd
= tcp_receive_window(tp
);
512 u32 rcv_wnd
= tp
->rcv_wnd
;
514 if (seq
== tp
->rcv_nxt
)
515 return (rcv_wnd
|| (end_seq
== seq
));
517 return __tcp_sequence(tp
, seq
, end_seq
);
520 /* When we get a reset we do this. */
521 static void tcp_reset(struct sock
*sk
)
523 /* We want the right error as BSD sees it (and indeed as we do). */
526 sk
->err
= ECONNREFUSED
;
534 sk
->err
= ECONNRESET
;
538 sk
->error_report(sk
);
543 /* This tags the retransmission queue when SACKs arrive. */
544 static void tcp_sacktag_write_queue(struct sock
*sk
, struct tcp_sack_block
*sp
, int nsacks
)
546 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
550 struct sk_buff
*skb
= skb_peek(&sk
->write_queue
);
551 __u32 start_seq
= ntohl(sp
->start_seq
);
552 __u32 end_seq
= ntohl(sp
->end_seq
);
555 while((skb
!= NULL
) &&
556 (skb
!= tp
->send_head
) &&
557 (skb
!= (struct sk_buff
*)&sk
->write_queue
)) {
558 /* The retransmission queue is always in order, so
559 * we can short-circuit the walk early.
561 if(after(TCP_SKB_CB(skb
)->seq
, end_seq
))
564 /* We play conservative, we don't allow SACKS to partially
565 * tag a sequence space.
568 if(!after(start_seq
, TCP_SKB_CB(skb
)->seq
) &&
569 !before(end_seq
, TCP_SKB_CB(skb
)->end_seq
)) {
570 /* If this was a retransmitted frame, account for it. */
571 if((TCP_SKB_CB(skb
)->sacked
& TCPCB_SACKED_RETRANS
) &&
574 TCP_SKB_CB(skb
)->sacked
|= TCPCB_SACKED_ACKED
;
576 /* RULE: All new SACKs will either decrease retrans_out
577 * or advance fackets_out.
579 if(fack_count
> tp
->fackets_out
)
580 tp
->fackets_out
= fack_count
;
584 sp
++; /* Move on to the next SACK block. */
588 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
589 * But, this can also be called on packets in the established flow when
590 * the fast version below fails.
592 void tcp_parse_options(struct sock
*sk
, struct tcphdr
*th
, struct tcp_opt
*tp
, int no_fancy
)
595 int length
=(th
->doff
*4)-sizeof(struct tcphdr
);
597 ptr
= (unsigned char *)(th
+ 1);
607 case TCPOPT_NOP
: /* Ref: RFC 793 section 3.1 */
612 if (opsize
< 2) /* "silly options" */
615 break; /* don't parse partial options */
618 if(opsize
==TCPOLEN_MSS
&& th
->syn
) {
619 u16 in_mss
= ntohs(*(__u16
*)ptr
);
621 if (tp
->user_mss
&& tp
->user_mss
< in_mss
)
622 in_mss
= tp
->user_mss
;
623 tp
->mss_clamp
= in_mss
;
628 if(opsize
==TCPOLEN_WINDOW
&& th
->syn
)
629 if (!no_fancy
&& sysctl_tcp_window_scaling
) {
631 tp
->snd_wscale
= *(__u8
*)ptr
;
632 if(tp
->snd_wscale
> 14) {
634 printk("tcp_parse_options: Illegal window "
635 "scaling value %d >14 received.",
641 case TCPOPT_TIMESTAMP
:
642 if(opsize
==TCPOLEN_TIMESTAMP
) {
643 if (sysctl_tcp_timestamps
&& !no_fancy
) {
646 tp
->rcv_tsval
= ntohl(*(__u32
*)ptr
);
647 tp
->rcv_tsecr
= ntohl(*(__u32
*)(ptr
+4));
651 case TCPOPT_SACK_PERM
:
652 if(opsize
==TCPOLEN_SACK_PERM
&& th
->syn
) {
653 if (sysctl_tcp_sack
&& !no_fancy
) {
661 if((opsize
>= (TCPOLEN_SACK_BASE
+ TCPOLEN_SACK_PERBLOCK
)) &&
662 sysctl_tcp_sack
&& (sk
!= NULL
) && !th
->syn
) {
663 int sack_bytes
= opsize
- TCPOLEN_SACK_BASE
;
665 if(!(sack_bytes
% TCPOLEN_SACK_PERBLOCK
)) {
666 int num_sacks
= sack_bytes
>> 3;
667 struct tcp_sack_block
*sackp
;
669 sackp
= (struct tcp_sack_block
*)ptr
;
670 tcp_sacktag_write_queue(sk
, sackp
, num_sacks
);
680 /* Fast parse options. This hopes to only see timestamps.
681 * If it is wrong it falls back on tcp_parse_options().
683 static __inline__
int tcp_fast_parse_options(struct sock
*sk
, struct tcphdr
*th
, struct tcp_opt
*tp
)
685 /* If we didn't send out any options ignore them all. */
686 if (tp
->tcp_header_len
== sizeof(struct tcphdr
))
688 if (th
->doff
== sizeof(struct tcphdr
)>>2) {
691 } else if (th
->doff
== (sizeof(struct tcphdr
)>>2)+(TCPOLEN_TSTAMP_ALIGNED
>>2)) {
692 __u32
*ptr
= (__u32
*)(th
+ 1);
693 if (*ptr
== __constant_ntohl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16)
694 | (TCPOPT_TIMESTAMP
<< 8) | TCPOLEN_TIMESTAMP
)) {
697 tp
->rcv_tsval
= ntohl(*ptr
);
699 tp
->rcv_tsecr
= ntohl(*ptr
);
703 tcp_parse_options(sk
, th
, tp
, 0);
707 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
708 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
709 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
710 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
711 #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged new data. */
/* Leave fast-retransmit state: if cwnd was inflated past the third
 * duplicate ACK, deflate it back to ssthresh.
 * NOTE(review): listing truncated -- original lines 714 and 717-719
 * (braces and, presumably, the dup_acks reset -- TODO confirm) are
 * missing from this extract.
 */
713 static __inline__
void clear_fast_retransmit(struct tcp_opt
*tp
)
715 if (tp
->dup_acks
> 3)
716 tp
->snd_cwnd
= (tp
->snd_ssthresh
);
721 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
722 * retransmit timer fires.
724 static void tcp_fast_retrans(struct sock
*sk
, u32 ack
, int not_dup
)
726 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
728 /* Note: If not_dup is set this implies we got a
729 * data carrying packet or a window update.
730 * This carries no new information about possible
731 * lost packets, so we have to ignore it for the purposes
732 * of counting duplicate acks. Ideally this does not imply we
733 * should stop our fast retransmit phase, more acks may come
734 * later without data to help us. Unfortunately this would make
735 * the code below much more complex. For now if I see such
736 * a packet I clear the fast retransmit phase.
738 if (ack
== tp
->snd_una
&& tp
->packets_out
&& (not_dup
== 0)) {
739 /* This is the standard reno style fast retransmit branch. */
741 /* 1. When the third duplicate ack is received, set ssthresh
742 * to one half the current congestion window, but no less
743 * than two segments. Retransmit the missing segment.
745 if (tp
->high_seq
== 0 || after(ack
, tp
->high_seq
)) {
747 if ((tp
->fackets_out
> 3) || (tp
->dup_acks
== 3)) {
748 __tcp_enter_cong_avoid(tp
);
749 /* ... and account for 3 ACKs, which are
750 * already received to this time.
755 tcp_retransmit_skb(sk
,
756 skb_peek(&sk
->write_queue
));
758 tcp_fack_retransmit(sk
);
759 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, tp
->rto
);
761 } else if (++tp
->dup_acks
> 3) {
762 /* 2. Each time another duplicate ACK arrives, increment
763 * cwnd by the segment size. [...] Transmit a packet...
765 * Packet transmission will be done on normal flow processing
766 * since we're not in "retransmit mode". We do not use
767 * duplicate ACKs to artificially inflate the congestion
768 * window when doing FACK.
770 if(!tp
->fackets_out
) {
773 /* Fill any further holes which may have
776 * We may want to change this to run every
777 * further multiple-of-3 dup ack increments,
778 * to be more robust against out-of-order
779 * packet delivery. -DaveM
781 tcp_fack_retransmit(sk
);
784 } else if (tp
->high_seq
!= 0) {
785 /* In this branch we deal with clearing the Floyd style
786 * block on duplicate fast retransmits, and if requested
787 * we do Hoe style secondary fast retransmits.
789 if (!before(ack
, tp
->high_seq
) || (not_dup
& FLAG_DATA
) != 0) {
790 /* Once we have acked all the packets up to high_seq
791 * we are done this fast retransmit phase.
792 * Alternatively data arrived. In this case we
793 * Have to abort the fast retransmit attempt.
794 * Note that we do want to accept a window
795 * update since this is expected with Hoe's algorithm.
797 clear_fast_retransmit(tp
);
799 /* After we have cleared up to high_seq we can
800 * clear the Floyd style block.
802 if (!before(ack
, tp
->high_seq
)) {
806 } else if (tp
->dup_acks
>= 3) {
807 if (!tp
->fackets_out
) {
808 /* Hoe Style. We didn't ack the whole
809 * window. Take this as a cue that
810 * another packet was lost and retransmit it.
811 * Don't muck with the congestion window here.
812 * Note that we have to be careful not to
813 * act if this was a window update and it
814 * didn't ack new data, since this does
815 * not indicate a packet left the system.
816 * We can test this by just checking
817 * if ack changed from snd_una, since
818 * the only way to get here without advancing
819 * from snd_una is if this was a window update.
821 if (ack
!= tp
->snd_una
&& before(ack
, tp
->high_seq
)) {
822 tcp_retransmit_skb(sk
,
823 skb_peek(&sk
->write_queue
));
824 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, tp
->rto
);
827 /* FACK style, fill any remaining holes in
830 tcp_fack_retransmit(sk
);
836 /* This is Jacobson's slow start and congestion avoidance.
837 * SIGCOMM '88, p. 328.
839 static __inline__
void tcp_cong_avoid(struct tcp_opt
*tp
)
841 if (tp
->snd_cwnd
<= tp
->snd_ssthresh
) {
842 /* In "safe" area, increase. */
843 if (tp
->snd_cwnd
< tp
->snd_cwnd_clamp
)
846 /* In dangerous area, increase slowly.
847 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
849 if (tp
->snd_cwnd_cnt
>= tp
->snd_cwnd
) {
850 if (tp
->snd_cwnd
< tp
->snd_cwnd_clamp
)
858 /* Remove acknowledged frames from the retransmission queue. */
859 static int tcp_clean_rtx_queue(struct sock
*sk
, __u32 ack
,
860 __u32
*seq
, __u32
*seq_rtt
)
862 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
864 __u32 now
= tcp_time_stamp
;
867 /* If we are retransmitting, and this ACK clears up to
868 * the retransmit head, or further, then clear our state.
870 if (tp
->retrans_head
!= NULL
&&
871 !before(ack
, TCP_SKB_CB(tp
->retrans_head
)->end_seq
))
872 tp
->retrans_head
= NULL
;
874 while((skb
=skb_peek(&sk
->write_queue
)) && (skb
!= tp
->send_head
)) {
875 struct tcp_skb_cb
*scb
= TCP_SKB_CB(skb
);
876 __u8 sacked
= scb
->sacked
;
878 /* If our packet is before the ack sequence we can
879 * discard it as it's confirmed to have arrived at
882 if (after(scb
->end_seq
, ack
))
885 /* Initial outgoing SYN's get put onto the write_queue
886 * just like anything else we transmit. It is not
887 * true data, and if we misinform our callers that
888 * this ACK acks real data, we will erroneously exit
889 * connection startup slow start one packet too
890 * quickly. This is severely frowned upon behavior.
892 if((sacked
& TCPCB_SACKED_RETRANS
) && tp
->retrans_out
)
894 if(!(scb
->flags
& TCPCB_FLAG_SYN
)) {
895 acked
|= FLAG_DATA_ACKED
;
896 if(sacked
& TCPCB_SACKED_RETRANS
)
897 acked
|= FLAG_RETRANS_DATA_ACKED
;
901 acked
|= FLAG_SYN_ACKED
;
902 /* This is pure paranoia. */
903 tp
->retrans_head
= NULL
;
907 *seq_rtt
= now
- scb
->when
;
908 __skb_unlink(skb
, skb
->list
);
914 static void tcp_ack_probe(struct sock
*sk
, __u32 ack
)
916 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
918 /* Was it a usable window open? */
920 if (tp
->send_head
!= NULL
) {
921 if (!after(TCP_SKB_CB(tp
->send_head
)->end_seq
, ack
+ tp
->snd_wnd
)) {
923 tcp_clear_xmit_timer(sk
, TCP_TIME_PROBE0
);
924 /* If packets_out==0, socket must be waked up by
925 * subsequent tcp_data_snd_check(). This function is
926 * not for random using!
928 } else if (!tp
->packets_out
) {
929 tcp_reset_xmit_timer(sk
, TCP_TIME_PROBE0
,
930 min(tp
->rto
<< tp
->backoff
, TCP_RTO_MAX
));
935 /* Should we open up the congestion window? */
/* Decide whether this ACK may open the congestion window: only when
 * new data was acked, and during non-FACK fast retransmit/recovery
 * cwnd management is left to the recovery code instead.
 * NOTE(review): listing truncated -- original lines 937, 940-941, 946,
 * 949-950, 953-957 and 959-960 (the return statements and closing
 * braces) are missing from this extract.
 */
936 static __inline__
int should_advance_cwnd(struct tcp_opt
*tp
, int flag
)
938 /* Data must have been acked. */
939 if ((flag
& FLAG_DATA_ACKED
) == 0)
942 /* Some of the data acked was retransmitted somehow? */
943 if ((flag
& FLAG_RETRANS_DATA_ACKED
) != 0) {
944 /* We advance in all cases except during
945 * non-FACK fast retransmit/recovery.
947 if (tp
->fackets_out
!= 0 ||
948 tp
->retransmits
!= 0)
951 /* Non-FACK fast retransmit does its own
952 * congestion window management, don't get
958 /* New non-retransmitted data acked, always advance. */
962 /* Read draft-ietf-tcplw-high-performance before mucking
963 * with this code. (Supersedes RFC1323)
965 static void tcp_ack_saw_tstamp(struct sock
*sk
, struct tcp_opt
*tp
,
966 u32 seq
, u32 ack
, int flag
)
970 /* RTTM Rule: A TSecr value received in a segment is used to
971 * update the averaged RTT measurement only if the segment
972 * acknowledges some new data, i.e., only if it advances the
973 * left edge of the send window.
975 * See draft-ietf-tcplw-high-performance-00, section 3.3.
976 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
978 if (!(flag
& (FLAG_DATA_ACKED
|FLAG_SYN_ACKED
)))
981 seq_rtt
= tcp_time_stamp
- tp
->rcv_tsecr
;
982 tcp_rtt_estimator(tp
, seq_rtt
);
983 if (tp
->retransmits
) {
984 if (tp
->packets_out
== 0) {
991 /* Still retransmitting, use backoff */
993 tp
->rto
= tp
->rto
<< tp
->backoff
;
1002 static __inline__
void tcp_ack_packets_out(struct sock
*sk
, struct tcp_opt
*tp
)
1004 struct sk_buff
*skb
= skb_peek(&sk
->write_queue
);
1007 /* It occured in 2.3, because of racy timers. Namely,
1008 * retransmit timer did not check packets_out and retransmitted
1009 * send_head sometimes and, hence, messed all the write_queue.
1010 * Now it is impossible, I bet. --ANK
1013 printk("Sucks! packets_out=%d, sk=%p, %d\n", tp
->packets_out
, sk
, sk
->state
);
1018 /* Some data was ACK'd, if still retransmitting (due to a
1019 * timeout), resend more of the retransmit queue. The
1020 * congestion window is handled properly by that code.
1022 if (tp
->retransmits
) {
1023 tcp_xmit_retransmit_queue(sk
);
1024 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, tp
->rto
);
1026 __u32 when
= tp
->rto
- (tcp_time_stamp
- TCP_SKB_CB(skb
)->when
);
1027 if ((__s32
)when
< 0)
1029 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, when
);
1033 /* This routine deals with incoming acks, but not outgoing ones. */
1034 static int tcp_ack(struct sock
*sk
, struct tcphdr
*th
,
1035 u32 ack_seq
, u32 ack
, int len
)
1037 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1042 if(sk
->state
== TCP_CLOSE
)
1043 return 1; /* Dead, can't ack any more so why bother */
1045 /* If the ack is newer than sent or older than previous acks
1046 * then we can probably ignore it.
1048 if (after(ack
, tp
->snd_nxt
) || before(ack
, tp
->snd_una
))
1049 goto uninteresting_ack
;
1051 /* If there is data set flag 1 */
1052 if (len
!= th
->doff
*4)
1055 /* Update our send window. */
1057 /* This is the window update code as per RFC 793
1058 * snd_wl{1,2} are used to prevent unordered
1059 * segments from shrinking the window
1061 if (before(tp
->snd_wl1
, ack_seq
) ||
1062 (tp
->snd_wl1
== ack_seq
&& !after(tp
->snd_wl2
, ack
))) {
1063 u32 nwin
= ntohs(th
->window
) << tp
->snd_wscale
;
1065 if ((tp
->snd_wl2
!= ack
) || (nwin
> tp
->snd_wnd
)) {
1066 flag
|= FLAG_WIN_UPDATE
;
1067 if (tp
->snd_wnd
!= nwin
) {
1070 /* Note, it is the only place, where
1071 * fast path is recovered for sending TCP.
1073 if (skb_queue_len(&tp
->out_of_order_queue
) == 0 &&
1074 #ifdef TCP_FORMAL_WINDOW
1075 tcp_receive_window(tp
) &&
1078 tcp_fast_path_on(tp
);
1080 if (nwin
> tp
->max_window
) {
1081 tp
->max_window
= nwin
;
1082 tcp_sync_mss(sk
, tp
->pmtu_cookie
);
1086 tp
->snd_wl1
= ack_seq
;
1091 /* BEWARE! From this place and until return from this function
1092 * snd_nxt and snd_wnd are out of sync. All the routines, called
1093 * from here must get "ack" as argument or they should not depend
1094 * on right edge of window. It is _UGLY_. It cries to be fixed. --ANK
1097 /* We passed data and got it acked, remove any soft error
1098 * log. Something worked...
1102 tp
->rcv_tstamp
= tcp_time_stamp
;
1104 /* See if we can take anything off of the retransmit queue. */
1105 flag
|= tcp_clean_rtx_queue(sk
, ack
, &seq
, &seq_rtt
);
1107 /* If this ack opens up a zero window, clear backoff. It was
1108 * being used to time the probes, and is probably far higher than
1109 * it needs to be for normal retransmission.
1111 if (tcp_timer_is_set(sk
, TCP_TIME_PROBE0
))
1112 tcp_ack_probe(sk
, ack
);
1114 /* We must do this here, before code below clears out important
1115 * state contained in tp->fackets_out and tp->retransmits. -DaveM
1117 if (should_advance_cwnd(tp
, flag
))
1120 /* If we have a timestamp, we always do rtt estimates. */
1121 if (tp
->saw_tstamp
) {
1122 tcp_ack_saw_tstamp(sk
, tp
, seq
, ack
, flag
);
1124 /* If we were retransmiting don't count rtt estimate. */
1125 if (tp
->retransmits
) {
1126 if (tp
->packets_out
== 0) {
1127 tp
->retransmits
= 0;
1128 tp
->fackets_out
= 0;
1129 tp
->retrans_out
= 0;
1132 /* We don't have a timestamp. Can only use
1133 * packets that are not retransmitted to determine
1134 * rtt estimates. Also, we must not reset the
1135 * backoff for rto until we get a non-retransmitted
1136 * packet. This allows us to deal with a situation
1137 * where the network delay has increased suddenly.
1138 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
1140 if (flag
& (FLAG_DATA_ACKED
|FLAG_SYN_ACKED
)) {
1141 if(!(flag
& FLAG_RETRANS_DATA_ACKED
)) {
1143 tcp_rtt_estimator(tp
, seq_rtt
);
1151 if (tp
->packets_out
) {
1152 if (flag
& FLAG_DATA_ACKED
)
1153 tcp_ack_packets_out(sk
, tp
);
1155 tcp_clear_xmit_timer(sk
, TCP_TIME_RETRANS
);
1158 flag
&= (FLAG_DATA
| FLAG_WIN_UPDATE
);
1159 if ((ack
== tp
->snd_una
&& tp
->packets_out
&& flag
== 0) ||
1160 (tp
->high_seq
!= 0)) {
1161 tcp_fast_retrans(sk
, ack
, flag
);
1163 /* Clear any aborted fast retransmit starts. */
1166 /* It is not a brain fart, I thought a bit now. 8)
1168 * Forward progress is indicated, if:
1169 * 1. the ack acknowledges new data.
1170 * 2. or the ack is duplicate, but it is caused by new segment
1171 * arrival. This case is filtered by:
1172 * - it contains no data, syn or fin.
1173 * - it does not update window.
1174 * 3. or new SACK. It is difficult to check, so that we ignore it.
1176 * Forward progress is also indicated by arrival new data,
1177 * which was caused by window open from our side. This case is more
1178 * difficult and it is made (alas, incorrectly) in tcp_data_queue().
1181 if (ack
!= tp
->snd_una
|| (flag
== 0 && !th
->fin
))
1182 dst_confirm(sk
->dst_cache
);
1184 if (ack
!= tp
->snd_una
)
1187 /* Remember the highest ack received. */
1192 SOCK_DEBUG(sk
, "Ack ignored %u %u\n", ack
, tp
->snd_nxt
);
1196 int tcp_paws_check(struct tcp_opt
*tp
, int rst
)
1198 if ((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) >= 0)
1200 if (xtime
.tv_sec
>= tp
->ts_recent_stamp
+ TCP_PAWS_24DAYS
)
1203 /* RST segments are not recommended to carry timestamp,
1204 and, if they do, it is recommended to ignore PAWS because
1205 "their cleanup function should take precedence over timestamps."
1206 Certainly, it is mistake. It is necessary to understand the reasons
1207 of this constraint to relax it: if peer reboots, clock may go
1208 out-of-sync and half-open connections will not be reset.
1209 Actually, the problem would be not existing if all
1210 the implementations followed draft about maintaining clock
1211 via reboots. Linux-2.2 DOES NOT!
1213 However, we can relax time bounds for RST segments to MSL.
1215 if (rst
&& xtime
.tv_sec
>= tp
->ts_recent_stamp
+ TCP_PAWS_MSL
)
/* Window membership test for [seq, end_seq) against (s_win, e_win):
 * accept on overlap, or when a zero-length segment sits exactly at
 * the right edge e_win.
 * NOTE(review): listing truncated -- original lines 1221-1223, 1225
 * and 1227-1228 (braces and any leading special cases) are missing
 * from this extract.
 */
1220 static __inline__
int tcp_in_window(u32 seq
, u32 end_seq
, u32 s_win
, u32 e_win
)
1224 if (after(end_seq
, s_win
) && before(seq
, e_win
))
1226 return (seq
== e_win
&& seq
== end_seq
);
/* New-style handling of TIME_WAIT sockets. */

/* Must be called with locally disabled BHs. */
/* tcp_timewait_kill - tear down a TIME-WAIT bucket: unlink it from the
 * established hash and from its bind bucket, freeing the bind bucket if
 * this was its last owner.
 * NOTE(review): several unlink guards/braces are not visible in this
 * extract; the visible token stream is preserved as-is.
 */
void tcp_timewait_kill(struct tcp_tw_bucket *tw)
	struct tcp_ehash_bucket *ehead;
	struct tcp_bind_hashbucket *bhead;
	struct tcp_bind_bucket *tb;

	/* Unlink from established hashes. */
	ehead = &tcp_ehash[tw->hashent];
	write_lock(&ehead->lock);
	write_unlock(&ehead->lock);
	/* Splice tw out of its doubly-linked hash chain. */
	tw->next->pprev = tw->pprev;
	*(tw->pprev) = tw->next;
	write_unlock(&ehead->lock);

	/* Disassociate with bind bucket. */
	bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
	spin_lock(&bhead->lock);
	if ((tb = tw->tb) != NULL) {
		/* Remove tw from the bucket's owner list. */
		tw->bind_next->bind_pprev = tw->bind_pprev;
		*(tw->bind_pprev) = tw->bind_next;
		if (tb->owners == NULL) {
			/* Last owner gone: unlink and free the bind bucket. */
			tb->next->pprev = tb->pprev;
			*(tb->pprev) = tb->next;
			kmem_cache_free(tcp_bucket_cachep, tb);
	spin_unlock(&bhead->lock);

#ifdef INET_REFCNT_DEBUG
	/* At this point only the caller should hold a reference. */
	if (atomic_read(&tw->refcnt) != 1) {
		printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
/* Main purpose of TIME-WAIT state is to close connection gracefully,
 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
 * (and, probably, tail of data) and one or more our ACKs are lost.
 *
 * What is TIME-WAIT timeout? It is associated with maximal packet
 * lifetime in the internet, which results in wrong conclusion, that
 * it is set to catch "old duplicate segments" wandering out of their path.
 * It is not quite correct. This timeout is calculated so that it exceeds
 * maximal retransmission timeout enough to allow to lose one (or more)
 * segments sent by peer and our ACKs. This time may be calculated from RTO.
 *
 * When TIME-WAIT socket receives RST, it means that another end
 * finally closed and we are allowed to kill TIME-WAIT too.
 *
 * Second purpose of TIME-WAIT is catching old duplicate segments.
 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
 * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 *
 * If we invented some more clever way to catch duplicates
 * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) TW bucket
 * is _not_ stateless. It means, that strictly speaking we must
 * spinlock it. I do not want! Well, probability of misbehaviour
 * is ridiculously low and, seems, we could use some mb() tricks
 * to avoid misread sequence numbers, states etc. --ANK
 */
/* tcp_timewait_state_process - handle a segment that arrived for a socket
 * sitting in a TIME-WAIT bucket (true TIME-WAIT, or the "dead" FIN-WAIT-2
 * substate kept in the same bucket).
 * @tw:  the time-wait bucket the segment was demultiplexed to
 * @skb: the arriving segment
 * @th:  its TCP header
 * @len: segment length
 * Returns a TCP_TW_* verdict telling the caller how to react.
 */
tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
			   struct tcphdr *th, unsigned len)
	int paws_reject = 0;

	/* Header can carry options and we remembered a timestamp:
	 * parse options and run the PAWS test against tw's stored state.
	 */
	if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
		tcp_parse_options(NULL, th, &tp, 0);
		if (tp.saw_tstamp) {
			tp.ts_recent = tw->ts_recent;
			tp.ts_recent_stamp = tw->ts_recent_stamp;
			paws_reject = tcp_paws_check(&tp, th->rst);

	if (tw->substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd))

		/* A SYN whose seq differs from the original connection's
		 * SYN is invalid here. */
		if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq)

		/* Segment brings nothing new: already-acked data or a
		 * zero-length (bare ack) segment. */
		if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			return TCP_TW_SUCCESS;

		/* New data or FIN. If new data arrive after half-duplex
		 * close, reset. */
		if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) {
			tcp_tw_deschedule(tw);
			tcp_timewait_kill(tw);

		/* FIN arrived, enter true time-wait state. */
		tw->substate = TCP_TIME_WAIT;
		tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tp.saw_tstamp) {
			tw->ts_recent_stamp = xtime.tv_sec;
			tw->ts_recent = tp.rcv_tsval;

		/* I am shamed, but failed to make it more elegant.
		 * Yes, it is direct reference to IP, which is impossible
		 * to generalize to IPv6. Taking into account that IPv6
		 * does not understand recycling in any case, it is not
		 * a big problem in practice. --ANK */
		if (tw->family == AF_INET &&
		    sysctl_tcp_tw_recycle && tw->ts_recent_stamp &&
		    tcp_v4_tw_remember_stamp(tw))
			tcp_tw_schedule(tw, tw->timeout);
			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

	/*
	 * Now real TIME-WAIT state.
	 *
	 * "When a connection is [...] on TIME-WAIT state [...]
	 * [a TCP] MAY accept a new SYN from the remote TCP to
	 * reopen the connection directly, if it:
	 *
	 * (1) assigns its initial sequence number for the new
	 * connection to be larger than the largest sequence
	 * number it used on the previous connection incarnation,
	 *
	 * (2) returns to TIME-WAIT state if the SYN turns out
	 * to be an old duplicate".
	 */
	    (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
	     TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
		/* In window segment, it may be only reset or bare ack. */

		/* This is TIME_WAIT assassination, in two flavors.
		 * Oh well... nobody has a sufficient solution to this
		 * protocol bug yet. */
		if (sysctl_tcp_rfc1337 == 0) {
			tcp_tw_deschedule(tw);
			tcp_timewait_kill(tw);
			return TCP_TW_SUCCESS;
		tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

		if (tp.saw_tstamp) {
			tw->ts_recent = tp.rcv_tsval;
			tw->ts_recent_stamp = xtime.tv_sec;

		return TCP_TW_SUCCESS;

	/* Out of window segment.
	 *
	 * All the segments are ACKed immediately.
	 *
	 * The only exception is new SYN. We accept it, if it is
	 * not old duplicate and we are not in danger to be killed
	 * by delayed old duplicates. The RFC check (that it has a
	 * newer sequence number) works at rates <40Mbit/sec.
	 * However, if paws works, it is reliable AND even more,
	 * we even may relax silly seq space cutoff.
	 *
	 * RED-PEN: we violate main RFC requirement, if this SYN will appear
	 * old duplicate (i.e. we receive RST in reply to SYN-ACK),
	 * we must return socket to time-wait state. It is not good,
	 * but not fatal yet.
	 */
	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
	     (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) {
		/* Pick an ISN above the old connection's snd_nxt. */
		u32 isn = tw->snd_nxt + 2;

		TCP_SKB_CB(skb)->when = isn;

	NET_INC_STATS_BH(PAWSEstabRejected);

	/* In this case we must reset the TIMEWAIT timer.
	 *
	 * If it is ACKless SYN it may be both old duplicate
	 * and new good SYN with random sequence number <rcv_nxt.
	 * Do not reschedule in the last case.
	 */
	if (paws_reject || th->ack)
		tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

	/* Send ACK. Note, we do not put the bucket,
	 * it will be released by caller.
	 */
	return TCP_TW_SUCCESS;
/* Enter the time wait state. This is called with locally disabled BH.
 * Essentially we whip up a timewait bucket, copy the
 * relevant info into it from the SK, and mess with hash chains.
 */
static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
	struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
	struct tcp_bind_hashbucket *bhead;
	struct sock **head, *sktw;

	write_lock(&ehead->lock);

	/* Step 1: Remove SK from established hash. */
	sk->next->pprev = sk->pprev;
	*sk->pprev = sk->next;
	sock_prot_dec_use(sk->prot);

	/* Step 2: Hash TW into TIMEWAIT half of established hash table. */
	head = &(ehead + tcp_ehash_size)->chain;
	sktw = (struct sock *)tw;
	if((sktw->next = *head) != NULL)
		(*head)->pprev = &sktw->next;
	/* The hash chain now references tw: take a reference for it. */
	atomic_inc(&tw->refcnt);

	write_unlock(&ehead->lock);

	/* Step 3: Put TW into bind hash. Original socket stays there too.
	   Note, that any socket with sk->num!=0 MUST be bound in binding
	   cache, even if it is closed.
	 */
	bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
	spin_lock(&bhead->lock);
	/* sk->prev carries the bound socket's bind bucket pointer. */
	tw->tb = (struct tcp_bind_bucket *)sk->prev;
	BUG_TRAP(sk->prev!=NULL);
	if ((tw->bind_next = tw->tb->owners) != NULL)
		tw->tb->owners->bind_pprev = &tw->bind_next;
	tw->tb->owners = (struct sock *)tw;
	tw->bind_pprev = &tw->tb->owners;
	spin_unlock(&bhead->lock);
/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 * @sk:    socket being closed
 * @state: TCP_TIME_WAIT or a FIN-WAIT-2 substate (stored in tw->substate)
 * @timeo: requested timeout; forced to TCP_TIMEWAIT_LEN for true TIME-WAIT
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
	struct tcp_tw_bucket *tw = NULL;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* With tw recycling, try to store our timestamp for the peer. */
	if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp)
		recycle_ok = tp->af_specific->remember_stamp(sk);

	/* Respect the global cap on TIME-WAIT buckets. */
	if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
		tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);

		/* 3.5 * rto; NOTE(review): the code consuming this value
		 * is not visible in this extract. */
		int rto = (tp->rto<<2) - (tp->rto>>1);

		/* Give us an identity. */
		tw->daddr = sk->daddr;
		tw->rcv_saddr = sk->rcv_saddr;
		tw->bound_dev_if = sk->bound_dev_if;

		tw->state = TCP_TIME_WAIT;
		tw->substate = state;
		tw->sport = sk->sport;
		tw->dport = sk->dport;
		tw->family = sk->family;
		tw->reuse = sk->reuse;
		tw->rcv_wscale = tp->rcv_wscale;
		atomic_set(&tw->refcnt, 0);

		/* Snapshot of the connection's sequence/timestamp state. */
		tw->hashent = sk->hashent;
		tw->rcv_nxt = tp->rcv_nxt;
		tw->snd_nxt = tp->snd_nxt;
		tw->rcv_wnd = tcp_receive_window(tp);
		tw->syn_seq = tp->syn_seq;
		tw->ts_recent = tp->ts_recent;
		tw->ts_recent_stamp = tp->ts_recent_stamp;
		tw->pprev_death = NULL;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		if(tw->family == PF_INET6) {
			memcpy(&tw->v6_daddr,
			       &sk->net_pinfo.af_inet6.daddr,
			       sizeof(struct in6_addr));
			memcpy(&tw->v6_rcv_saddr,
			       &sk->net_pinfo.af_inet6.rcv_saddr,
			       sizeof(struct in6_addr));

		/* Linkage updates. */
		__tcp_tw_hashdance(sk, tw);

		/* Get the TIME_WAIT timeout firing. */
			tw->timeout = TCP_TIMEWAIT_LEN;
		if (state == TCP_TIME_WAIT)
			timeo = TCP_TIMEWAIT_LEN;

		tcp_tw_schedule(tw, timeo);
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up. We've got bigger problems than
		 * non-graceful socket closings.
		 */
		if (net_ratelimit())
			printk(KERN_INFO "TCP: time wait bucket table overflow\n");

	tcp_update_metrics(sk);
/*
 * Process the FIN bit. This now behaves as it is supposed to work
 * and the FIN takes effect when it is validly part of sequence
 * space. Not before when we get holes.
 *
 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
 * TIME-WAIT)
 *
 * If we are in FINWAIT-1, a received FIN indicates simultaneous
 * close and we go into CLOSING (and later onto TIME-WAIT)
 *
 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */
static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tp->fin_seq = TCP_SKB_CB(skb)->end_seq;
	/* Schedule an ACK for the FIN. */
	tp->ack.pending = 1;

	/* No more data will be received on this socket. */
	sk->shutdown |= RCV_SHUTDOWN;

	case TCP_ESTABLISHED:
		/* Move to CLOSE_WAIT */
		tcp_set_state(sk, TCP_CLOSE_WAIT);

	case TCP_CLOSE_WAIT:
		/* Received a retransmission of the FIN, do nothing. */

		/* RFC793: Remain in the LAST-ACK state. */

		/* This case occurs when a simultaneous close
		 * happens, we must ack the received FIN and
		 * enter the CLOSING state.
		 */
		tcp_set_state(sk, TCP_CLOSING);

		/* Received a FIN -- send ACK and enter TIME_WAIT. */
		tcp_time_wait(sk, TCP_TIME_WAIT, 0);

		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
		 * cases we should never reach this piece of code.
		 */
		printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);

	/* It _is_ possible, that we have something out-of-order _after_ FIN.
	 * Probably, we should reset in this case. For now drop them.
	 */
	__skb_queue_purge(&tp->out_of_order_queue);

	sk->state_change(sk);

	/* Do not send POLL_HUP for half duplex close. */
	if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
		sk_wake_async(sk, 1, POLL_HUP);
		sk_wake_async(sk, 1, POLL_IN);
/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 */
/* tcp_sack_maybe_coalesce - after sp has been grown, merge it with any
 * other SACK block whose sequence space it now touches, and close the
 * resulting hole in the SACK table.
 */
static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
	int this_sack, num_sacks = tp->num_sacks;
	struct tcp_sack_block *swalk = &tp->selective_acks[0];

	/* If more than one SACK block, see if the recent change to SP eats into
	 * or hits the sequence space of other SACK blocks, if so coalesce.
	 */
	if(num_sacks != 1) {
		for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {

			/* First case, bottom of SP moves into top of the
			 * sequence space of SWALK.
			 */
			if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
				sp->start_seq = swalk->start_seq;

			/* Second case, top of SP moves into bottom of the
			 * sequence space of SWALK.
			 */
			if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
				sp->end_seq = swalk->end_seq;

	/* SP is the only SACK, or no coalescing cases found. */

	/* Zap SWALK, by moving every further SACK up by one slot.
	 * Decrease num_sacks.
	 */
	for(; this_sack < num_sacks-1; this_sack++, swalk++) {
		struct tcp_sack_block *next = (swalk + 1);
		swalk->start_seq = next->start_seq;
		swalk->end_seq = next->end_seq;
1727 static __inline__
void tcp_sack_swap(struct tcp_sack_block
*sack1
, struct tcp_sack_block
*sack2
)
1731 tmp
= sack1
->start_seq
;
1732 sack1
->start_seq
= sack2
->start_seq
;
1733 sack2
->start_seq
= tmp
;
1735 tmp
= sack1
->end_seq
;
1736 sack1
->end_seq
= sack2
->end_seq
;
1737 sack2
->end_seq
= tmp
;
/* tcp_sack_new_ofo_skb - update the SACK table for a newly queued
 * out-of-order skb: grow an adjacent block when possible, otherwise
 * install a fresh block at the head of the table (dropping the oldest
 * block if the table is full).
 */
static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int cur_sacks = tp->num_sacks;

	/* Optimize for the common case, new ofo frames arrive
	 * "in order". ;-) This also satisfies the requirements
	 * of RFC2018 about ordering of SACKs.
	 */
	if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
		sp->end_seq = TCP_SKB_CB(skb)->end_seq;
		tcp_sack_maybe_coalesce(tp, sp);
	} else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
		/* Re-ordered arrival, in this case, can be optimized too. */
		sp->start_seq = TCP_SKB_CB(skb)->seq;
		tcp_sack_maybe_coalesce(tp, sp);
		struct tcp_sack_block *swap = sp + 1;
		/* With timestamps on, option space only fits 3 SACK blocks. */
		int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);

		/* Oh well, we have to move things around.
		 * Try to find a SACK we can tack this onto.
		 */
		for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
			if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
			   (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
				if(swap->end_seq == TCP_SKB_CB(skb)->seq)
					swap->end_seq = TCP_SKB_CB(skb)->end_seq;
					swap->start_seq = TCP_SKB_CB(skb)->seq;
				/* Promote the touched block to the head. */
				tcp_sack_swap(sp, swap);
				tcp_sack_maybe_coalesce(tp, sp);

		/* Could not find an adjacent existing SACK, build a new one,
		 * put it at the front, and shift everyone else down. We
		 * always know there is at least one SACK present already here.
		 *
		 * If the sack array is full, forget about the last one.
		 */
		if (cur_sacks >= max_sacks) {
		/* Shift every block down one slot to make room at the head. */
		while(cur_sacks >= 1) {
			struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
			struct tcp_sack_block *prev = (this - 1);
			this->start_seq = prev->start_seq;
			this->end_seq = prev->end_seq;

		/* Build the new head SACK, and we're done. */
		sp->start_seq = TCP_SKB_CB(skb)->seq;
		sp->end_seq = TCP_SKB_CB(skb)->end_seq;
/* tcp_sack_remove_skb - skb is being delivered in sequence; trim the
 * SACK block it eats into from the front, and drop that block entirely
 * if nothing of it remains.
 */
static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int num_sacks = tp->num_sacks;

	/* This is an in order data segment _or_ an out-of-order SKB being
	 * moved to the receive queue, so we know this removed SKB will eat
	 * from the front of a SACK.
	 */
	for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
		/* Check if the start of the sack is covered by skb. */
		if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
		   before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))

	/* This should only happen if so many SACKs get built that some get
	 * pushed out before we get here, or we eat some in sequence packets
	 * which are before the first SACK block.
	 */
	if(this_sack >= num_sacks)

	/* Advance the block's start past the delivered skb. */
	sp->start_seq = TCP_SKB_CB(skb)->end_seq;
	if(!before(sp->start_seq, sp->end_seq)) {
		/* Zap this SACK, by moving forward any other SACKS. */
		for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
			struct tcp_sack_block *next = (sp + 1);
			sp->start_seq = next->start_seq;
			sp->end_seq = next->end_seq;
/* tcp_sack_extend - old_skb is being replaced by the larger new_skb in
 * the out-of-order queue; move the end of the SACK block that ended at
 * old_skb to new_skb's end.
 */
static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int num_sacks = tp->num_sacks;

	/* Find the SACK block ending exactly at old_skb's end. */
	for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
		if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)

	/* No matching block: nothing to extend. */
	if(this_sack >= num_sacks)

	sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
/* This one checks to see if we can put data from the
 * out_of_order queue into the receive_queue.
 */
static void tcp_ofo_queue(struct sock *sk)
	struct sk_buff *skb;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	while ((skb = skb_peek(&tp->out_of_order_queue))) {
		/* Still a hole before this segment: stop. */
		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))

		/* Entirely below rcv_nxt: a duplicate, discard it. */
		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
			SOCK_DEBUG(sk, "ofo packet was already received \n");
			__skb_unlink(skb, skb->list);

		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

		/* Segment fills the gap: move it to the receive queue
		 * and retire its SACK coverage. */
		tcp_sack_remove_skb(tp, skb);
		__skb_unlink(skb, skb->list);
		__skb_queue_tail(&sk->receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
			tcp_fin(skb, sk, skb->h.th);
/* tcp_data_queue - queue a data segment: in-sequence data goes to the
 * receive queue (or straight into a waiting reader's iovec), duplicates
 * force an immediate ACK, and out-of-order segments are inserted into
 * the sorted out_of_order_queue with matching SACK bookkeeping.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
	struct sk_buff *skb1;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Queue data for delivery to the user.
	 * Packets in sequence go to the receive queue.
	 * Out of sequence packets to the out_of_order_queue.
	 */
	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
		/* Ok. In sequence. */
		/* The receiving task is running this code and has read
		 * everything up to rcv_nxt: copy straight to its iovec. */
		if (tp->ucopy.task == current &&
		    tp->copied_seq == tp->rcv_nxt &&
			int chunk = min(skb->len, tp->ucopy.len);

			__set_current_state(TASK_RUNNING);

			if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) {
				sk->error_report(sk);
			tp->ucopy.len -= chunk;
			tp->copied_seq += chunk;
			/* Fully consumed and no FIN: skb need not be queued. */
			eaten = (chunk == skb->len && !skb->h.th->fin);

			skb_set_owner_r(skb, sk);
			__skb_queue_tail(&sk->receive_queue, skb);

		dst_confirm(sk->dst_cache);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		tcp_event_data_recv(tp, skb);
			tcp_fin(skb, sk, skb->h.th);

		/* This may have eaten into a SACK block. */
		if(tp->sack_ok && tp->num_sacks)
			tcp_sack_remove_skb(tp, skb);

		/* Turn on fast path. */
		if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
#ifdef TCP_FORMAL_WINDOW
		    tcp_receive_window(tp) &&
			tcp_fast_path_on(tp);

		} else if (!sk->dead)
			sk->data_ready(sk, 0);

	/* An old packet, either a retransmit or some packet got lost. */
	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
		/* A retransmit, 2nd most common case. Force an immediate ack.
		 *
		 * It is impossible, seq is checked by top level.
		 */
		NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq));
		tcp_enter_quickack_mode(tp);
		tp->ack.pending = 1;

	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
		/* Partial packet, seq < rcv_next < end_seq */
		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

	/* Ok. This is an out_of_order segment, force an ack. */
	tp->ack.pending = 1;

	/* Disable header prediction. */

	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

	skb_set_owner_r(skb, sk);

	if (skb_peek(&tp->out_of_order_queue) == NULL) {
		/* Initial out of order segment, build 1 SACK. */
		tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
		tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
		__skb_queue_head(&tp->out_of_order_queue,skb);
		/* Walk the ofo queue from the tail to find the insertion
		 * point; the queue is kept sorted by sequence number. */
		for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
			/* Already there. */
			if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
				if (skb->len >= skb1->len) {
					/* New skb is at least as large:
					 * it replaces the queued one. */
					tcp_sack_extend(tp, skb1, skb);
					__skb_append(skb1, skb);
					__skb_unlink(skb1, skb1->list);

				/* A duplicate, smaller than what is in the
				 * out-of-order queue right now, toss it.
				 */

			if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
				__skb_append(skb1, skb);
				tcp_sack_new_ofo_skb(sk, skb);

			/* See if we've hit the start. If so insert. */
			if (skb1 == skb_peek(&tp->out_of_order_queue)) {
				__skb_queue_head(&tp->out_of_order_queue,skb);
				tcp_sack_new_ofo_skb(sk, skb);
/*
 * This routine handles the data. If there is room in the buffer,
 * it will have already been moved into it. If there is no
 * room, then we will just have to discard the packet.
 */
static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Strip the TCP header, leaving only the payload. */
	skb_pull(skb, th->doff*4);
	skb_trim(skb, len - (th->doff*4));

	/* An empty segment without FIN carries nothing to deliver. */
	if (skb->len == 0 && !th->fin)

	/*
	 * If our receive queue has grown past its limits shrink it.
	 * Make sure to do this before moving rcv_nxt, otherwise
	 * data might be acked for that we don't have enough room.
	 */
	if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
		if (prune_queue(sk) < 0) {
			/* Still not enough room. That can happen when
			 * skb->true_size differs significantly from skb->len.
			 */

	tcp_data_queue(sk, skb);

	/* Sanity check: rcv_nxt must never fall behind copied_seq. */
	if (before(tp->rcv_nxt, tp->copied_seq)) {
		printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
		tp->rcv_nxt = tp->copied_seq;
/* When incoming ACK allowed to free some skb from write_queue,
 * we remember this in flag tp->sorry and wake up socket on the exit
 * from tcp input handler. Probably, handler has already eaten this space
 * sending ACK and cloned frames from tcp_write_xmit().
 */
static __inline__ void tcp_new_space(struct sock *sk)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct socket *sock;

	/* Only wake writers when a useful amount of space is available. */
	if (sock_wspace(sk) >= tcp_min_write_space(sk) &&
	    (sock = sk->socket) != NULL) {
		clear_bit(SOCK_NOSPACE, &sock->flags);

		if (sk->sleep && waitqueue_active(sk->sleep))
			wake_up_interruptible(sk->sleep);

		if (sock->fasync_list)
			sock_wake_async(sock, 2, POLL_OUT);
/* __tcp_data_snd_check - decide whether the queued skb can be sent now;
 * when it is blocked (beyond the receiver's window, or the congestion
 * window is full), arm the probe timer instead.
 */
static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Blocked by the send window or the congestion window? */
	if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
	    tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
		tcp_check_probe_timer(sk, tp);
/* tcp_data_snd_check - wrapper running the send check on the head of the
 * write queue. NOTE(review): the NULL guard on skb is not visible in
 * this extract.
 */
static __inline__ void tcp_data_snd_check(struct sock *sk)
	struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;

		__tcp_data_snd_check(sk, skb);
/*
 * Check if sending an ack is needed.
 */
static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* This also takes care of updating the window.
	 * This if statement needs to be simplified.
	 *
	 * Rules for delaying an ack:
	 * - delay time <= 0.5 HZ
	 * - we don't have a window update to send
	 * - must send at least every 2 full sized packets
	 * - must send an ACK if we have any out of order data
	 *
	 * With an extra heuristic to handle loss of packet
	 * situations and also helping the sender leave slow
	 * start in an expedient manner.
	 */

	/* More than one full frame received or... */
	if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
#ifdef TCP_MORE_COARSE_ACKS
	     /* Avoid to send immediate ACK from input path, if it
	      * does not advance window far enough. tcp_recvmsg() will do this.
	      */
	     && (!sysctl_tcp_retrans_collapse || __tcp_select_window(sk) >= tp->rcv_wnd)
	    /* We ACK each frame or... */
	    tcp_in_quickack_mode(tp) ||
	    /* We have out of order data or */
	    skb_peek(&tp->out_of_order_queue) != NULL)) {
		/* Then ack it now */
	/* Else, send delayed ack. */
	tcp_send_delayed_ack(sk);
/* tcp_ack_snd_check - run the ACK decision unless no ACK is pending
 * (a data segment we sent has already carried it).
 */
static __inline__ void tcp_ack_snd_check(struct sock *sk)
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	if (tp->ack.pending == 0) {
		/* We sent a data segment already. */

	/* Out-of-order data is possible here. */
	__tcp_ack_snd_check(sk, 1);
2179 * This routine is only called when we have urgent data
2180 * signalled. Its the 'slow' part of tcp_urg. It could be
2181 * moved inline now as tcp_urg is only called from one
2182 * place. We handle URGent data wrong. We have to - as
2183 * BSD still doesn't use the correction from RFC961.
2184 * For 1003.1g we should support a new option TCP_STDURG to permit
2185 * either form (or just set the sysctl tcp_stdurg).
2188 static void tcp_check_urg(struct sock
* sk
, struct tcphdr
* th
)
2190 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2191 u32 ptr
= ntohs(th
->urg_ptr
);
2193 if (ptr
&& !sysctl_tcp_stdurg
)
2195 ptr
+= ntohl(th
->seq
);
2197 /* Ignore urgent data that we've already seen and read. */
2198 if (after(tp
->copied_seq
, ptr
))
2201 /* Do we already have a newer (or duplicate) urgent pointer? */
2202 if (tp
->urg_data
&& !after(ptr
, tp
->urg_seq
))
2205 /* Tell the world about our new urgent pointer. */
2206 if (sk
->proc
!= 0) {
2208 kill_proc(sk
->proc
, SIGURG
, 1);
2210 kill_pg(-sk
->proc
, SIGURG
, 1);
2211 sk_wake_async(sk
, 3, POLL_PRI
);
2214 /* We may be adding urgent data when the last byte read was
2215 * urgent. To do this requires some care. We cannot just ignore
2216 * tp->copied_seq since we would read the last urgent byte again
2217 * as data, nor can we alter copied_seq until this data arrives
2218 * or we break the sematics of SIOCATMARK (and thus sockatmark())
2220 if (tp
->urg_seq
== tp
->copied_seq
)
2221 tp
->copied_seq
++; /* Move the copied sequence on correctly */
2222 tp
->urg_data
= TCP_URG_NOTYET
;
2225 /* Disable header prediction. */
2229 /* This is the 'fast' part of urgent handling. */
2230 static inline void tcp_urg(struct sock
*sk
, struct tcphdr
*th
, unsigned long len
)
2232 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2234 /* Check if we get a new urgent pointer - normally not. */
2236 tcp_check_urg(sk
,th
);
2238 /* Do we wait for any urgent data? - normally not... */
2239 if (tp
->urg_data
== TCP_URG_NOTYET
) {
2240 u32 ptr
= tp
->urg_seq
- ntohl(th
->seq
) + (th
->doff
*4);
2242 /* Is the urgent pointer pointing into this packet? */
2244 tp
->urg_data
= TCP_URG_VALID
| *(ptr
+ (unsigned char *) th
);
2246 sk
->data_ready(sk
,0);
2251 /* Clean the out_of_order queue if we can, trying to get
2252 * the socket within its memory limits again.
2254 * Return less than zero if we should start dropping frames
2255 * until the socket owning process reads some of the data
2256 * to stabilize the situation.
2258 static int prune_queue(struct sock
*sk
)
2260 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
2261 struct sk_buff
*skb
;
2264 SOCK_DEBUG(sk
, "prune_queue: c=%x\n", tp
->copied_seq
);
2266 NET_INC_STATS_BH(PruneCalled
);
2268 /* First, purge the out_of_order queue. */
2269 skb
= __skb_dequeue_tail(&tp
->out_of_order_queue
);
2274 net_statistics
[smp_processor_id()*2].OfoPruned
+= skb
->len
;
2276 skb
= __skb_dequeue_tail(&tp
->out_of_order_queue
);
2277 } while(skb
!= NULL
);
2279 /* Reset SACK state. A conforming SACK implementation will
2280 * do the same at a timeout based retransmit. When a connection
2281 * is in a sad state like this, we care only about integrity
2282 * of the connection not performance.
2288 /* If we are really being abused, tell the caller to silently
2289 * drop receive data on the floor. It will get retransmitted
2290 * and hopefully then we'll have sufficient space.
2292 * We used to try to purge the in-order packets too, but that
2293 * turns out to be deadly and fraught with races. Consider:
2295 * 1) If we acked the data, we absolutely cannot drop the
2296 * packet. This data would then never be retransmitted.
2297 * 2) It is possible, with a proper sequence of events involving
2298 * delayed acks and backlog queue handling, to have the user
2299 * read the data before it gets acked. The previous code
2300 * here got this wrong, and it lead to data corruption.
2301 * 3) Too much state changes happen when the FIN arrives, so once
2302 * we've seen that we can't remove any in-order data safely.
2304 * The net result is that removing in-order receive data is too
2305 * complex for anyones sanity. So we don't do it anymore. But
2306 * if we are really having our buffer space abused we stop accepting
2309 * 8) The arguments are interesting, but I even cannot imagine
2310 * what kind of arguments could force us to drop NICE, ALREADY
2311 * RECEIVED DATA only to get one more packet? --ANK
2313 * FIXME: it should recompute SACK state and only remove enough
2314 * buffers to get into bounds again. The current scheme loses
2315 * badly sometimes on links with large RTT, especially when
2316 * the driver has high overhead per skb.
2317 * (increasing the rcvbuf is not enough because it inflates the
2318 * the window too, disabling flow control effectively) -AK
2320 * Mmm... Why not to scale it seprately then? Just replace
2321 * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale
2322 * and adjust it dynamically, when TCP window flow control
2328 if(atomic_read(&sk
->rmem_alloc
) < (sk
->rcvbuf
<< 1))
2331 NET_INC_STATS_BH(RcvPruned
);
2333 /* Massive buffer overcommit. */
2337 static int tcp_copy_to_iovec(struct sock
*sk
, struct sk_buff
*skb
, int hlen
)
2339 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2340 int chunk
= skb
->len
- hlen
;
2344 if (skb
->ip_summed
==CHECKSUM_UNNECESSARY
)
2345 err
= memcpy_toiovec(tp
->ucopy
.iov
, skb
->h
.raw
+ hlen
, chunk
);
2347 err
= copy_and_csum_toiovec(tp
->ucopy
.iov
, skb
, hlen
);
2351 tp
->ucopy
.len
-= chunk
;
2352 tp
->copied_seq
+= chunk
;
2357 if (err
== -EFAULT
) {
2359 sk
->error_report(sk
);
2367 static int __tcp_checksum_complete_user(struct sock
*sk
, struct sk_buff
*skb
)
2371 if (sk
->lock
.users
) {
2373 result
= __tcp_checksum_complete(skb
);
2376 result
= __tcp_checksum_complete(skb
);
2381 static __inline__
int
2382 tcp_checksum_complete_user(struct sock
*sk
, struct sk_buff
*skb
)
2384 return skb
->ip_summed
!= CHECKSUM_UNNECESSARY
&&
2385 __tcp_checksum_complete_user(sk
, skb
);
2389 * TCP receive function for the ESTABLISHED state.
2391 * It is split into a fast path and a slow path. The fast path is
2393 * - A zero window was announced from us - zero window probing
2394 * is only handled properly in the slow path.
2395 * [ NOTE: actually, it was made incorrectly and nobody ever noticed
2396 * this! Reason is clear: 1. Correct senders do not send
2397 * to zero window. 2. Even if a sender sends to zero window,
2398 * nothing terrible occurs.
2400 * For now I cleaned this and fast path is really always disabled,
2401 * when window is zero, but I would be more happy to remove these
2402 * checks. Code will be only cleaner and _faster_. --ANK
2404 * Later note. I've just found that slow path also accepts
2405 * out of window segments, look at tcp_sequence(). So...
2406 * it is the last argument: I repair all and comment out
2407 * repaired code by TCP_FORMAL_WINDOW.
2408 * [ I remember one rhyme from a chidren's book. (I apologize,
2409 * the trasnlation is not rhymed 8)): people in one (jewish) village
2410 * decided to build sauna, but divided to two parties.
2411 * The first one insisted that battens should not be dubbed,
2412 * another objected that foots will suffer of splinters,
2413 * the first fended that dubbed wet battens are too slippy
2414 * and people will fall and it is much more serious!
2415 * Certaiinly, all they went to rabbi.
2416 * After some thinking, he judged: "Do not be lazy!
2417 * Certainly, dub the battens! But put them by dubbed surface down."
2421 * - Out of order segments arrived.
2422 * - Urgent data is expected.
2423 * - There is no buffer space left
2424 * - Unexpected TCP flags/window values/header lengths are received
2425 * (detected by checking the TCP header against pred_flags)
2426 * - Data is sent in both directions. Fast path only supports pure senders
2427 * or pure receivers (this means either the sequence number or the ack
2428 * value must stay constant)
2429 * - Unexpected TCP option.
2431 * When these conditions are not satisfied it drops into a standard
2432 * receive procedure patterned after RFC793 to handle all cases.
2433 * The first three cases are guaranteed by proper pred_flags setting,
2434 * the rest is checked inline. Fast processing is turned on in
2435 * tcp_data_queue when everything is OK.
2437 int tcp_rcv_established(struct sock
*sk
, struct sk_buff
*skb
,
2438 struct tcphdr
*th
, unsigned len
)
2440 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2443 * Header prediction.
2444 * The code losely follows the one in the famous
2445 * "30 instruction TCP receive" Van Jacobson mail.
2447 * Van's trick is to deposit buffers into socket queue
2448 * on a device interrupt, to call tcp_recv function
2449 * on the receive process context and checksum and copy
2450 * the buffer to user space. smart...
2452 * Our current scheme is not silly either but we take the
2453 * extra cost of the net_bh soft interrupt processing...
2454 * We do checksum and copy also but from device to kernel.
2457 /* RED-PEN. Using static variables to pass function arguments
2458 * cannot be good idea...
2462 /* pred_flags is 0xS?10 << 16 + snd_wnd
2463 * if header_predition is to be made
2464 * 'S' will always be tp->tcp_header_len >> 2
2465 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
2466 * turn it off (when there are holes in the receive
2467 * space for instance)
2468 * PSH flag is ignored.
2471 if ((tcp_flag_word(th
) & ~(TCP_RESERVED_BITS
|TCP_FLAG_PSH
)) == tp
->pred_flags
&&
2472 TCP_SKB_CB(skb
)->seq
== tp
->rcv_nxt
) {
2473 int tcp_header_len
= tp
->tcp_header_len
;
2475 /* Timestamp header prediction: tcp_header_len
2476 * is automatically equal to th->doff*4 due to pred_flags
2480 /* Check timestamp */
2481 if (tcp_header_len
== sizeof(struct tcphdr
) + TCPOLEN_TSTAMP_ALIGNED
) {
2482 __u32
*ptr
= (__u32
*)(th
+ 1);
2484 /* No? Slow path! */
2485 if (*ptr
!= __constant_ntohl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16)
2486 | (TCPOPT_TIMESTAMP
<< 8) | TCPOLEN_TIMESTAMP
))
2491 tp
->rcv_tsval
= ntohl(*ptr
);
2493 tp
->rcv_tsecr
= ntohl(*ptr
);
2495 /* If PAWS failed, check it more carefully in slow path */
2496 if ((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) < 0)
2499 /* Predicted packet is in window by definition.
2500 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
2501 * Hence, check seq<=rcv_wup reduces to:
2503 if (tp
->rcv_nxt
== tp
->rcv_wup
) {
2504 tp
->ts_recent
= tp
->rcv_tsval
;
2505 tp
->ts_recent_stamp
= xtime
.tv_sec
;
2509 if (len
<= tcp_header_len
) {
2510 /* Bulk data transfer: sender */
2511 if (len
== tcp_header_len
) {
2512 /* We know that such packets are checksummed
2515 tcp_ack(sk
, th
, TCP_SKB_CB(skb
)->seq
,
2516 TCP_SKB_CB(skb
)->ack_seq
, len
);
2518 tcp_data_snd_check(sk
);
2522 } else { /* Header too small */
2523 TCP_INC_STATS_BH(TcpInErrs
);
2526 } else if (TCP_SKB_CB(skb
)->ack_seq
== tp
->snd_una
) {
2529 if (tp
->ucopy
.task
== current
&&
2530 tp
->copied_seq
== tp
->rcv_nxt
&&
2531 len
- tcp_header_len
<= tp
->ucopy
.len
&&
2535 NET_INC_STATS_BH(TCPHPHitsToUser
);
2537 __set_current_state(TASK_RUNNING
);
2539 if (tcp_copy_to_iovec(sk
, skb
, tcp_header_len
))
2542 __skb_pull(skb
,tcp_header_len
);
2544 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->end_seq
;
2546 if (tcp_checksum_complete_user(sk
, skb
))
2549 if (atomic_read(&sk
->rmem_alloc
) > sk
->rcvbuf
)
2552 NET_INC_STATS_BH(TCPHPHits
);
2554 /* Bulk data transfer: receiver */
2555 __skb_pull(skb
,tcp_header_len
);
2557 /* DO NOT notify forward progress here.
2558 * It saves dozen of CPU instructions in fast path. --ANK
2559 * And where is it signaled then ? -AK
2562 __skb_queue_tail(&sk
->receive_queue
, skb
);
2563 skb_set_owner_r(skb
, sk
);
2565 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->end_seq
;
2567 /* FIN bit check is not done since if FIN is set in
2568 * this frame, the pred_flags won't match up. -DaveM
2570 sk
->data_ready(sk
, 0);
2573 tcp_event_data_recv(tp
, skb
);
2575 #ifdef TCP_MORE_COARSE_ACKS
2577 if (tcp_in_quickack_mode(tp
)) {
2580 tcp_send_delayed_ack(sk
);
2584 __tcp_ack_snd_check(sk
, 0);
2590 /* Packet is in sequence, flags are trivial;
2591 * only ACK is strange. Jump to step 5.
2593 if (tcp_checksum_complete_user(sk
, skb
))
2599 if (tcp_checksum_complete_user(sk
, skb
))
2603 * RFC1323: H1. Apply PAWS check first.
2605 if (tcp_fast_parse_options(sk
, th
, tp
) && tp
->saw_tstamp
&&
2606 tcp_paws_discard(tp
, skb
)) {
2608 NET_INC_STATS_BH(PAWSEstabRejected
);
2612 /* Resets are accepted even if PAWS failed.
2614 ts_recent update must be made after we are sure
2615 that the packet is in window.
2620 * Standard slow path.
2623 if (!tcp_sequence(tp
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
)) {
2624 /* RFC793, page 37: "In all states except SYN-SENT, all reset
2625 * (RST) segments are validated by checking their SEQ-fields."
2626 * And page 69: "If an incoming segment is not acceptable,
2627 * an acknowledgment should be sent in reply (unless the RST bit
2628 * is set, if so drop the segment and return)".
2632 if (after(TCP_SKB_CB(skb
)->seq
, tp
->rcv_nxt
)) {
2633 SOCK_DEBUG(sk
, "seq:%d end:%d wup:%d wnd:%d\n",
2634 TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
,
2635 tp
->rcv_wup
, tp
->rcv_wnd
);
2637 tcp_enter_quickack_mode(tp
);
2639 NET_INC_STATS_BH(DelayedACKLost
);
2648 if (tp
->saw_tstamp
) {
2649 tcp_replace_ts_recent(sk
, tp
,
2650 TCP_SKB_CB(skb
)->seq
);
2653 if(th
->syn
&& TCP_SKB_CB(skb
)->seq
!= tp
->syn_seq
) {
2654 SOCK_DEBUG(sk
, "syn in established state\n");
2655 TCP_INC_STATS_BH(TcpInErrs
);
2662 tcp_ack(sk
, th
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->ack_seq
, len
);
2664 /* Process urgent data. */
2665 tcp_urg(sk
, th
, len
);
2667 /* step 7: process the segment text */
2668 tcp_data(skb
, sk
, len
);
2670 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
2671 if(sk
->state
!= TCP_CLOSE
) {
2672 tcp_data_snd_check(sk
);
2673 tcp_ack_snd_check(sk
);
2681 TCP_INC_STATS_BH(TcpInErrs
);
2689 /* This is not only more efficient than what we used to do, it eliminates
2690 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
2692 * Actually, we could lots of memory writes here. tp of listening
2693 * socket contains all necessary default parameters.
2695 struct sock
*tcp_create_openreq_child(struct sock
*sk
, struct open_request
*req
, struct sk_buff
*skb
)
2697 struct sock
*newsk
= sk_alloc(PF_INET
, GFP_ATOMIC
, 0);
2700 struct tcp_opt
*newtp
;
2701 #ifdef CONFIG_FILTER
2702 struct sk_filter
*filter
;
2705 memcpy(newsk
, sk
, sizeof(*newsk
));
2706 newsk
->state
= TCP_SYN_RECV
;
2709 newsk
->pprev
= NULL
;
2712 /* Clone the TCP header template */
2713 newsk
->dport
= req
->rmt_port
;
2715 sock_lock_init(newsk
);
2716 bh_lock_sock(newsk
);
2718 atomic_set(&newsk
->rmem_alloc
, 0);
2719 skb_queue_head_init(&newsk
->receive_queue
);
2720 atomic_set(&newsk
->wmem_alloc
, 0);
2721 skb_queue_head_init(&newsk
->write_queue
);
2722 atomic_set(&newsk
->omem_alloc
, 0);
2726 newsk
->backlog
.head
= newsk
->backlog
.tail
= NULL
;
2727 skb_queue_head_init(&newsk
->error_queue
);
2728 newsk
->write_space
= tcp_write_space
;
2729 #ifdef CONFIG_FILTER
2730 if ((filter
= newsk
->filter
) != NULL
)
2731 sk_filter_charge(newsk
, filter
);
2734 /* Now setup tcp_opt */
2735 newtp
= &(newsk
->tp_pinfo
.af_tcp
);
2736 newtp
->pred_flags
= 0;
2737 newtp
->rcv_nxt
= req
->rcv_isn
+ 1;
2738 newtp
->snd_nxt
= req
->snt_isn
+ 1;
2739 newtp
->snd_una
= req
->snt_isn
+ 1;
2740 newtp
->snd_sml
= req
->snt_isn
+ 1;
2742 tcp_delack_init(newtp
);
2743 if (skb
->len
>= 536)
2744 newtp
->ack
.last_seg_size
= skb
->len
;
2746 tcp_prequeue_init(newtp
);
2748 newtp
->snd_wl1
= req
->rcv_isn
;
2749 newtp
->snd_wl2
= req
->snt_isn
;
2751 newtp
->retransmits
= 0;
2754 newtp
->mdev
= TCP_TIMEOUT_INIT
;
2755 newtp
->rto
= TCP_TIMEOUT_INIT
;
2757 newtp
->packets_out
= 0;
2758 newtp
->fackets_out
= 0;
2759 newtp
->retrans_out
= 0;
2760 newtp
->snd_ssthresh
= 0x7fffffff;
2762 /* So many TCP implementations out there (incorrectly) count the
2763 * initial SYN frame in their delayed-ACK and congestion control
2764 * algorithms that we must have the following bandaid to talk
2765 * efficiently to them. -DaveM
2767 newtp
->snd_cwnd
= 2;
2768 newtp
->snd_cwnd_cnt
= 0;
2769 newtp
->high_seq
= 0;
2771 newtp
->dup_acks
= 0;
2772 tcp_init_xmit_timers(newsk
);
2773 skb_queue_head_init(&newtp
->out_of_order_queue
);
2774 newtp
->send_head
= newtp
->retrans_head
= NULL
;
2775 newtp
->rcv_wup
= req
->rcv_isn
+ 1;
2776 newtp
->write_seq
= req
->snt_isn
+ 1;
2777 newtp
->copied_seq
= req
->rcv_isn
+ 1;
2779 newtp
->saw_tstamp
= 0;
2781 newtp
->probes_out
= 0;
2782 newtp
->num_sacks
= 0;
2783 newtp
->syn_seq
= req
->rcv_isn
;
2784 newtp
->fin_seq
= req
->rcv_isn
;
2785 newtp
->urg_data
= 0;
2786 newtp
->listen_opt
= NULL
;
2787 newtp
->accept_queue
= newtp
->accept_queue_tail
= NULL
;
2788 /* Deinitialize syn_wait_lock to trap illegal accesses. */
2789 memset(&newtp
->syn_wait_lock
, 0, sizeof(newtp
->syn_wait_lock
));
2791 /* Back to base struct sock members. */
2793 newsk
->priority
= 0;
2794 atomic_set(&newsk
->refcnt
, 1);
2795 #ifdef INET_REFCNT_DEBUG
2796 atomic_inc(&inet_sock_nr
);
2799 if (newsk
->keepopen
)
2800 tcp_reset_keepalive_timer(newsk
, keepalive_time_when(newtp
));
2801 newsk
->socket
= NULL
;
2802 newsk
->sleep
= NULL
;
2804 newtp
->tstamp_ok
= req
->tstamp_ok
;
2805 if((newtp
->sack_ok
= req
->sack_ok
) != 0)
2806 newtp
->num_sacks
= 0;
2807 newtp
->window_clamp
= req
->window_clamp
;
2808 newtp
->rcv_wnd
= req
->rcv_wnd
;
2809 newtp
->wscale_ok
= req
->wscale_ok
;
2810 if (newtp
->wscale_ok
) {
2811 newtp
->snd_wscale
= req
->snd_wscale
;
2812 newtp
->rcv_wscale
= req
->rcv_wscale
;
2814 newtp
->snd_wscale
= newtp
->rcv_wscale
= 0;
2815 newtp
->window_clamp
= min(newtp
->window_clamp
,65535);
2817 newtp
->snd_wnd
= ntohs(skb
->h
.th
->window
) << newtp
->snd_wscale
;
2818 newtp
->max_window
= newtp
->snd_wnd
;
2820 if (newtp
->tstamp_ok
) {
2821 newtp
->ts_recent
= req
->ts_recent
;
2822 newtp
->ts_recent_stamp
= xtime
.tv_sec
;
2823 newtp
->tcp_header_len
= sizeof(struct tcphdr
) + TCPOLEN_TSTAMP_ALIGNED
;
2825 newtp
->ts_recent_stamp
= 0;
2826 newtp
->tcp_header_len
= sizeof(struct tcphdr
);
2828 newtp
->mss_clamp
= req
->mss
;
2834 * Process an incoming packet for SYN_RECV sockets represented
2835 * as an open_request.
2838 struct sock
*tcp_check_req(struct sock
*sk
,struct sk_buff
*skb
,
2839 struct open_request
*req
,
2840 struct open_request
**prev
)
2842 struct tcphdr
*th
= skb
->h
.th
;
2843 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
2844 u32 flg
= tcp_flag_word(th
) & (TCP_FLAG_RST
|TCP_FLAG_SYN
|TCP_FLAG_ACK
);
2845 int paws_reject
= 0;
2850 if (th
->doff
> (sizeof(struct tcphdr
)>>2)) {
2851 tcp_parse_options(NULL
, th
, &ttp
, 0);
2853 if (ttp
.saw_tstamp
) {
2854 ttp
.ts_recent
= req
->ts_recent
;
2855 /* We do not store true stamp, but it is not required,
2856 * it can be estimated (approximately)
2857 * from another data.
2859 ttp
.ts_recent_stamp
= xtime
.tv_sec
- ((TCP_TIMEOUT_INIT
/HZ
)<<req
->retrans
);
2860 paws_reject
= tcp_paws_check(&ttp
, th
->rst
);
2864 /* Check for pure retransmited SYN. */
2865 if (TCP_SKB_CB(skb
)->seq
== req
->rcv_isn
&&
2866 flg
== TCP_FLAG_SYN
&&
2869 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
2870 * this case on figure 6 and figure 8, but formal
2871 * protocol description says NOTHING.
2872 * To be more exact, it says that we should send ACK,
2873 * because this segment (at least, if it has no data)
2876 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
2877 * describe SYN-RECV state. All the description
2878 * is wrong, we cannot believe to it and should
2879 * rely only on common sense and implementation
2882 * Enforce "SYN-ACK" according to figure 8, figure 6
2883 * of RFC793, fixed by RFC1122.
2885 req
->class->rtx_syn_ack(sk
, req
, NULL
);
2889 /* Further reproduces section "SEGMENT ARRIVES"
2890 for state SYN-RECEIVED of RFC793.
2891 It is broken, however, it does not work only
2892 when SYNs are crossed, which is impossible in our
2895 But generally, we should (RFC lies!) to accept ACK
2896 from SYNACK both here and in tcp_rcv_state_process().
2897 tcp_rcv_state_process() does not, hence, we do not too.
2899 Note that the case is absolutely generic:
2900 we cannot optimize anything here without
2901 violating protocol. All the checks must be made
2902 before attempt to create socket.
2905 /* RFC793: "first check sequence number". */
2907 if (paws_reject
|| !tcp_in_window(TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
,
2908 req
->rcv_isn
+1, req
->rcv_isn
+1+req
->rcv_wnd
)) {
2909 /* Out of window: send ACK and drop. */
2910 if (!(flg
& TCP_FLAG_RST
))
2911 req
->class->send_ack(skb
, req
);
2913 NET_INC_STATS_BH(PAWSEstabRejected
);
2917 /* In sequence, PAWS is OK. */
2919 if (ttp
.saw_tstamp
&& !after(TCP_SKB_CB(skb
)->seq
, req
->rcv_isn
+1))
2920 req
->ts_recent
= ttp
.rcv_tsval
;
2922 if (TCP_SKB_CB(skb
)->seq
== req
->rcv_isn
) {
2923 /* Truncate SYN, it is out of window starting
2924 at req->rcv_isn+1. */
2925 flg
&= ~TCP_FLAG_SYN
;
2928 /* RFC793: "second check the RST bit" and
2929 * "fourth, check the SYN bit"
2931 if (flg
& (TCP_FLAG_RST
|TCP_FLAG_SYN
))
2932 goto embryonic_reset
;
2934 /* RFC793: "fifth check the ACK field" */
2936 if (!(flg
& TCP_FLAG_ACK
))
2939 /* Invalid ACK: reset will be sent by listening socket */
2940 if (TCP_SKB_CB(skb
)->ack_seq
!= req
->snt_isn
+1)
2942 /* Also, it would be not so bad idea to check rcv_tsecr, which
2943 * is essentially ACK extension and too early or too late values
2944 * should cause reset in unsynchronized states.
2947 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
2948 if (tp
->defer_accept
&& TCP_SKB_CB(skb
)->end_seq
== req
->rcv_isn
+1) {
2953 /* OK, ACK is valid, create big socket and
2954 * feed this segment to it. It will repeat all
2955 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
2956 * ESTABLISHED STATE. If it will be dropped after
2957 * socket is created, wait for troubles.
2959 child
= tp
->af_specific
->syn_recv_sock(sk
, skb
, req
, NULL
);
2961 goto listen_overflow
;
2963 tcp_synq_unlink(tp
, req
, prev
);
2964 tcp_synq_removed(sk
, req
);
2966 tcp_acceptq_queue(sk
, req
, child
);
2970 if (!sysctl_tcp_abort_on_overflow
) {
2976 NET_INC_STATS_BH(EmbryonicRsts
);
2977 if (!(flg
& TCP_FLAG_RST
))
2978 req
->class->send_reset(skb
);
2980 tcp_synq_drop(sk
, req
, prev
);
2985 * Queue segment on the new socket if the new socket is active,
2986 * otherwise we just shortcircuit this and continue with
2990 int tcp_child_process(struct sock
*parent
, struct sock
*child
,
2991 struct sk_buff
*skb
)
2994 int state
= child
->state
;
2996 if (child
->lock
.users
== 0) {
2997 ret
= tcp_rcv_state_process(child
, skb
, skb
->h
.th
, skb
->len
);
2999 /* Wakeup parent, send SIGIO */
3000 if (state
== TCP_SYN_RECV
&& child
->state
!= state
)
3001 parent
->data_ready(parent
, 0);
3003 /* Alas, it is possible again, because we do lookup
3004 * in main socket hash table and lock on listening
3005 * socket does not protect us more.
3007 sk_add_backlog(child
, skb
);
3010 bh_unlock_sock(child
);
3014 static int tcp_rcv_synsent_state_process(struct sock
*sk
, struct sk_buff
*skb
,
3015 struct tcphdr
*th
, unsigned len
)
3017 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
3019 tcp_parse_options(sk
, th
, tp
, 0);
3023 * "If the state is SYN-SENT then
3024 * first check the ACK bit
3025 * If the ACK bit is set
3026 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
3027 * a reset (unless the RST bit is set, if so drop
3028 * the segment and return)"
3030 * I cite this place to emphasize one essential
3031 * detail, this check is different of one
3032 * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
3033 * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
3034 * because we have no previous data sent before SYN.
3037 * We do not send data with SYN, so that RFC-correct
3040 if (TCP_SKB_CB(skb
)->ack_seq
!= tp
->snd_nxt
)
3043 /* Check not from any RFC, but it is evident consequence
3044 * of combining PAWS and usual SYN-SENT logic: ACK _is_
3045 * checked in SYN-SENT unlike another states, hence
3046 * echoed tstamp must be checked too.
3048 if (tp
->saw_tstamp
) {
3049 if (tp
->rcv_tsecr
== 0) {
3050 /* Workaround for bug in linux-2.1 and early
3051 * 2.2 kernels. Let's pretend that we did not
3052 * see such timestamp to avoid bogus rtt value,
3053 * calculated by tcp_ack().
3057 /* But do not forget to store peer's timestamp! */
3059 tp
->ts_recent
= tp
->rcv_tsval
;
3060 tp
->ts_recent_stamp
= xtime
.tv_sec
;
3062 } else if ((__s32
)(tp
->rcv_tsecr
- tcp_time_stamp
) > 0 ||
3063 (__s32
)(tp
->rcv_tsecr
- tp
->syn_stamp
) < 0) {
3064 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG
"TCP: synsent reject.\n"));
3065 NET_INC_STATS_BH(PAWSActiveRejected
);
3070 /* Now ACK is acceptable.
3072 * "If the RST bit is set
3073 * If the ACK was acceptable then signal the user "error:
3074 * connection reset", drop the segment, enter CLOSED state,
3075 * delete TCB, and return."
3084 * "fifth, if neither of the SYN or RST bits is set then
3085 * drop the segment and return."
3094 * "If the SYN bit is on ...
3095 * are acceptable then ...
3096 * (our SYN has been ACKed), change the connection
3097 * state to ESTABLISHED..."
3099 * Do you see? SYN-less ACKs in SYN-SENT state are
3100 * completely ignored.
3102 * The bug causing stalled SYN-SENT sockets
3103 * was here: tcp_ack advanced snd_una and canceled
3104 * retransmit timer, so that bare ACK received
3105 * in SYN-SENT state (even with invalid ack==ISS,
3106 * because tcp_ack check is too weak for SYN-SENT)
3107 * causes moving socket to invalid semi-SYN-SENT,
3108 * semi-ESTABLISHED state and connection hangs.
3111 * Bare ACK is valid, however.
3112 * Actually, RFC793 requires to send such ACK
3113 * in reply to any out of window packet.
3114 * It is wrong, but Linux also send such
3115 * useless ACKs sometimes.
3119 tp
->snd_wl1
= TCP_SKB_CB(skb
)->seq
;
3120 tcp_ack(sk
,th
, TCP_SKB_CB(skb
)->seq
,
3121 TCP_SKB_CB(skb
)->ack_seq
, len
);
3123 /* Ok.. it's good. Set up sequence numbers and
3124 * move to established.
3126 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->seq
+1;
3127 tp
->rcv_wup
= TCP_SKB_CB(skb
)->seq
+1;
3129 /* RFC1323: The window in SYN & SYN/ACK segments is
3132 tp
->snd_wnd
= ntohs(th
->window
);
3133 tp
->snd_wl1
= TCP_SKB_CB(skb
)->seq
;
3134 tp
->snd_wl2
= TCP_SKB_CB(skb
)->ack_seq
;
3135 tp
->fin_seq
= TCP_SKB_CB(skb
)->seq
;
3137 tcp_set_state(sk
, TCP_ESTABLISHED
);
3139 if (tp
->wscale_ok
== 0) {
3140 tp
->snd_wscale
= tp
->rcv_wscale
= 0;
3141 tp
->window_clamp
= min(tp
->window_clamp
,65535);
3144 if (tp
->tstamp_ok
) {
3145 tp
->tcp_header_len
=
3146 sizeof(struct tcphdr
) + TCPOLEN_TSTAMP_ALIGNED
;
3148 tp
->tcp_header_len
= sizeof(struct tcphdr
);
3149 if (tp
->saw_tstamp
) {
3150 tp
->ts_recent
= tp
->rcv_tsval
;
3151 tp
->ts_recent_stamp
= xtime
.tv_sec
;
3153 tcp_sync_mss(sk
, tp
->pmtu_cookie
);
3154 tcp_initialize_rcv_mss(sk
);
3155 tcp_init_metrics(sk
);
3156 tcp_init_buffer_space(sk
);
3159 tcp_reset_keepalive_timer(sk
, keepalive_time_when(tp
));
3161 tp
->copied_seq
= tp
->rcv_nxt
;
3162 __tcp_fast_path_on(tp
, tp
->snd_wnd
);
3165 sk
->state_change(sk
);
3166 sk_wake_async(sk
, 0, POLL_OUT
);
3169 if (tp
->write_pending
) {
3170 /* Save one ACK. Data will be ready after
3171 * several ticks, if write_pending is set.
3173 * It may be deleted, but with this feature tcpdumps
3174 * look so _wonderfully_ clever, that I was not able
3175 * to stand against the temptation 8) --ANK
3177 tp
->ack
.pending
= 1;
3178 tp
->ack
.lrcvtime
= tcp_time_stamp
;
3179 tcp_enter_quickack_mode(tp
);
3180 tp
->ack
.ato
= TCP_ATO_MIN
;
3181 tcp_reset_xmit_timer(sk
, TCP_TIME_DACK
, TCP_DELACK_MIN
);
3189 /* No ACK in the segment */
3193 * "If the RST bit is set
3195 * Otherwise (no ACK) drop the segment and return."
3202 if (tp
->ts_recent_stamp
&& tp
->saw_tstamp
&& tcp_paws_check(tp
, 0))
3206 /* We see SYN without ACK. It is attempt of
3207 * simultaneous connect with crossed SYNs.
3209 * The previous version of the code
3210 * checked for "connecting to self"
3211 * here. that check is done now in
3214 * RED-PEN: BTW, it does not. 8)
3216 tcp_set_state(sk
, TCP_SYN_RECV
);
3217 if (tp
->saw_tstamp
) {
3218 tp
->ts_recent
= tp
->rcv_tsval
;
3219 tp
->ts_recent_stamp
= xtime
.tv_sec
;
3222 tp
->rcv_nxt
= TCP_SKB_CB(skb
)->seq
+ 1;
3223 tp
->rcv_wup
= TCP_SKB_CB(skb
)->seq
+ 1;
3225 /* RFC1323: The window in SYN & SYN/ACK segments is
3228 tp
->snd_wnd
= ntohs(th
->window
);
3229 tp
->snd_wl1
= TCP_SKB_CB(skb
)->seq
;
3230 tp
->max_window
= tp
->snd_wnd
;
3232 tcp_sync_mss(sk
, tp
->pmtu_cookie
);
3233 tcp_initialize_rcv_mss(sk
);
3235 tcp_send_synack(sk
);
3237 /* Note, we could accept data and URG from this segment.
3238 * There are no obstacles to make this.
3240 * However, if we ignore data in ACKless segments sometimes,
3241 * we have no reasons to accept it sometimes.
3242 * Also, seems the code doing it in step6 of tcp_rcv_state_process
3243 * is not flawless. So, discard packet for sanity.
3244 * Uncomment this return to process the data.
3249 /* "fifth, if neither of the SYN or RST bits is set then
3250 * drop the segment and return."
3260 * This function implements the receiving procedure of RFC 793 for
3261 * all states except ESTABLISHED and TIME_WAIT.
3262 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
3263 * address independent.
3266 int tcp_rcv_state_process(struct sock
*sk
, struct sk_buff
*skb
,
3267 struct tcphdr
*th
, unsigned len
)
3269 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
3274 switch (sk
->state
) {
3276 /* When state == CLOSED, hash lookup always fails.
3278 * But, there is a back door, the backlog queue.
3279 * If we have a sequence of packets in the backlog
3280 * during __release_sock() which have a sequence such
3282 * packet X causes entry to TCP_CLOSE state
3284 * packet X + N has FIN bit set
3286 * We report a (luckily) harmless error in this case.
3287 * The issue is that backlog queue processing bypasses
3288 * any hash lookups (we know which socket packets are for).
3289 * The correct behavior here is what 2.0.x did, since
3290 * a TCP_CLOSE socket does not exist. Drop the frame
3291 * and send a RST back to the other end.
3294 /* 1. The socket may be moved to TIME-WAIT state.
3295 2. While this socket was locked, another socket
3296 with the same identity could be created.
3299 CONCLUSION: discard and only discard!
3301 Alternative would be relookup and recurse into tcp_v?_rcv
3302 (not *_do_rcv) to work with timewait and listen states
3312 if(tp
->af_specific
->conn_request(sk
, skb
) < 0)
3315 /* Now we have several options: In theory there is
3316 * nothing else in the frame. KA9Q has an option to
3317 * send data with the syn, BSD accepts data with the
3318 * syn up to the [to be] advertised window and
3319 * Solaris 2.1 gives you a protocol error. For now
3320 * we just ignore it, that fits the spec precisely
3321 * and avoids incompatibilities. It would be nice in
3322 * future to drop through and process the data.
3324 * Now that TTCP is starting to be used we ought to
3326 * But, this leaves one open to an easy denial of
3327 * service attack, and SYN cookies can't defend
3328 * against this problem. So, we drop the data
3329 * in the interest of security over speed.
3336 queued
= tcp_rcv_synsent_state_process(sk
, skb
, th
, len
);
3343 /* Parse the tcp_options present on this header.
3344 * By this point we really only expect timestamps.
3345 * Note that this really has to be here and not later for PAWS
3346 * (RFC1323) to work.
3348 if (tcp_fast_parse_options(sk
, th
, tp
) && tp
->saw_tstamp
&&
3349 tcp_paws_discard(tp
, skb
)) {
3354 /* Reset is accepted even if it did not pass PAWS. */
3357 /* The silly FIN test here is necessary to see an advancing ACK in
3358 * retransmitted FIN frames properly. Consider the following sequence:
3360 * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
3361 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
3362 * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
3363 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
3365 * At this point the connection will deadlock with host1 believing
3366 * that his FIN is never ACK'd, and thus it will retransmit its FIN
3367 * forever. The following fix is from Taral (taral@taral.net).
3369 * RED-PEN. Seems, the above is not true.
3370 * If at least one end is RFC compliant, it will send ACK to
3371 * out of window FIN and, hence, move peer to TIME-WAIT.
3372 * I comment out this line. --ANK
3374 * RED-PEN. DANGER! tcp_sequence check rejects also SYN-ACKs
3375 * received in SYN-RECV. The problem is that description of
3376 * segment processing in SYN-RECV state in RFC793 is WRONG.
3377 * Correct check would accept ACK from this SYN-ACK, see
3378 * figures 6 and 8 (fixed by RFC1122). Compare this
3379 * to problem with FIN, they smell similarly. --ANK
3382 /* step 1: check sequence number */
3383 if (!tcp_sequence(tp
, TCP_SKB_CB(skb
)->seq
, TCP_SKB_CB(skb
)->end_seq
)
3385 && !(th
->fin
&& TCP_SKB_CB(skb
)->end_seq
== tp
->rcv_nxt
)
3389 NET_INC_STATS_BH(DelayedACKLost
);
3390 tcp_enter_quickack_mode(tp
);
3396 /* step 2: check RST bit */
3402 if (tp
->saw_tstamp
) {
3403 tcp_replace_ts_recent(sk
, tp
,
3404 TCP_SKB_CB(skb
)->seq
);
3407 /* step 3: check security and precedence [ignored] */
3411 * Check for a SYN, and ensure it matches the SYN we were
3412 * first sent. We have to handle the rather unusual (but valid)
3413 * sequence that KA9Q derived products may generate of
3418 * SYN|ACK Data + More Data
3419 * .. we must ACK not RST...
3421 * We keep syn_seq as the sequence space occupied by the
3425 if (th
->syn
&& TCP_SKB_CB(skb
)->seq
!= tp
->syn_seq
) {
3430 /* step 5: check the ACK field */
3432 int acceptable
= tcp_ack(sk
, th
, TCP_SKB_CB(skb
)->seq
,
3433 TCP_SKB_CB(skb
)->ack_seq
, len
);
3438 tcp_set_state(sk
, TCP_ESTABLISHED
);
3439 tp
->copied_seq
= tp
->rcv_nxt
;
3441 /* Note, that this wakeup is only for marginal
3442 * crossed SYN case. Passively open sockets
3443 * are not waked up, because sk->sleep == NULL
3444 * and sk->socket == NULL.
3447 sk
->state_change(sk
);
3448 sk_wake_async(sk
,0,POLL_OUT
);
3451 tp
->snd_una
= TCP_SKB_CB(skb
)->ack_seq
;
3452 tp
->snd_wnd
= ntohs(th
->window
) << tp
->snd_wscale
;
3453 tp
->snd_wl1
= TCP_SKB_CB(skb
)->seq
;
3454 tp
->snd_wl2
= TCP_SKB_CB(skb
)->ack_seq
;
3456 /* tcp_ack considers this ACK as duplicate
3457 * and does not calculate rtt.
3458 * Fix it at least with timestamps.
3460 if (tp
->saw_tstamp
&& !tp
->srtt
)
3461 tcp_ack_saw_tstamp(sk
, tp
, 0, 0, FLAG_SYN_ACKED
);
3463 tcp_init_metrics(sk
);
3464 tcp_fast_path_on(tp
);
3466 SOCK_DEBUG(sk
, "bad ack\n");
3472 if (tp
->snd_una
== tp
->write_seq
) {
3473 tcp_set_state(sk
, TCP_FIN_WAIT2
);
3474 sk
->shutdown
|= SEND_SHUTDOWN
;
3475 dst_confirm(sk
->dst_cache
);
3478 /* Wake up lingering close() */
3479 sk
->state_change(sk
);
3483 if (tp
->linger2
< 0 ||
3484 (TCP_SKB_CB(skb
)->end_seq
!= TCP_SKB_CB(skb
)->seq
&&
3485 after(TCP_SKB_CB(skb
)->end_seq
- th
->fin
, tp
->rcv_nxt
))) {
3490 tmo
= tcp_fin_time(tp
);
3491 if (tmo
> TCP_TIMEWAIT_LEN
) {
3492 tcp_reset_keepalive_timer(sk
, tmo
- TCP_TIMEWAIT_LEN
);
3493 } else if (th
->fin
|| sk
->lock
.users
) {
3494 /* Bad case. We could lose such FIN otherwise.
3495 * It is not a big problem, but it looks confusing
3496 * and not so rare event. We still can lose it now,
3497 * if it spins in bh_lock_sock(), but it is really
3500 tcp_reset_keepalive_timer(sk
, tmo
);
3502 tcp_time_wait(sk
, TCP_FIN_WAIT2
, tmo
);
3510 if (tp
->snd_una
== tp
->write_seq
) {
3511 tcp_time_wait(sk
, TCP_TIME_WAIT
, 0);
3517 if (tp
->snd_una
== tp
->write_seq
) {
3518 tcp_update_metrics(sk
);
3528 /* step 6: check the URG bit */
3529 tcp_urg(sk
, th
, len
);
3531 /* step 7: process the segment text */
3532 switch (sk
->state
) {
3533 case TCP_CLOSE_WAIT
:
3535 if (!before(TCP_SKB_CB(skb
)->seq
, tp
->fin_seq
))
3539 /* RFC 793 says to queue data in these states,
3540 * RFC 1122 says we MUST send a reset.
3541 * BSD 4.4 also does reset.
3543 if (sk
->shutdown
& RCV_SHUTDOWN
) {
3544 if (TCP_SKB_CB(skb
)->end_seq
!= TCP_SKB_CB(skb
)->seq
&&
3545 after(TCP_SKB_CB(skb
)->end_seq
- th
->fin
, tp
->rcv_nxt
)) {
3551 case TCP_ESTABLISHED
:
3552 tcp_data(skb
, sk
, len
);
3557 /* tcp_data could move socket to TIME-WAIT */
3558 if (sk
->state
!= TCP_CLOSE
) {
3559 tcp_data_snd_check(sk
);
3560 tcp_ack_snd_check(sk
);