Import 2.3.99pre6-6
[davej-history.git] / net / ipv4 / tcp_input.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.193 2000/04/20 14:41:16 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data for which there
51 * is not enough room. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
60 #include <linux/config.h>
61 #include <linux/mm.h>
62 #include <linux/sysctl.h>
63 #include <net/tcp.h>
64 #include <net/inet_common.h>
65 #include <linux/ipsec.h>
67 #ifdef CONFIG_SYSCTL
68 #define SYNC_INIT 0 /* let the user enable it */
69 #else
70 #define SYNC_INIT 1
71 #endif
73 /* These are on by default so the code paths get tested.
74 * For the final 2.2 this may be undone at our discretion. -DaveM
76 int sysctl_tcp_timestamps = 1;
77 int sysctl_tcp_window_scaling = 1;
78 int sysctl_tcp_sack = 1;
80 int sysctl_tcp_syncookies = SYNC_INIT;
81 int sysctl_tcp_stdurg;
82 int sysctl_tcp_rfc1337;
83 int sysctl_tcp_tw_recycle = 1;
84 int sysctl_tcp_abort_on_overflow = 0;
85 int sysctl_tcp_max_orphans = NR_FILE;
86 int sysctl_tcp_max_tw_buckets = NR_FILE*2;
88 static int prune_queue(struct sock *sk);
90 /*
91 * Adapt the MSS value used to make delayed ack decision to the
92 * real world.
94 * The constant 536 has no particular significance. In the IPv4 world
95 * the MTU may be smaller, though this contradicts RFC 1122, which
96 * states that the MSS must be at least 536.
97 * We use the constant so that we do not ACK every second
98 * packet in a stream of tiny packets.
99 * It means that super-low-MTU links will be aggressively delacked.
100 * That even seems to be good: if they have such a low MTU, they are
101 * weirdly slow anyway.
103 * AK: BTW it may be useful to add an option to lock the rcv_mss.
104 * This way the beowulf people wouldn't need ugly patches to get the
105 * ack frequencies they want, and it would be an elegant way to tune delack.
107 static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *skb)
109 unsigned int len, lss;
111 lss = tp->ack.last_seg_size;
112 tp->ack.last_seg_size = 0;
114 /* skb->len may jitter because of SACKs, even if peer
115 * sends good full-sized frames.
117 len = skb->len;
118 if (len >= tp->ack.rcv_mss) {
119 tp->ack.rcv_mss = len;
120 } else {
121 /* Otherwise, we make a more careful check, taking into account
122 * that the SACK block size is variable.
124 * "len" is the invariant segment length, including the TCP header.
126 len = skb->tail - skb->h.raw;
127 if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr)) {
128 /* Also subtract the invariant part (if the peer is RFC compliant):
129 * the TCP header plus the fixed timestamp option length.
130 * The resulting "len" is the MSS, free of SACK jitter.
132 len -= tp->tcp_header_len;
133 if (len == lss)
134 tp->ack.rcv_mss = len;
135 tp->ack.last_seg_size = len;
141 static __inline__ void tcp_enter_quickack_mode(struct tcp_opt *tp)
143 unsigned quickacks = tcp_receive_window(tp)/(2*tp->ack.rcv_mss);
145 tp->ack.quick = max(min(quickacks, 127), 1);
147 if (!tp->tstamp_ok && tp->ack.quick>2) {
148 /* Quick ACKs are _dangerous_, if RTTM is not used.
149 * See comment in tcp_init_metrics(). We still help
150 * them to overcome the most difficult, initial
151 * phase of slow start.
153 tp->ack.quick = 2;
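/* A rough sketch of the sizing above, for illustration: the "quick" budget
 * is about half a receive window's worth of segments. E.g. with
 * tcp_receive_window(tp) = 32768 and rcv_mss = 1460 we get
 * quickacks = 32768 / (2 * 1460) = 11, clamped to [1, 127]. Without
 * negotiated timestamps the budget is further capped at 2, because quick
 * ACKs without RTTM can mislead the peer's RTT estimation (see the
 * comment in tcp_init_metrics()).
 */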
157 /* Send ACKs quickly, if the "quick" count is not exhausted
158 * and the session is not interactive.
161 static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp)
163 return (tp->ack.quick && !tp->ack.pingpong);
166 /* There is something which you must keep in mind when you analyze the
167 * behavior of the tp->ato delayed ack timeout interval. When a
168 * connection starts up, we want to ack as quickly as possible. The
169 * problem is that "good" TCPs do slow start at the beginning of data
170 * transmission. This means that until we send the first few ACKs the
171 * sender will sit on his end and only queue most of his data, because
172 * he can only send snd_cwnd unacked packets at any given time. For
173 * each ACK we send, he increments snd_cwnd and transmits more of his
174 * queue. -DaveM
176 static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb)
178 u32 now;
180 tcp_measure_rcv_mss(tp, skb);
182 tp->ack.pending = 1;
183 tp->ack.rcv_segs++;
185 now = tcp_time_stamp;
187 if (!tp->ack.ato) {
188 /* The _first_ data packet received, initialize
189 * delayed ACK engine.
192 /* Help sender leave slow start quickly. */
193 tcp_enter_quickack_mode(tp);
195 /* Pingpong is off, session is not interactive by default */
196 tp->ack.pingpong = 0;
198 /* ATO is minimal */
199 tp->ack.ato = TCP_ATO_MIN;
200 } else {
201 int m = now - tp->ack.lrcvtime;
203 if (m > TCP_ATO_MAX/2) {
204 /* Do not touch ATO, if interval is out of bounds.
205 * It will be deflated by delack timer, if our peer
206 * really sends too rarely.
208 if (m > tp->rto) {
209 /* Too long a gap. Apparently the sender failed to
210 * restart the window, so we send ACKs quickly.
212 tcp_enter_quickack_mode(tp);
214 } else {
215 if (m <= 0)
216 m = TCP_ATO_MIN/2;
217 if (m <= tp->ack.ato)
218 tp->ack.ato = (tp->ack.ato >> 1) + m;
221 tp->ack.lrcvtime = now;
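/* A rough sketch of the filter above, for illustration: tp->ack.ato tracks
 * the inter-arrival time m = now - lrcvtime with
 *
 *	ato <- ato/2 + m	(applied only while m <= ato)
 *
 * so e.g. ato = 40 jiffies and m = 10 jiffies give ato = 20 + 10 = 30
 * (300 ms assuming HZ=100). Gaps above TCP_ATO_MAX/2 leave ato untouched,
 * and gaps above the RTO re-enter quickack mode instead.
 */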
224 /* Called to compute a smoothed rtt estimate. The data fed to this
225 * routine either comes from timestamps, or from segments that were
226 * known _not_ to have been retransmitted [see Karn/Partridge
227 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
228 * piece by Van Jacobson.
229 * NOTE: the next three routines used to be one big routine.
230 * To save cycles in the RFC 1323 implementation it was better to break
231 * it up into three procedures. -- erics
234 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
236 long m = mrtt; /* RTT */
238 /* The following amusing code comes from Jacobson's
239 * article in SIGCOMM '88. Note that rtt and mdev
240 * are scaled versions of rtt and mean deviation.
241 * This is designed to be as fast as possible.
242 * m stands for "measurement".
244 * In a 1990 paper the rto value was changed to:
245 * RTO = rtt + 4 * mdev
247 if(m == 0)
248 m = 1;
249 if (tp->srtt != 0) {
250 m -= (tp->srtt >> 3); /* m is now error in rtt est */
251 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
252 if (m < 0)
253 m = -m; /* m is now abs(error) */
254 m -= (tp->mdev >> 2); /* similar update on mdev */
255 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
256 } else {
257 /* no previous measure. */
258 tp->srtt = m<<3; /* take the measured time to be rtt */
259 tp->mdev = m<<2; /* make sure rto = 3*rtt */
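/* In plain (unscaled) units the update above is Jacobson's estimator:
 *
 *	err   = m - SRTT
 *	SRTT <- SRTT + err/8			(tp->srtt holds 8*SRTT)
 *	MDEV <- MDEV + (|err| - MDEV)/4		(tp->mdev holds 4*MDEV)
 *
 * Worked example (values are illustrative only): SRTT = 100 ms,
 * MDEV = 20 ms, new measurement m = 140 ms. Then err = 40 ms, so
 * SRTT becomes 105 ms and MDEV becomes 25 ms, and tcp_set_rto() below
 * yields RTO = SRTT + 4*MDEV = 205 ms before its extra fudge terms and
 * the clamping done by tcp_bound_rto().
 */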
263 /* Calculate rto without backoff. This is the second half of Van Jacobson's
264 * routine referred to above.
267 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
269 tp->rto = (tp->srtt >> 3) + tp->mdev;
270 /* I am not educated enough to understand this magic.
271 * However, it smells bad. snd_cwnd>31 is a common case.
273 tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
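/* Note on the scaling: (tp->srtt >> 3) + tp->mdev is SRTT + 4*MDEV, the
 * classical RTO. The two extra terms add roughly another rto/4; the
 * snd_cwnd-dependent shift is negligible for any reasonable window, and
 * shifting a 32-bit rto by snd_cwnd-1 >= 32 is formally undefined, which
 * may be part of what the comment above finds suspicious.
 */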
277 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
278 * on packet lifetime in the internet. We need the HZ/5 lower
279 * bound to behave correctly against BSD stacks with a fixed
280 * delayed ack.
281 * FIXME: It's not entirely clear this lower bound is the best
282 * way to avoid the problem. Is it possible to drop the lower
283 * bound and still avoid trouble with BSD stacks? Perhaps
284 * some modification to the RTO calculation that takes delayed
285 * ack bias into account? This needs serious thought. -- erics
287 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
289 if (tp->rto < TCP_RTO_MIN)
290 tp->rto = TCP_RTO_MIN;
291 else if (tp->rto > TCP_RTO_MAX)
292 tp->rto = TCP_RTO_MAX;
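/* With the values named in the comment above (TCP_RTO_MIN = HZ/5,
 * TCP_RTO_MAX = 120*HZ) the retransmission timer is always kept within
 * [200 ms, 120 s] no matter what the measured RTT says; assuming HZ=100
 * that is 20..12000 jiffies.
 */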
295 /* Save metrics learned by this TCP session.
296 This function is called only when TCP finishes successfully,
297 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
299 static void tcp_update_metrics(struct sock *sk)
301 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
302 struct dst_entry *dst = __sk_dst_get(sk);
304 dst_confirm(dst);
306 if (dst && (dst->flags&DST_HOST)) {
307 int m;
309 if (tp->backoff || !tp->srtt) {
310 /* This session failed to estimate rtt. Why?
311 * Probably, no packets returned in time.
312 * Reset our results.
314 if (!(dst->mxlock&(1<<RTAX_RTT)))
315 dst->rtt = 0;
316 return;
319 m = dst->rtt - tp->srtt;
321 /* If the newly calculated rtt is larger than the stored one,
322 * store the new one. Otherwise, use an EWMA. Remember,
323 * rtt overestimation is always better than underestimation.
325 if (!(dst->mxlock&(1<<RTAX_RTT))) {
326 if (m <= 0)
327 dst->rtt = tp->srtt;
328 else
329 dst->rtt -= (m>>3);
332 if (!(dst->mxlock&(1<<RTAX_RTTVAR))) {
333 if (m < 0)
334 m = -m;
336 /* Scale deviation to rttvar fixed point */
337 m >>= 1;
338 if (m < tp->mdev)
339 m = tp->mdev;
341 if (m >= dst->rttvar)
342 dst->rttvar = m;
343 else
344 dst->rttvar -= (dst->rttvar - m)>>2;
347 if (tp->snd_ssthresh == 0x7FFFFFFF) {
348 /* Slow start still did not finish. */
349 if (dst->ssthresh &&
350 !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
351 tp->snd_cwnd > dst->ssthresh)
352 dst->ssthresh = tp->snd_cwnd;
353 if (!(dst->mxlock&(1<<RTAX_CWND)) &&
354 tp->snd_cwnd > dst->cwnd)
355 dst->cwnd = tp->snd_cwnd;
356 } else if (tp->snd_cwnd >= tp->snd_ssthresh && !tp->high_seq) {
357 /* Cong. avoidance phase, cwnd is reliable. */
358 if (!(dst->mxlock&(1<<RTAX_SSTHRESH)))
359 dst->ssthresh = tp->snd_cwnd;
360 if (!(dst->mxlock&(1<<RTAX_CWND)))
361 dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1;
362 } else {
363 /* Else slow start did not finish, cwnd is meaningless,
364 and ssthresh may also be invalid.
366 if (!(dst->mxlock&(1<<RTAX_CWND)))
367 dst->cwnd = (dst->cwnd + tp->snd_ssthresh)>>1;
368 if (dst->ssthresh &&
369 !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
370 tp->snd_ssthresh > dst->ssthresh)
371 dst->ssthresh = tp->snd_ssthresh;
376 /* Initialize metrics on socket. */
378 static void tcp_init_metrics(struct sock *sk)
380 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
381 struct dst_entry *dst = __sk_dst_get(sk);
383 if (dst == NULL)
384 goto reset;
386 dst_confirm(dst);
388 if (dst->mxlock&(1<<RTAX_CWND))
389 tp->snd_cwnd_clamp = dst->cwnd;
390 if (dst->ssthresh) {
391 tp->snd_ssthresh = dst->ssthresh;
392 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
393 tp->snd_ssthresh = tp->snd_cwnd_clamp;
396 if (dst->rtt == 0)
397 goto reset;
399 if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3))
400 goto reset;
402 /* Initial rtt is determined from SYN,SYN-ACK.
403 * The segment is small and the rtt may appear much
404 * smaller than the real one. Use per-dst memory
405 * to make it more realistic.
407 * A bit of theory. RTT is the time that passes after a "normal" sized
408 * packet is sent until it is ACKed. In normal circumstances sending small
409 * packets forces the peer to delay ACKs, so the calculation is correct too.
410 * The algorithm is adaptive and, provided we follow the specs, it
411 * NEVER underestimates RTT. BUT! If the peer tries some clever
412 * trick such as "quick acks" for long enough to decrease the RTT
413 * to a low value, and then abruptly stops doing it and starts to delay
414 * ACKs, expect trouble.
416 if (dst->rtt > tp->srtt)
417 tp->srtt = dst->rtt;
418 if (dst->rttvar > tp->mdev)
419 tp->mdev = dst->rttvar;
420 tcp_set_rto(tp);
421 tcp_bound_rto(tp);
422 if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp)
423 goto reset;
424 tp->snd_cwnd = tcp_init_cwnd(tp);
425 return;
428 reset:
429 /* Play it conservative. If timestamps are not
430 * supported, TCP will fail to recalculate a correct
431 * rtt if the initial rto is too small. FORGET ALL AND RESET!
433 if (!tp->saw_tstamp && tp->srtt) {
434 tp->srtt = 0;
435 tp->mdev = TCP_TIMEOUT_INIT;
436 tp->rto = TCP_TIMEOUT_INIT;
440 /* WARNING: this must not be called if tp->saw_tstamp was false. */
441 extern __inline__ void
442 tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
444 if (!after(seq, tp->rcv_wup)) {
445 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
446 * extra check below makes sure this can only happen
447 * for pure ACK frames. -DaveM
449 * Not only that; it also occurs for expired timestamps
450 * and RSTs with a bad timestamp option. --ANK
453 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 ||
454 xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) {
455 tp->ts_recent = tp->rcv_tsval;
456 tp->ts_recent_stamp = xtime.tv_sec;
461 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb)
463 return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
464 xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS
466 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
468 I cannot watch quietly while the whole idea behind PAWS
469 is destroyed 8)
471 The problem is only in the reordering of duplicate ACKs.
472 Hence, we can check this rare case more carefully.
474 1. Check that it is really duplicate ACK (ack==snd_una)
475 2. Give it some small "replay" window (~RTO)
477 We do not know the units of foreign ts values, but make the conservative
478 assumption that they are >=1ms. It solves the problem
479 noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
481 && (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq ||
482 TCP_SKB_CB(skb)->ack_seq != tp->snd_una ||
483 !skb->h.th->ack ||
484 (s32)(tp->ts_recent - tp->rcv_tsval) > (tp->rto*1024)/HZ));
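/* A note on the last term above: (tp->rto*1024)/HZ converts the RTO from
 * jiffies to roughly milliseconds (1024 ~ 1000). Under the conservative
 * assumption stated above that one foreign timestamp unit is >= 1 ms, a
 * duplicate ACK whose timestamp lags ts_recent by more than that many
 * units is at least about one RTO old and is still discarded, while a
 * merely reordered duplicate ACK inside the ~RTO "replay" window is let
 * through.
 */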
488 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
490 u32 end_window = tp->rcv_wup + tp->rcv_wnd;
491 #ifdef TCP_FORMAL_WINDOW
492 u32 rcv_wnd = tcp_receive_window(tp);
493 #else
494 u32 rcv_wnd = tp->rcv_wnd;
495 #endif
497 if (rcv_wnd &&
498 after(end_seq, tp->rcv_nxt) &&
499 before(seq, end_window))
500 return 1;
501 if (seq != end_window)
502 return 0;
503 return (seq == end_seq);
506 /* This function checks to see if the tcp header is actually acceptable. */
507 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
509 #ifdef TCP_FORMAL_WINDOW
510 u32 rcv_wnd = tcp_receive_window(tp);
511 #else
512 u32 rcv_wnd = tp->rcv_wnd;
513 #endif
514 if (seq == tp->rcv_nxt)
515 return (rcv_wnd || (end_seq == seq));
517 return __tcp_sequence(tp, seq, end_seq);
520 /* When we get a reset we do this. */
521 static void tcp_reset(struct sock *sk)
523 /* We want the right error as BSD sees it (and indeed as we do). */
524 switch (sk->state) {
525 case TCP_SYN_SENT:
526 sk->err = ECONNREFUSED;
527 break;
528 case TCP_CLOSE_WAIT:
529 sk->err = EPIPE;
530 break;
531 case TCP_CLOSE:
532 return;
533 default:
534 sk->err = ECONNRESET;
537 if (!sk->dead)
538 sk->error_report(sk);
540 tcp_done(sk);
543 /* This tags the retransmission queue when SACKs arrive. */
544 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
546 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
547 int i = nsacks;
549 while(i--) {
550 struct sk_buff *skb = skb_peek(&sk->write_queue);
551 __u32 start_seq = ntohl(sp->start_seq);
552 __u32 end_seq = ntohl(sp->end_seq);
553 int fack_count = 0;
555 while((skb != NULL) &&
556 (skb != tp->send_head) &&
557 (skb != (struct sk_buff *)&sk->write_queue)) {
558 /* The retransmission queue is always in order, so
559 * we can short-circuit the walk early.
561 if(after(TCP_SKB_CB(skb)->seq, end_seq))
562 break;
564 /* We play it conservative: we don't allow SACKs to partially
565 * tag a sequence space.
567 fack_count++;
568 if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
569 !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
570 /* If this was a retransmitted frame, account for it. */
571 if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
572 tp->retrans_out)
573 tp->retrans_out--;
574 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
576 /* RULE: All new SACKs will either decrease retrans_out
577 * or advance fackets_out.
579 if(fack_count > tp->fackets_out)
580 tp->fackets_out = fack_count;
582 skb = skb->next;
584 sp++; /* Move on to the next SACK block. */
588 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
589 * But, this can also be called on packets in the established flow when
590 * the fast version below fails.
592 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
594 unsigned char *ptr;
595 int length=(th->doff*4)-sizeof(struct tcphdr);
597 ptr = (unsigned char *)(th + 1);
598 tp->saw_tstamp = 0;
600 while(length>0) {
601 int opcode=*ptr++;
602 int opsize;
604 switch (opcode) {
605 case TCPOPT_EOL:
606 return;
607 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
608 length--;
609 continue;
610 default:
611 opsize=*ptr++;
612 if (opsize < 2) /* "silly options" */
613 return;
614 if (opsize > length)
615 break; /* don't parse partial options */
616 switch(opcode) {
617 case TCPOPT_MSS:
618 if(opsize==TCPOLEN_MSS && th->syn) {
619 u16 in_mss = ntohs(*(__u16 *)ptr);
620 if (in_mss) {
621 if (tp->user_mss && tp->user_mss < in_mss)
622 in_mss = tp->user_mss;
623 tp->mss_clamp = in_mss;
626 break;
627 case TCPOPT_WINDOW:
628 if(opsize==TCPOLEN_WINDOW && th->syn)
629 if (!no_fancy && sysctl_tcp_window_scaling) {
630 tp->wscale_ok = 1;
631 tp->snd_wscale = *(__u8 *)ptr;
632 if(tp->snd_wscale > 14) {
633 if(net_ratelimit())
634 printk("tcp_parse_options: Illegal window "
635 "scaling value %d >14 received.",
636 tp->snd_wscale);
637 tp->snd_wscale = 14;
640 break;
641 case TCPOPT_TIMESTAMP:
642 if(opsize==TCPOLEN_TIMESTAMP) {
643 if (sysctl_tcp_timestamps && !no_fancy) {
644 tp->tstamp_ok = 1;
645 tp->saw_tstamp = 1;
646 tp->rcv_tsval = ntohl(*(__u32 *)ptr);
647 tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
650 break;
651 case TCPOPT_SACK_PERM:
652 if(opsize==TCPOLEN_SACK_PERM && th->syn) {
653 if (sysctl_tcp_sack && !no_fancy) {
654 tp->sack_ok = 1;
655 tp->num_sacks = 0;
658 break;
660 case TCPOPT_SACK:
661 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
662 sysctl_tcp_sack && (sk != NULL) && !th->syn) {
663 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
665 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
666 int num_sacks = sack_bytes >> 3;
667 struct tcp_sack_block *sackp;
669 sackp = (struct tcp_sack_block *)ptr;
670 tcp_sacktag_write_queue(sk, sackp, num_sacks);
674 ptr+=opsize-2;
675 length-=opsize;
680 /* Fast parse options. This hopes to only see timestamps.
681 * If it is wrong it falls back on tcp_parse_options().
683 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
685 /* If we didn't send out any options ignore them all. */
686 if (tp->tcp_header_len == sizeof(struct tcphdr))
687 return 0;
688 if (th->doff == sizeof(struct tcphdr)>>2) {
689 tp->saw_tstamp = 0;
690 return 0;
691 } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
692 __u32 *ptr = (__u32 *)(th + 1);
693 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
694 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
695 tp->saw_tstamp = 1;
696 ++ptr;
697 tp->rcv_tsval = ntohl(*ptr);
698 ++ptr;
699 tp->rcv_tsecr = ntohl(*ptr);
700 return 1;
703 tcp_parse_options(sk, th, tp, 0);
704 return 1;
707 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
708 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
709 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
710 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
711 #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged the SYN. */
713 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
715 if (tp->dup_acks > 3)
716 tp->snd_cwnd = (tp->snd_ssthresh);
718 tp->dup_acks = 0;
721 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
722 * retransmit timer fires.
724 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
726 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
728 /* Note: If not_dup is set this implies we got a
729 * data carrying packet or a window update.
730 * This carries no new information about possible
731 * lost packets, so we have to ignore it for the purposes
732 * of counting duplicate acks. Ideally this does not imply we
733 * should stop our fast retransmit phase, more acks may come
734 * later without data to help us. Unfortunately this would make
735 * the code below much more complex. For now if I see such
736 * a packet I clear the fast retransmit phase.
738 if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
739 /* This is the standard reno style fast retransmit branch. */
741 /* 1. When the third duplicate ack is received, set ssthresh
742 * to one half the current congestion window, but no less
743 * than two segments. Retransmit the missing segment.
745 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
746 tp->dup_acks++;
747 if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
748 __tcp_enter_cong_avoid(tp);
749 /* ... and account for the 3 ACKs which have
750 * already been received by this time.
752 tp->snd_cwnd += 3;
754 if(!tp->fackets_out)
755 tcp_retransmit_skb(sk,
756 skb_peek(&sk->write_queue));
757 else
758 tcp_fack_retransmit(sk);
759 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
761 } else if (++tp->dup_acks > 3) {
762 /* 2. Each time another duplicate ACK arrives, increment
763 * cwnd by the segment size. [...] Transmit a packet...
765 * Packet transmission will be done on normal flow processing
766 * since we're not in "retransmit mode". We do not use
767 * duplicate ACKs to artificially inflate the congestion
768 * window when doing FACK.
770 if(!tp->fackets_out) {
771 tp->snd_cwnd++;
772 } else {
773 /* Fill any further holes which may have
774 * appeared.
776 * We may want to change this to run every
777 * further multiple-of-3 dup ack increments,
778 * to be more robust against out-of-order
779 * packet delivery. -DaveM
781 tcp_fack_retransmit(sk);
784 } else if (tp->high_seq != 0) {
785 /* In this branch we deal with clearing the Floyd style
786 * block on duplicate fast retransmits, and if requested
787 * we do Hoe style secondary fast retransmits.
789 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
790 /* Once we have acked all the packets up to high_seq
791 * we are done with this fast retransmit phase.
792 * Alternatively, data arrived. In this case we
793 * have to abort the fast retransmit attempt.
794 * Note that we do want to accept a window
795 * update since this is expected with Hoe's algorithm.
797 clear_fast_retransmit(tp);
799 /* After we have cleared up to high_seq we can
800 * clear the Floyd style block.
802 if (!before(ack, tp->high_seq)) {
803 tp->high_seq = 0;
804 tp->fackets_out = 0;
806 } else if (tp->dup_acks >= 3) {
807 if (!tp->fackets_out) {
808 /* Hoe Style. We didn't ack the whole
809 * window. Take this as a cue that
810 * another packet was lost and retransmit it.
811 * Don't muck with the congestion window here.
812 * Note that we have to be careful not to
813 * act if this was a window update and it
814 * didn't ack new data, since this does
815 * not indicate a packet left the system.
816 * We can test this by just checking
817 * if ack changed from snd_una, since
818 * the only way to get here without advancing
819 * from snd_una is if this was a window update.
821 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
822 tcp_retransmit_skb(sk,
823 skb_peek(&sk->write_queue));
824 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
826 } else {
827 /* FACK style, fill any remaining holes in
828 * receiver's queue.
830 tcp_fack_retransmit(sk);
836 /* This is Jacobson's slow start and congestion avoidance.
837 * SIGCOMM '88, p. 328.
839 static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
841 if (tp->snd_cwnd <= tp->snd_ssthresh) {
842 /* In "safe" area, increase. */
843 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
844 tp->snd_cwnd++;
845 } else {
846 /* In dangerous area, increase slowly.
847 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
849 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
850 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
851 tp->snd_cwnd++;
852 tp->snd_cwnd_cnt=0;
853 } else
854 tp->snd_cwnd_cnt++;
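/* A sketch of the growth rate above: each call (one per ACK that newly
 * acknowledges data, see should_advance_cwnd()) adds a full segment while
 * snd_cwnd <= snd_ssthresh (slow start), but in congestion avoidance
 * snd_cwnd_cnt must reach snd_cwnd before a single segment is added, so
 * the window grows by about one segment per round trip. This approximates
 * the textbook cwnd += 1/cwnd rule without a division, and snd_cwnd_clamp
 * bounds the window in both phases.
 */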
858 /* Remove acknowledged frames from the retransmission queue. */
859 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
860 __u32 *seq, __u32 *seq_rtt)
862 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
863 struct sk_buff *skb;
864 __u32 now = tcp_time_stamp;
865 int acked = 0;
867 /* If we are retransmitting, and this ACK clears up to
868 * the retransmit head, or further, then clear our state.
870 if (tp->retrans_head != NULL &&
871 !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
872 tp->retrans_head = NULL;
874 while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
875 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
876 __u8 sacked = scb->sacked;
878 /* If our packet is before the ack sequence we can
879 * discard it as it's confirmed to have arrived at
880 * the other end.
882 if (after(scb->end_seq, ack))
883 break;
885 /* Initial outgoing SYN's get put onto the write_queue
886 * just like anything else we transmit. It is not
887 * true data, and if we misinform our callers that
888 * this ACK acks real data, we will erroneously exit
889 * connection startup slow start one packet too
890 * quickly. This is severely frowned upon behavior.
892 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
893 tp->retrans_out--;
894 if(!(scb->flags & TCPCB_FLAG_SYN)) {
895 acked |= FLAG_DATA_ACKED;
896 if(sacked & TCPCB_SACKED_RETRANS)
897 acked |= FLAG_RETRANS_DATA_ACKED;
898 if(tp->fackets_out)
899 tp->fackets_out--;
900 } else {
901 acked |= FLAG_SYN_ACKED;
902 /* This is pure paranoia. */
903 tp->retrans_head = NULL;
905 tp->packets_out--;
906 *seq = scb->seq;
907 *seq_rtt = now - scb->when;
908 __skb_unlink(skb, skb->list);
909 kfree_skb(skb);
911 return acked;
914 static void tcp_ack_probe(struct sock *sk, __u32 ack)
916 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
918 /* Was it a usable window open? */
920 if (tp->send_head != NULL) {
921 if (!after(TCP_SKB_CB(tp->send_head)->end_seq, ack + tp->snd_wnd)) {
922 tp->backoff = 0;
923 tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
924 /* If packets_out==0, the socket must be woken up by
925 * a subsequent tcp_data_snd_check(). This function is
926 * not for random use!
928 } else if (!tp->packets_out) {
929 tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
930 min(tp->rto << tp->backoff, TCP_RTO_MAX));
935 /* Should we open up the congestion window? */
936 static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
938 /* Data must have been acked. */
939 if ((flag & FLAG_DATA_ACKED) == 0)
940 return 0;
942 /* Some of the data acked was retransmitted somehow? */
943 if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
944 /* We advance in all cases except during
945 * non-FACK fast retransmit/recovery.
947 if (tp->fackets_out != 0 ||
948 tp->retransmits != 0)
949 return 1;
951 /* Non-FACK fast retransmit does its own
952 * congestion window management, don't get
953 * in the way.
955 return 0;
958 /* New non-retransmitted data acked, always advance. */
959 return 1;
962 /* Read draft-ietf-tcplw-high-performance before mucking
963 * with this code. (Supersedes RFC 1323)
965 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
966 u32 seq, u32 ack, int flag)
968 __u32 seq_rtt;
970 /* RTTM Rule: A TSecr value received in a segment is used to
971 * update the averaged RTT measurement only if the segment
972 * acknowledges some new data, i.e., only if it advances the
973 * left edge of the send window.
975 * See draft-ietf-tcplw-high-performance-00, section 3.3.
976 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
978 if (!(flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)))
979 return;
981 seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
982 tcp_rtt_estimator(tp, seq_rtt);
983 if (tp->retransmits) {
984 if (tp->packets_out == 0) {
985 tp->retransmits = 0;
986 tp->fackets_out = 0;
987 tp->retrans_out = 0;
988 tp->backoff = 0;
989 tcp_set_rto(tp);
990 } else {
991 /* Still retransmitting, use backoff */
992 tcp_set_rto(tp);
993 tp->rto = tp->rto << tp->backoff;
995 } else {
996 tcp_set_rto(tp);
999 tcp_bound_rto(tp);
1002 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
1004 struct sk_buff *skb = skb_peek(&sk->write_queue);
1006 #ifdef TCP_DEBUG
1007 /* It occurred in 2.3 because of racy timers. Namely, the
1008 * retransmit timer did not check packets_out and sometimes
1009 * retransmitted send_head, and hence messed up the whole write_queue.
1010 * Now it is impossible, I bet. --ANK
1012 if (skb == NULL) {
1013 printk("Sucks! packets_out=%d, sk=%p, %d\n", tp->packets_out, sk, sk->state);
1014 return;
1016 #endif
1018 /* Some data was ACK'd, if still retransmitting (due to a
1019 * timeout), resend more of the retransmit queue. The
1020 * congestion window is handled properly by that code.
1022 if (tp->retransmits) {
1023 tcp_xmit_retransmit_queue(sk);
1024 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1025 } else {
1026 __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
1027 if ((__s32)when < 0)
1028 when = 1;
1029 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when);
1033 /* This routine deals with incoming acks, but not outgoing ones. */
1034 static int tcp_ack(struct sock *sk, struct tcphdr *th,
1035 u32 ack_seq, u32 ack, int len)
1037 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1038 int flag = 0;
1039 u32 seq = 0;
1040 u32 seq_rtt = 0;
1042 if(sk->state == TCP_CLOSE)
1043 return 1; /* Dead, can't ack any more so why bother */
1045 /* If the ack is newer than sent or older than previous acks
1046 * then we can probably ignore it.
1048 if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
1049 goto uninteresting_ack;
1051 /* If there is data, set FLAG_DATA. */
1052 if (len != th->doff*4)
1053 flag |= FLAG_DATA;
1055 /* Update our send window. */
1057 /* This is the window update code as per RFC 793
1058 * snd_wl{1,2} are used to prevent unordered
1059 * segments from shrinking the window
1061 if (before(tp->snd_wl1, ack_seq) ||
1062 (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
1063 u32 nwin = ntohs(th->window) << tp->snd_wscale;
1065 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
1066 flag |= FLAG_WIN_UPDATE;
1067 if (tp->snd_wnd != nwin) {
1068 tp->snd_wnd = nwin;
1070 /* Note: this is the only place where the
1071 * fast path is recovered for the sending side of TCP.
1073 if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
1074 #ifdef TCP_FORMAL_WINDOW
1075 tcp_receive_window(tp) &&
1076 #endif
1077 !tp->urg_data)
1078 tcp_fast_path_on(tp);
1080 if (nwin > tp->max_window) {
1081 tp->max_window = nwin;
1082 tcp_sync_mss(sk, tp->pmtu_cookie);
1086 tp->snd_wl1 = ack_seq;
1087 tp->snd_wl2 = ack;
1091 /* BEWARE! From this point until we return from this function,
1092 * snd_nxt and snd_wnd are out of sync. All the routines called
1093 * from here must take "ack" as an argument or must not depend
1094 * on the right edge of the window. It is _UGLY_. It cries to be fixed. --ANK
1097 /* We passed data and got it acked, remove any soft error
1098 * log. Something worked...
1100 sk->err_soft = 0;
1101 tp->probes_out = 0;
1102 tp->rcv_tstamp = tcp_time_stamp;
1104 /* See if we can take anything off of the retransmit queue. */
1105 flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
1107 /* If this ack opens up a zero window, clear backoff. It was
1108 * being used to time the probes, and is probably far higher than
1109 * it needs to be for normal retransmission.
1111 if (tcp_timer_is_set(sk, TCP_TIME_PROBE0))
1112 tcp_ack_probe(sk, ack);
1114 /* We must do this here, before code below clears out important
1115 * state contained in tp->fackets_out and tp->retransmits. -DaveM
1117 if (should_advance_cwnd(tp, flag))
1118 tcp_cong_avoid(tp);
1120 /* If we have a timestamp, we always do rtt estimates. */
1121 if (tp->saw_tstamp) {
1122 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
1123 } else {
1124 /* If we were retransmitting, don't count the rtt estimate. */
1125 if (tp->retransmits) {
1126 if (tp->packets_out == 0) {
1127 tp->retransmits = 0;
1128 tp->fackets_out = 0;
1129 tp->retrans_out = 0;
1131 } else {
1132 /* We don't have a timestamp. Can only use
1133 * packets that are not retransmitted to determine
1134 * rtt estimates. Also, we must not reset the
1135 * backoff for rto until we get a non-retransmitted
1136 * packet. This allows us to deal with a situation
1137 * where the network delay has increased suddenly.
1138 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
1140 if (flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)) {
1141 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
1142 tp->backoff = 0;
1143 tcp_rtt_estimator(tp, seq_rtt);
1144 tcp_set_rto(tp);
1145 tcp_bound_rto(tp);
1151 if (tp->packets_out) {
1152 if (flag & FLAG_DATA_ACKED)
1153 tcp_ack_packets_out(sk, tp);
1154 } else {
1155 tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
1158 flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
1159 if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
1160 (tp->high_seq != 0)) {
1161 tcp_fast_retrans(sk, ack, flag);
1162 } else {
1163 /* Clear any aborted fast retransmit starts. */
1164 tp->dup_acks = 0;
1166 /* It is not a brain fart, I have thought about it for a bit now. 8)
1168 * Forward progress is indicated if:
1169 * 1. the ack acknowledges new data.
1170 * 2. or the ack is a duplicate, but it is caused by a new segment
1171 * arrival. This case is filtered by:
1172 * - it contains no data, syn or fin.
1173 * - it does not update the window.
1174 * 3. or a new SACK. It is difficult to check, so we ignore it.
1176 * Forward progress is also indicated by the arrival of new data,
1177 * which was caused by a window opened from our side. This case is more
1178 * difficult and it is handled (alas, incorrectly) in tcp_data_queue().
1179 * --ANK (990513)
1181 if (ack != tp->snd_una || (flag == 0 && !th->fin))
1182 dst_confirm(sk->dst_cache);
1184 if (ack != tp->snd_una)
1185 tp->sorry = 1;
1187 /* Remember the highest ack received. */
1188 tp->snd_una = ack;
1189 return 1;
1191 uninteresting_ack:
1192 SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
1193 return 0;
1196 int tcp_paws_check(struct tcp_opt *tp, int rst)
1198 if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0)
1199 return 0;
1200 if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS)
1201 return 0;
1203 /* RST segments are not recommended to carry a timestamp,
1204 and, if they do, it is recommended to ignore PAWS because
1205 "their cleanup function should take precedence over timestamps."
1206 Certainly, this is a mistake. It is necessary to understand the reason
1207 for this constraint in order to relax it: if the peer reboots, its clock may go
1208 out-of-sync and half-open connections will not be reset.
1209 Actually, the problem would not exist if all
1210 the implementations followed the draft about maintaining clocks
1211 across reboots. Linux-2.2 DOES NOT!
1213 However, we can relax the time bounds for RST segments to MSL.
1215 if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL)
1216 return 0;
1217 return 1;
1220 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
1222 if (seq == s_win)
1223 return 1;
1224 if (after(end_seq, s_win) && before(seq, e_win))
1225 return 1;
1226 return (seq == e_win && seq == end_seq);
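/* This is the usual RFC 793 style acceptability test against the window
 * [s_win, e_win): a segment starting exactly at the left edge is accepted,
 * a segment overlapping the window (end_seq > s_win && seq < e_win) is
 * accepted, and a zero-length segment sitting exactly on the right edge is
 * accepted; anything else is answered with an ACK by the TIME-WAIT code
 * below.
 */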
1229 /* New-style handling of TIME_WAIT sockets. */
1231 /* Must be called with locally disabled BHs. */
1232 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
1234 struct tcp_ehash_bucket *ehead;
1235 struct tcp_bind_hashbucket *bhead;
1236 struct tcp_bind_bucket *tb;
1238 /* Unlink from established hashes. */
1239 ehead = &tcp_ehash[tw->hashent];
1240 write_lock(&ehead->lock);
1241 if (!tw->pprev) {
1242 write_unlock(&ehead->lock);
1243 return;
1245 if(tw->next)
1246 tw->next->pprev = tw->pprev;
1247 *(tw->pprev) = tw->next;
1248 tw->pprev = NULL;
1249 write_unlock(&ehead->lock);
1251 /* Disassociate with bind bucket. */
1252 bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
1253 spin_lock(&bhead->lock);
1254 if ((tb = tw->tb) != NULL) {
1255 if(tw->bind_next)
1256 tw->bind_next->bind_pprev = tw->bind_pprev;
1257 *(tw->bind_pprev) = tw->bind_next;
1258 tw->tb = NULL;
1259 if (tb->owners == NULL) {
1260 if (tb->next)
1261 tb->next->pprev = tb->pprev;
1262 *(tb->pprev) = tb->next;
1263 kmem_cache_free(tcp_bucket_cachep, tb);
1266 spin_unlock(&bhead->lock);
1268 #ifdef INET_REFCNT_DEBUG
1269 if (atomic_read(&tw->refcnt) != 1) {
1270 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
1272 #endif
1273 tcp_tw_put(tw);
1277 * * The main purpose of the TIME-WAIT state is to close the connection gracefully,
1278 * when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN
1279 * (and, probably, a tail of data) and one or more of our ACKs are lost.
1280 * * What is the TIME-WAIT timeout? It is associated with maximal packet
1281 * lifetime in the internet, which leads to the wrong conclusion that
1282 * it is set to catch "old duplicate segments" wandering out of their path.
1283 * That is not quite correct. This timeout is calculated so that it exceeds
1284 * the maximal retransmission timeout by enough to allow for the loss of one
1285 * (or more) segments sent by the peer and of our ACKs. This time may be calculated from the RTO.
1286 * * When TIME-WAIT socket receives RST, it means that another end
1287 * finally closed and we are allowed to kill TIME-WAIT too.
1288 * * Second purpose of TIME-WAIT is catching old duplicate segments.
1289 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
1290 * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
1291 * * If we invented some more clever way to catch duplicates
1292 * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
1294 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
1295 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
1296 * from the very beginning.
1298 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
1299 * is _not_ stateless. It means that, strictly speaking, we must
1300 * spinlock it. I do not want to! Well, the probability of misbehaviour
1301 * is ridiculously low and, it seems, we could use some mb() tricks
1302 * to avoid misreading sequence numbers, states etc. --ANK
1304 enum tcp_tw_status
1305 tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
1306 struct tcphdr *th, unsigned len)
1308 struct tcp_opt tp;
1309 int paws_reject = 0;
1311 tp.saw_tstamp = 0;
1312 if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
1313 tcp_parse_options(NULL, th, &tp, 0);
1315 if (tp.saw_tstamp) {
1316 tp.ts_recent = tw->ts_recent;
1317 tp.ts_recent_stamp = tw->ts_recent_stamp;
1318 paws_reject = tcp_paws_check(&tp, th->rst);
1322 if (tw->substate == TCP_FIN_WAIT2) {
1323 /* Just repeat all the checks of tcp_rcv_state_process() */
1325 /* Out of window, send ACK */
1326 if (paws_reject ||
1327 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
1328 tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd))
1329 return TCP_TW_ACK;
1331 if (th->rst)
1332 goto kill;
1334 if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq)
1335 goto kill_with_rst;
1337 /* Dup ACK? */
1338 if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) ||
1339 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
1340 tcp_tw_put(tw);
1341 return TCP_TW_SUCCESS;
1344 /* New data or FIN. If new data arrive after half-duplex close,
1345 * reset.
1347 if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) {
1348 kill_with_rst:
1349 tcp_tw_deschedule(tw);
1350 tcp_timewait_kill(tw);
1351 tcp_tw_put(tw);
1352 return TCP_TW_RST;
1355 /* FIN arrived, enter true time-wait state. */
1356 tw->substate = TCP_TIME_WAIT;
1357 tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1358 if (tp.saw_tstamp) {
1359 tw->ts_recent_stamp = xtime.tv_sec;
1360 tw->ts_recent = tp.rcv_tsval;
1363 /* I am ashamed, but I failed to make it more elegant.
1364 * Yes, it is a direct reference to IP, which is impossible
1365 * to generalize to IPv6. Taking into account that IPv6
1366 * does not understand recycling in any case, it is not
1367 * a big problem in practice. --ANK */
1368 if (tw->family == AF_INET &&
1369 sysctl_tcp_tw_recycle && tw->ts_recent_stamp &&
1370 tcp_v4_tw_remember_stamp(tw))
1371 tcp_tw_schedule(tw, tw->timeout);
1372 else
1373 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
1374 return TCP_TW_ACK;
1378 * Now real TIME-WAIT state.
1380 * RFC 1122:
1381 * "When a connection is [...] on TIME-WAIT state [...]
1382 * [a TCP] MAY accept a new SYN from the remote TCP to
1383 * reopen the connection directly, if it:
1385 * (1) assigns its initial sequence number for the new
1386 * connection to be larger than the largest sequence
1387 * number it used on the previous connection incarnation,
1388 * and
1390 * (2) returns to TIME-WAIT state if the SYN turns out
1391 * to be an old duplicate".
1394 if (!paws_reject &&
1395 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
1396 TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
1397 /* In window segment, it may be only reset or bare ack. */
1399 if (th->rst) {
1400 /* This is TIME_WAIT assassination, in two flavors.
1401 * Oh well... nobody has a sufficient solution to this
1402 * protocol bug yet.
1404 if (sysctl_tcp_rfc1337 == 0) {
1405 kill:
1406 tcp_tw_deschedule(tw);
1407 tcp_timewait_kill(tw);
1408 tcp_tw_put(tw);
1409 return TCP_TW_SUCCESS;
1412 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
1414 if (tp.saw_tstamp) {
1415 tw->ts_recent = tp.rcv_tsval;
1416 tw->ts_recent_stamp = xtime.tv_sec;
1419 tcp_tw_put(tw);
1420 return TCP_TW_SUCCESS;
1423 /* Out of window segment.
1425 All such segments are ACKed immediately.
1427 The only exception is a new SYN. We accept it if it is
1428 not an old duplicate and we are not in danger of being killed
1429 by delayed old duplicates. The RFC check, that it carries a
1430 newer sequence number, works at rates <40Mbit/sec.
1431 However, if PAWS works, it is reliable AND, even more,
1432 we may even relax the silly seq space cutoff.
1434 RED-PEN: we violate the main RFC requirement: if this SYN turns out to be
1435 an old duplicate (i.e. we receive an RST in reply to the SYN-ACK),
1436 we must return the socket to time-wait state. That is not good,
1437 but not fatal yet.
1440 if (th->syn && !th->rst && !th->ack && !paws_reject &&
1441 (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
1442 (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) {
1443 u32 isn = tw->snd_nxt + 2;
1444 if (isn == 0)
1445 isn++;
1446 TCP_SKB_CB(skb)->when = isn;
1447 return TCP_TW_SYN;
1450 if (paws_reject)
1451 NET_INC_STATS_BH(PAWSEstabRejected);
1453 if(!th->rst) {
1454 /* In this case we must reset the TIMEWAIT timer.
1456 * If it is an ACKless SYN it may be both an old duplicate
1457 * and a new good SYN with a random sequence number <rcv_nxt.
1458 * Do not reschedule in the latter case.
1460 if (paws_reject || th->ack)
1461 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
1463 /* Send ACK. Note, we do not put the bucket,
1464 * it will be released by caller.
1466 return TCP_TW_ACK;
1468 tcp_tw_put(tw);
1469 return TCP_TW_SUCCESS;
1472 /* Enter the time wait state. This is called with locally disabled BH.
1473 * Essentially we whip up a timewait bucket, copy the
1474 * relevant info into it from the SK, and mess with hash chains
1475 * and list linkage.
1477 static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
1479 struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
1480 struct tcp_bind_hashbucket *bhead;
1481 struct sock **head, *sktw;
1483 write_lock(&ehead->lock);
1485 /* Step 1: Remove SK from established hash. */
1486 if (sk->pprev) {
1487 if(sk->next)
1488 sk->next->pprev = sk->pprev;
1489 *sk->pprev = sk->next;
1490 sk->pprev = NULL;
1491 sock_prot_dec_use(sk->prot);
1494 /* Step 2: Hash TW into TIMEWAIT half of established hash table. */
1495 head = &(ehead + tcp_ehash_size)->chain;
1496 sktw = (struct sock *)tw;
1497 if((sktw->next = *head) != NULL)
1498 (*head)->pprev = &sktw->next;
1499 *head = sktw;
1500 sktw->pprev = head;
1501 atomic_inc(&tw->refcnt);
1503 write_unlock(&ehead->lock);
1505 /* Step 3: Put TW into bind hash. Original socket stays there too.
1506 Note, that any socket with sk->num!=0 MUST be bound in binding
1507 cache, even if it is closed.
1509 bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
1510 spin_lock(&bhead->lock);
1511 tw->tb = (struct tcp_bind_bucket *)sk->prev;
1512 BUG_TRAP(sk->prev!=NULL);
1513 if ((tw->bind_next = tw->tb->owners) != NULL)
1514 tw->tb->owners->bind_pprev = &tw->bind_next;
1515 tw->tb->owners = (struct sock*)tw;
1516 tw->bind_pprev = &tw->tb->owners;
1517 spin_unlock(&bhead->lock);
1521 * Move a socket to time-wait or dead fin-wait-2 state.
1523 void tcp_time_wait(struct sock *sk, int state, int timeo)
1525 struct tcp_tw_bucket *tw = NULL;
1526 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1527 int recycle_ok = 0;
1529 if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp)
1530 recycle_ok = tp->af_specific->remember_stamp(sk);
1532 if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
1533 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
1535 if(tw != NULL) {
1536 int rto = (tp->rto<<2) - (tp->rto>>1);
1538 /* Give us an identity. */
1539 tw->daddr = sk->daddr;
1540 tw->rcv_saddr = sk->rcv_saddr;
1541 tw->bound_dev_if= sk->bound_dev_if;
1542 tw->num = sk->num;
1543 tw->state = TCP_TIME_WAIT;
1544 tw->substate = state;
1545 tw->sport = sk->sport;
1546 tw->dport = sk->dport;
1547 tw->family = sk->family;
1548 tw->reuse = sk->reuse;
1549 tw->rcv_wscale = tp->rcv_wscale;
1550 atomic_set(&tw->refcnt, 0);
1552 tw->hashent = sk->hashent;
1553 tw->rcv_nxt = tp->rcv_nxt;
1554 tw->snd_nxt = tp->snd_nxt;
1555 tw->rcv_wnd = tcp_receive_window(tp);
1556 tw->syn_seq = tp->syn_seq;
1557 tw->ts_recent = tp->ts_recent;
1558 tw->ts_recent_stamp= tp->ts_recent_stamp;
1559 tw->pprev_death = NULL;
1561 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1562 if(tw->family == PF_INET6) {
1563 memcpy(&tw->v6_daddr,
1564 &sk->net_pinfo.af_inet6.daddr,
1565 sizeof(struct in6_addr));
1566 memcpy(&tw->v6_rcv_saddr,
1567 &sk->net_pinfo.af_inet6.rcv_saddr,
1568 sizeof(struct in6_addr));
1570 #endif
1571 /* Linkage updates. */
1572 __tcp_tw_hashdance(sk, tw);
1574 /* Get the TIME_WAIT timeout firing. */
1575 if (timeo < rto)
1576 timeo = rto;
1578 if (recycle_ok) {
1579 tw->timeout = rto;
1580 } else {
1581 tw->timeout = TCP_TIMEWAIT_LEN;
1582 if (state == TCP_TIME_WAIT)
1583 timeo = TCP_TIMEWAIT_LEN;
1586 tcp_tw_schedule(tw, timeo);
1587 } else {
1588 /* Sorry, if we're out of memory, just CLOSE this
1589 * socket up. We've got bigger problems than
1590 * non-graceful socket closings.
1592 if (net_ratelimit())
1593 printk(KERN_INFO "TCP: time wait bucket table overflow\n");
1596 tcp_update_metrics(sk);
1597 tcp_done(sk);
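/* Note on the timeouts chosen above: rto = (tp->rto<<2) - (tp->rto>>1) is
 * 3.5 times the current retransmission timeout. It is used as a floor for
 * the requested timeout and, when per-destination timestamps allow
 * recycling, as the whole TIME-WAIT period instead of TCP_TIMEWAIT_LEN,
 * in line with the earlier comment that TIME-WAIT only needs to outlive
 * the peer's maximal retransmission timeout.
 */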
1601 * Process the FIN bit. This now behaves as it is supposed to,
1602 * and the FIN takes effect when it is validly part of the sequence
1603 * space, not before, while we still have holes.
1605 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1606 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1607 * TIME-WAIT)
1609 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1610 * close and we go into CLOSING (and later onto TIME-WAIT)
1612 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1615 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1617 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1619 tp->fin_seq = TCP_SKB_CB(skb)->end_seq;
1620 tp->ack.pending = 1;
1621 tp->ack.quick = 0;
1623 sk->shutdown |= RCV_SHUTDOWN;
1625 switch(sk->state) {
1626 case TCP_SYN_RECV:
1627 case TCP_ESTABLISHED:
1628 /* Move to CLOSE_WAIT */
1629 tcp_set_state(sk, TCP_CLOSE_WAIT);
1630 break;
1632 case TCP_CLOSE_WAIT:
1633 case TCP_CLOSING:
1634 /* Received a retransmission of the FIN, do
1635 * nothing.
1637 break;
1638 case TCP_LAST_ACK:
1639 /* RFC793: Remain in the LAST-ACK state. */
1640 break;
1642 case TCP_FIN_WAIT1:
1643 /* This case occurs when a simultaneous close
1644 * happens, we must ack the received FIN and
1645 * enter the CLOSING state.
1647 tcp_set_state(sk, TCP_CLOSING);
1648 break;
1649 case TCP_FIN_WAIT2:
1650 /* Received a FIN -- send ACK and enter TIME_WAIT. */
1651 tcp_send_ack(sk);
1652 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
1653 break;
1654 default:
1655 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1656 * cases we should never reach this piece of code.
1658 printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1659 break;
1662 /* It _is_ possible, that we have something out-of-order _after_ FIN.
1663 * Probably, we should reset in this case. For now drop them.
1665 __skb_queue_purge(&tp->out_of_order_queue);
1666 if (tp->sack_ok)
1667 tp->num_sacks = 0;
1669 if (!sk->dead) {
1670 sk->state_change(sk);
1672 /* Do not send POLL_HUP for half duplex close. */
1673 if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
1674 sk_wake_async(sk, 1, POLL_HUP);
1675 else
1676 sk_wake_async(sk, 1, POLL_IN);
1680 /* These routines update the SACK block as out-of-order packets arrive or
1681 * in-order packets close up the sequence space.
1683 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1685 int this_sack, num_sacks = tp->num_sacks;
1686 struct tcp_sack_block *swalk = &tp->selective_acks[0];
1688 /* If more than one SACK block, see if the recent change to SP eats into
1689 * or hits the sequence space of other SACK blocks, if so coalesce.
1691 if(num_sacks != 1) {
1692 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1693 if(swalk == sp)
1694 continue;
1696 /* First case, bottom of SP moves into top of the
1697 * sequence space of SWALK.
1699 if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1700 sp->start_seq = swalk->start_seq;
1701 goto coalesce;
1703 /* Second case, top of SP moves into bottom of the
1704 * sequence space of SWALK.
1706 if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1707 sp->end_seq = swalk->end_seq;
1708 goto coalesce;
1712 /* SP is the only SACK, or no coalescing cases found. */
1713 return;
1715 coalesce:
1716 /* Zap SWALK, by moving every further SACK up by one slot.
1717 * Decrease num_sacks.
1719 for(; this_sack < num_sacks-1; this_sack++, swalk++) {
1720 struct tcp_sack_block *next = (swalk + 1);
1721 swalk->start_seq = next->start_seq;
1722 swalk->end_seq = next->end_seq;
1724 tp->num_sacks--;
1727 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1729 __u32 tmp;
1731 tmp = sack1->start_seq;
1732 sack1->start_seq = sack2->start_seq;
1733 sack2->start_seq = tmp;
1735 tmp = sack1->end_seq;
1736 sack1->end_seq = sack2->end_seq;
1737 sack2->end_seq = tmp;
1740 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1742 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1743 struct tcp_sack_block *sp = &tp->selective_acks[0];
1744 int cur_sacks = tp->num_sacks;
1746 if (!cur_sacks)
1747 goto new_sack;
1749 /* Optimize for the common case, new ofo frames arrive
1750 * "in order". ;-) This also satisfies the requirements
1751 * of RFC2018 about ordering of SACKs.
1753 if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1754 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1755 tcp_sack_maybe_coalesce(tp, sp);
1756 } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1757 /* Re-ordered arrival, in this case, can be optimized
1758 * as well.
1760 sp->start_seq = TCP_SKB_CB(skb)->seq;
1761 tcp_sack_maybe_coalesce(tp, sp);
1762 } else {
1763 struct tcp_sack_block *swap = sp + 1;
1764 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1766 /* Oh well, we have to move things around.
1767 * Try to find a SACK we can tack this onto.
1770 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1771 if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1772 (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1773 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1774 swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1775 else
1776 swap->start_seq = TCP_SKB_CB(skb)->seq;
1777 tcp_sack_swap(sp, swap);
1778 tcp_sack_maybe_coalesce(tp, sp);
1779 return;
1783 /* Could not find an adjacent existing SACK, build a new one,
1784 * put it at the front, and shift everyone else down. We
1785 * always know there is at least one SACK present already here.
1787 * If the sack array is full, forget about the last one.
1789 if (cur_sacks >= max_sacks) {
1790 cur_sacks--;
1791 tp->num_sacks--;
1793 while(cur_sacks >= 1) {
1794 struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1795 struct tcp_sack_block *prev = (this - 1);
1796 this->start_seq = prev->start_seq;
1797 this->end_seq = prev->end_seq;
1798 cur_sacks--;
1801 new_sack:
1802 /* Build the new head SACK, and we're done. */
1803 sp->start_seq = TCP_SKB_CB(skb)->seq;
1804 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1805 tp->num_sacks++;
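/* The max_sacks limit used above (4, or 3 when timestamps are enabled)
 * follows from the 40 bytes of TCP option space: each SACK block costs
 * 8 bytes plus a small option header, so four blocks fit on their own,
 * but alongside the 12 bytes of the aligned timestamp option only three
 * blocks can be advertised.
 */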
1809 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1811 struct tcp_sack_block *sp = &tp->selective_acks[0];
1812 int num_sacks = tp->num_sacks;
1813 int this_sack;
1815 /* This is an in order data segment _or_ an out-of-order SKB being
1816 * moved to the receive queue, so we know this removed SKB will eat
1817 * from the front of a SACK.
1819 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1820 /* Check if the start of the sack is covered by skb. */
1821 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1822 before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1823 break;
1826 /* This should only happen if so many SACKs get built that some get
1827 * pushed out before we get here, or we eat some in sequence packets
1828 * which are before the first SACK block.
1830 if(this_sack >= num_sacks)
1831 return;
1833 sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1834 if(!before(sp->start_seq, sp->end_seq)) {
1835 /* Zap this SACK, by moving forward any other SACKS. */
1836 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1837 struct tcp_sack_block *next = (sp + 1);
1838 sp->start_seq = next->start_seq;
1839 sp->end_seq = next->end_seq;
1841 tp->num_sacks--;
1845 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1847 struct tcp_sack_block *sp = &tp->selective_acks[0];
1848 int num_sacks = tp->num_sacks;
1849 int this_sack;
1851 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1852 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1853 break;
1855 if(this_sack >= num_sacks)
1856 return;
1857 sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1861 /* This one checks to see if we can put data from the
1862 * out_of_order queue into the receive_queue.
1864 static void tcp_ofo_queue(struct sock *sk)
1866 struct sk_buff *skb;
1867 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1869 while ((skb = skb_peek(&tp->out_of_order_queue))) {
1870 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1871 break;
1873 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1874 SOCK_DEBUG(sk, "ofo packet was already received \n");
1875 __skb_unlink(skb, skb->list);
1876 kfree_skb(skb);
1877 continue;
1879 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1880 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1881 TCP_SKB_CB(skb)->end_seq);
1883 if(tp->sack_ok)
1884 tcp_sack_remove_skb(tp, skb);
1885 __skb_unlink(skb, skb->list);
1886 __skb_queue_tail(&sk->receive_queue, skb);
1887 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1888 if(skb->h.th->fin)
1889 tcp_fin(skb, sk, skb->h.th);
1893 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1895 struct sk_buff *skb1;
1896 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1897 int eaten = 0;
1899 /* Queue data for delivery to the user.
1900 * Packets in sequence go to the receive queue.
1901 * Out of sequence packets to the out_of_order_queue.
1903 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1904 /* Ok. In sequence. */
1905 if (tp->ucopy.task == current &&
1906 tp->copied_seq == tp->rcv_nxt &&
1907 tp->ucopy.len &&
1908 sk->lock.users &&
1909 !tp->urg_data) {
1910 int chunk = min(skb->len, tp->ucopy.len);
1912 __set_current_state(TASK_RUNNING);
1914 local_bh_enable();
1915 if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) {
1916 sk->err = EFAULT;
1917 sk->error_report(sk);
1919 local_bh_disable();
1920 tp->ucopy.len -= chunk;
1921 tp->copied_seq += chunk;
1922 eaten = (chunk == skb->len && !skb->h.th->fin);
1925 if (!eaten) {
1926 queue_and_out:
1927 skb_set_owner_r(skb, sk);
1928 __skb_queue_tail(&sk->receive_queue, skb);
1930 dst_confirm(sk->dst_cache);
1931 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1932 if(skb->len)
1933 tcp_event_data_recv(tp, skb);
1934 if(skb->h.th->fin)
1935 tcp_fin(skb, sk, skb->h.th);
1937 /* This may have eaten into a SACK block. */
1938 if(tp->sack_ok && tp->num_sacks)
1939 tcp_sack_remove_skb(tp, skb);
1940 tcp_ofo_queue(sk);
1942 /* Turn on fast path. */
1943 if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
1944 #ifdef TCP_FORMAL_WINDOW
1945 tcp_receive_window(tp) &&
1946 #endif
1947 !tp->urg_data)
1948 tcp_fast_path_on(tp);
1950 if (eaten) {
1951 kfree_skb(skb);
1952 } else if (!sk->dead)
1953 sk->data_ready(sk, 0);
1954 return;
1957 /* An old packet, either a retransmit or some packet got lost. */
1958 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1959 /* A retransmit, 2nd most common case. Force an immediate ack.
1961 * It is impossible, seq is checked by top level.
1963 NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq));
1964 tcp_enter_quickack_mode(tp);
1965 tp->ack.pending = 1;
1966 kfree_skb(skb);
1967 return;
1970 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1971 /* Partial packet, seq < rcv_next < end_seq */
1972 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1973 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1974 TCP_SKB_CB(skb)->end_seq);
1976 goto queue_and_out;
1979 /* Ok. This is an out_of_order segment, force an ack. */
1980 tp->ack.pending = 1;
1982 /* Disable header prediction. */
1983 tp->pred_flags = 0;
1986 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1987 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1989 skb_set_owner_r(skb, sk);
1991 if (skb_peek(&tp->out_of_order_queue) == NULL) {
1992 /* Initial out of order segment, build 1 SACK. */
1993 if(tp->sack_ok) {
1994 tp->num_sacks = 1;
1995 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1996 tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1998 __skb_queue_head(&tp->out_of_order_queue,skb);
1999 } else {
2000 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
2001 /* Already there. */
2002 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
2003 if (skb->len >= skb1->len) {
2004 if(tp->sack_ok)
2005 tcp_sack_extend(tp, skb1, skb);
2006 __skb_append(skb1, skb);
2007 __skb_unlink(skb1, skb1->list);
2008 kfree_skb(skb1);
2009 } else {
2010 /* A duplicate, smaller than what is in the
2011 * out-of-order queue right now, toss it.
2013 kfree_skb(skb);
2015 break;
2018 if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
2019 __skb_append(skb1, skb);
2020 if(tp->sack_ok)
2021 tcp_sack_new_ofo_skb(sk, skb);
2022 break;
2025 /* See if we've hit the start. If so insert. */
2026 if (skb1 == skb_peek(&tp->out_of_order_queue)) {
2027 __skb_queue_head(&tp->out_of_order_queue,skb);
2028 if(tp->sack_ok)
2029 tcp_sack_new_ofo_skb(sk, skb);
2030 break;
2034 return;
2039 * This routine handles the data. If there is room in the buffer,
2040 * it will already have been moved into it. If there is no
2041 * room, then we will just have to discard the packet.
2044 static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
2046 struct tcphdr *th;
2047 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2049 th = skb->h.th;
2050 skb_pull(skb, th->doff*4);
2051 skb_trim(skb, len - (th->doff*4));
2053 if (skb->len == 0 && !th->fin)
2054 goto drop;
2057 * If our receive queue has grown past its limits, shrink it.
2058 * Make sure to do this before moving rcv_nxt, otherwise
2059 * data might be acked that we don't have enough room for.
2061 if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
2062 if (prune_queue(sk) < 0) {
2063 /* Still not enough room. That can happen when
2064 * skb->truesize differs significantly from skb->len.
2066 goto drop;
2070 tcp_data_queue(sk, skb);
2072 if (before(tp->rcv_nxt, tp->copied_seq)) {
2073 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
2074 tp->rcv_nxt = tp->copied_seq;
2076 return;
2078 drop:
2079 kfree_skb(skb);
2082 /* When an incoming ACK has allowed us to free some skbs from the write_queue,
2083 * we remember this in the flag tp->sorry and wake up the socket on exit
2084 * from the tcp input handler. The handler has probably already eaten this space,
2085 * sending ACKs and cloned frames from tcp_write_xmit().
2087 static __inline__ void tcp_new_space(struct sock *sk)
2089 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2090 struct socket *sock;
2092 tp->sorry = 0;
2094 if (sock_wspace(sk) >= tcp_min_write_space(sk) &&
2095 (sock = sk->socket) != NULL) {
2096 clear_bit(SOCK_NOSPACE, &sock->flags);
2098 if (sk->sleep && waitqueue_active(sk->sleep))
2099 wake_up_interruptible(sk->sleep);
2101 if (sock->fasync_list)
2102 sock_wake_async(sock, 2, POLL_OUT);
2106 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
2108 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2110 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
2111 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
2112 tcp_write_xmit(sk))
2113 tcp_check_probe_timer(sk, tp);
2116 static __inline__ void tcp_data_snd_check(struct sock *sk)
2118 struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
2120 if (skb != NULL)
2121 __tcp_data_snd_check(sk, skb);
2125 * Check if sending an ack is needed.
2127 static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
2129 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2131 /* This also takes care of updating the window.
2132 * This if statement needs to be simplified.
2134 * Rules for delaying an ack:
2135 * - delay time <= 0.5 HZ
2136 * - we don't have a window update to send
2137 * - must send at least every 2 full sized packets
2138 * - must send an ACK if we have any out of order data
2140 * With an extra heuristic to handle packet-loss
2141 * situations and also to help the sender leave slow
2142 * start in an expedient manner (see the sketch below).
2145 /* More than one full frame received or... */
2146 if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
2147 #ifdef TCP_MORE_COARSE_ACKS
2148 /* Avoid sending an immediate ACK from the input path if it
2149 * does not advance the window far enough; tcp_recvmsg() will do this.
2151 && (!sysctl_tcp_retrans_collapse || __tcp_select_window(sk) >= tp->rcv_wnd)
2152 #endif
2153 ) ||
2154 /* We ACK each frame or... */
2155 tcp_in_quickack_mode(tp) ||
2156 /* We have out of order data or */
2157 (ofo_possible &&
2158 skb_peek(&tp->out_of_order_queue) != NULL)) {
2159 /* Then ack it now */
2160 tcp_send_ack(sk);
2161 } else {
2162 /* Else, send delayed ack. */
2163 tcp_send_delayed_ack(sk);
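/*
 * Illustrative sketch, not called anywhere: the "ACK now or delay" decision
 * above, with the TCP_MORE_COARSE_ACKS refinement omitted.  The helper name
 * is hypothetical.
 */
static __inline__ int tcp_should_ack_now_sketch(struct tcp_opt *tp, int ofo_possible)
{
	/* More than one full frame received and not yet acknowledged... */
	if (tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss)
		return 1;
	/* ...or we are currently acking every frame... */
	if (tcp_in_quickack_mode(tp))
		return 1;
	/* ...or out-of-order data is queued. */
	if (ofo_possible && skb_peek(&tp->out_of_order_queue) != NULL)
		return 1;
	return 0;
}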
2167 static __inline__ void tcp_ack_snd_check(struct sock *sk)
2169 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2170 if (tp->ack.pending == 0) {
2171 /* We sent a data segment already. */
2172 return;
2174 __tcp_ack_snd_check(sk, 1);
2179 * This routine is only called when we have urgent data
2180 * signalled. It's the 'slow' part of tcp_urg. It could be
2181 * moved inline now as tcp_urg is only called from one
2182 * place. We handle URGent data wrongly; we have to, as
2183 * BSD still doesn't use the correction from RFC961.
2184 * For 1003.1g we should support a new option TCP_STDURG to permit
2185 * either form (or just set the sysctl tcp_stdurg).
2188 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
2190 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2191 u32 ptr = ntohs(th->urg_ptr);
2193 if (ptr && !sysctl_tcp_stdurg)
2194 ptr--;
2195 ptr += ntohl(th->seq);
2197 /* Ignore urgent data that we've already seen and read. */
2198 if (after(tp->copied_seq, ptr))
2199 return;
2201 /* Do we already have a newer (or duplicate) urgent pointer? */
2202 if (tp->urg_data && !after(ptr, tp->urg_seq))
2203 return;
2205 /* Tell the world about our new urgent pointer. */
2206 if (sk->proc != 0) {
2207 if (sk->proc > 0)
2208 kill_proc(sk->proc, SIGURG, 1);
2209 else
2210 kill_pg(-sk->proc, SIGURG, 1);
2211 sk_wake_async(sk, 3, POLL_PRI);
2214 /* We may be adding urgent data when the last byte read was
2215 * urgent. To do this requires some care. We cannot just ignore
2216 * tp->copied_seq since we would read the last urgent byte again
2217 * as data, nor can we alter copied_seq until this data arrives
2218 * or we break the semantics of SIOCATMARK (and thus sockatmark())
2220 if (tp->urg_seq == tp->copied_seq)
2221 tp->copied_seq++; /* Move the copied sequence on correctly */
2222 tp->urg_data = TCP_URG_NOTYET;
2223 tp->urg_seq = ptr;
2225 /* Disable header prediction. */
2226 tp->pred_flags = 0;
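/*
 * Illustrative sketch, not called anywhere: the absolute sequence number of
 * the urgent byte, as computed above.  With stdurg == 0 we follow the BSD
 * interpretation, where th->urg_ptr points one past the urgent byte, hence
 * the decrement.  The helper name is hypothetical.
 */
static __inline__ u32 tcp_urg_seq_sketch(struct tcphdr *th, int stdurg)
{
	u32 ptr = ntohs(th->urg_ptr);

	if (ptr && !stdurg)
		ptr--;
	return ntohl(th->seq) + ptr;
}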
2229 /* This is the 'fast' part of urgent handling. */
2230 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
2232 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2234 /* Check if we get a new urgent pointer - normally not. */
2235 if (th->urg)
2236 tcp_check_urg(sk,th);
2238 /* Do we wait for any urgent data? - normally not... */
2239 if (tp->urg_data == TCP_URG_NOTYET) {
2240 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
2242 /* Is the urgent pointer pointing into this packet? */
2243 if (ptr < len) {
2244 tp->urg_data = TCP_URG_VALID | *(ptr + (unsigned char *) th);
2245 if (!sk->dead)
2246 sk->data_ready(sk,0);
2251 /* Clean the out_of_order queue if we can, trying to get
2252 * the socket within its memory limits again.
2254 * Return less than zero if we should start dropping frames
2255 * until the socket owning process reads some of the data
2256 * to stabilize the situation.
2258 static int prune_queue(struct sock *sk)
2260 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2261 struct sk_buff *skb;
2262 int pruned = 0;
2264 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
2266 NET_INC_STATS_BH(PruneCalled);
2268 /* First, purge the out_of_order queue. */
2269 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2270 if(skb != NULL) {
2271 /* Free it all. */
2272 do {
2273 pruned += skb->len;
2274 net_statistics[smp_processor_id()*2].OfoPruned += skb->len;
2275 kfree_skb(skb);
2276 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
2277 } while(skb != NULL);
2279 /* Reset SACK state. A conforming SACK implementation will
2280 * do the same at a timeout based retransmit. When a connection
2281 * is in a sad state like this, we care only about integrity
2282 * of the connection not performance.
2284 if(tp->sack_ok)
2285 tp->num_sacks = 0;
2288 /* If we are really being abused, tell the caller to silently
2289 * drop receive data on the floor. It will get retransmitted
2290 * and hopefully then we'll have sufficient space.
2292 * We used to try to purge the in-order packets too, but that
2293 * turns out to be deadly and fraught with races. Consider:
2295 * 1) If we acked the data, we absolutely cannot drop the
2296 * packet. This data would then never be retransmitted.
2297 * 2) It is possible, with a proper sequence of events involving
2298 * delayed acks and backlog queue handling, to have the user
2299 * read the data before it gets acked. The previous code
2300 * here got this wrong, and it led to data corruption.
2301 * 3) Too many state changes happen when the FIN arrives, so once
2302 * we've seen it we can't remove any in-order data safely.
2304 * The net result is that removing in-order receive data is too
2305 * complex for anyone's sanity. So we don't do it anymore. But
2306 * if we are really having our buffer space abused we stop accepting
2307 * new receive data.
2309 * 8) The arguments are interesting, but I cannot even imagine
2310 * what kind of argument could force us to drop NICE, ALREADY
2311 * RECEIVED DATA only to get one more packet. --ANK
2313 * FIXME: it should recompute SACK state and only remove enough
2314 * buffers to get into bounds again. The current scheme loses
2315 * badly sometimes on links with large RTT, especially when
2316 * the driver has high overhead per skb.
2317 * (increasing the rcvbuf is not enough because it inflates
2318 * the window too, effectively disabling flow control) -AK
2320 * Mmm... Why not scale it separately then? Just replace
2321 * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale
2322 * and adjust it dynamically, when TCP window flow control
2323 * fails? -ANK
2326 tp->ack.quick = 0;
2328 if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
2329 return 0;
2331 NET_INC_STATS_BH(RcvPruned);
2333 /* Massive buffer overcommit. */
2334 return -1;
2337 static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
2339 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2340 int chunk = skb->len - hlen;
2341 int err;
2343 local_bh_enable();
2344 if (skb->ip_summed==CHECKSUM_UNNECESSARY)
2345 err = memcpy_toiovec(tp->ucopy.iov, skb->h.raw + hlen, chunk);
2346 else
2347 err = copy_and_csum_toiovec(tp->ucopy.iov, skb, hlen);
2349 if (!err) {
2350 update:
2351 tp->ucopy.len -= chunk;
2352 tp->copied_seq += chunk;
2353 local_bh_disable();
2354 return 0;
2357 if (err == -EFAULT) {
2358 sk->err = EFAULT;
2359 sk->error_report(sk);
2360 goto update;
2363 local_bh_disable();
2364 return err;
2367 static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
2369 int result;
2371 if (sk->lock.users) {
2372 local_bh_enable();
2373 result = __tcp_checksum_complete(skb);
2374 local_bh_disable();
2375 } else {
2376 result = __tcp_checksum_complete(skb);
2378 return result;
2381 static __inline__ int
2382 tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
2384 return skb->ip_summed != CHECKSUM_UNNECESSARY &&
2385 __tcp_checksum_complete_user(sk, skb);
2389 * TCP receive function for the ESTABLISHED state.
2391 * It is split into a fast path and a slow path. The fast path is
2392 * disabled when:
2393 * - A zero window was announced from us - zero window probing
2394 * is only handled properly in the slow path.
2395 * [ NOTE: actually, this was done incorrectly and nobody ever noticed
2396 * it! The reason is clear: 1. Correct senders do not send
2397 * into a zero window. 2. Even if a sender sends into a zero window,
2398 * nothing terrible occurs.
2400 * For now I have cleaned this up and the fast path really is always disabled
2401 * when the window is zero, but I would be happier to remove these
2402 * checks. The code would only be cleaner and _faster_. --ANK
2404 * Later note. I've just found that the slow path also accepts
2405 * out of window segments; look at tcp_sequence(). So...
2406 * that is the final argument: I repair it all and guard the
2407 * repaired code with TCP_FORMAL_WINDOW.
2408 * [ I remember one rhyme from a children's book. (I apologize,
2409 * the translation is not rhymed 8)): people in one (Jewish) village
2410 * decided to build a sauna, but split into two parties.
2411 * The first insisted that the battens should not be planed,
2412 * the other objected that feet would suffer from splinters,
2413 * and the first countered that planed wet battens are too slippery
2414 * and people will fall, and that is much more serious!
2415 * Naturally, they all went to the rabbi.
2416 * After some thinking, he judged: "Do not be lazy!
2417 * Certainly, plane the battens! But lay them planed side down."
2421 * - Out of order segments arrived.
2422 * - Urgent data is expected.
2423 * - There is no buffer space left
2424 * - Unexpected TCP flags/window values/header lengths are received
2425 * (detected by checking the TCP header against pred_flags)
2426 * - Data is sent in both directions. Fast path only supports pure senders
2427 * or pure receivers (this means either the sequence number or the ack
2428 * value must stay constant)
2429 * - Unexpected TCP option.
2431 * When these conditions are not satisfied it drops into a standard
2432 * receive procedure patterned after RFC793 to handle all cases.
2433 * The first three cases are guaranteed by proper pred_flags setting,
2434 * the rest is checked inline. Fast processing is turned on in
2435 * tcp_data_queue when everything is OK.
2437 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
2438 struct tcphdr *th, unsigned len)
2440 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2443 * Header prediction.
2444 * The code loosely follows the one in the famous
2445 * "30 instruction TCP receive" Van Jacobson mail.
2447 * Van's trick is to deposit buffers into the socket queue
2448 * on a device interrupt, and to call the tcp_recv function
2449 * in the receiving process's context to checksum and copy
2450 * the buffer to user space. Smart...
2452 * Our current scheme is not silly either, but we take the
2453 * extra cost of the net_bh soft interrupt processing...
2454 * We also do checksum and copy, but from device to kernel.
2457 /* RED-PEN. Using static variables to pass function arguments
2458 * cannot be a good idea...
2460 tp->saw_tstamp = 0;
2462 * pred_flags is 0xS?10 << 16 + snd_wnd
2463 * if header prediction is to be made.
2464 * 'S' will always be tp->tcp_header_len >> 2
2465 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
2466 * turn it off (when there are holes in the receive
2467 * space, for instance).
2468 * The PSH flag is ignored.
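#if 0
	/* Illustrative sketch (never compiled): with the layout described
	 * above, a matching pred_flags value would be built roughly like
	 * this, in network byte order so that it can be compared directly
	 * against tcp_flag_word(th).  "unscaled_snd_wnd" is a hypothetical
	 * name for the 16-bit window field as it appears on the wire.
	 */
	__u32 sketch_pred_flags =
		htonl(((tp->tcp_header_len >> 2) << 28) |	/* 'S': data offset */
		      (0x10 << 16) |				/* ACK flag         */
		      unscaled_snd_wnd);			/* low 16 bits: wnd */
#endif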
2471 if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags &&
2472 TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
2473 int tcp_header_len = tp->tcp_header_len;
2475 /* Timestamp header prediction: tcp_header_len
2476 * is automatically equal to th->doff*4 due to pred_flags
2477 * match.
2480 /* Check timestamp */
2481 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
2482 __u32 *ptr = (__u32 *)(th + 1);
2484 /* No? Slow path! */
2485 if (*ptr != __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
2486 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
2487 goto slow_path;
2489 tp->saw_tstamp = 1;
2490 ++ptr;
2491 tp->rcv_tsval = ntohl(*ptr);
2492 ++ptr;
2493 tp->rcv_tsecr = ntohl(*ptr);
2495 /* If PAWS failed, check it more carefully in slow path */
2496 if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0)
2497 goto slow_path;
2499 /* Predicted packet is in window by definition.
2500 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
2501 * Hence, check seq<=rcv_wup reduces to:
2503 if (tp->rcv_nxt == tp->rcv_wup) {
2504 tp->ts_recent = tp->rcv_tsval;
2505 tp->ts_recent_stamp = xtime.tv_sec;
2509 if (len <= tcp_header_len) {
2510 /* Bulk data transfer: sender */
2511 if (len == tcp_header_len) {
2512 /* We know that such packets are checksummed
2513 * on entry.
2515 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2516 TCP_SKB_CB(skb)->ack_seq, len);
2517 kfree_skb(skb);
2518 tcp_data_snd_check(sk);
2519 if (tp->sorry)
2520 tcp_new_space(sk);
2521 return 0;
2522 } else { /* Header too small */
2523 TCP_INC_STATS_BH(TcpInErrs);
2524 goto discard;
2526 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) {
2527 int eaten = 0;
2529 if (tp->ucopy.task == current &&
2530 tp->copied_seq == tp->rcv_nxt &&
2531 len - tcp_header_len <= tp->ucopy.len &&
2532 sk->lock.users) {
2533 eaten = 1;
2535 NET_INC_STATS_BH(TCPHPHitsToUser);
2537 __set_current_state(TASK_RUNNING);
2539 if (tcp_copy_to_iovec(sk, skb, tcp_header_len))
2540 goto csum_error;
2542 __skb_pull(skb,tcp_header_len);
2544 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
2545 } else {
2546 if (tcp_checksum_complete_user(sk, skb))
2547 goto csum_error;
2549 if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
2550 goto step5;
2552 NET_INC_STATS_BH(TCPHPHits);
2554 /* Bulk data transfer: receiver */
2555 __skb_pull(skb,tcp_header_len);
2557 /* DO NOT notify forward progress here.
2558 * It saves a dozen CPU instructions in the fast path. --ANK
2559 * And where is it signaled then? -AK
2560 * Nowhere. 8) --ANK
2562 __skb_queue_tail(&sk->receive_queue, skb);
2563 skb_set_owner_r(skb, sk);
2565 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
2567 /* FIN bit check is not done since if FIN is set in
2568 * this frame, the pred_flags won't match up. -DaveM
2570 sk->data_ready(sk, 0);
2573 tcp_event_data_recv(tp, skb);
2575 #ifdef TCP_MORE_COARSE_ACKS
2576 if (eaten) {
2577 if (tcp_in_quickack_mode(tp)) {
2578 tcp_send_ack(sk);
2579 } else {
2580 tcp_send_delayed_ack(sk);
2582 } else
2583 #endif
2584 __tcp_ack_snd_check(sk, 0);
2586 if (eaten)
2587 kfree_skb(skb);
2588 return 0;
2590 /* Packet is in sequence, flags are trivial;
2591 * only ACK is strange. Jump to step 5.
2593 if (tcp_checksum_complete_user(sk, skb))
2594 goto csum_error;
2595 goto step5;
2598 slow_path:
2599 if (tcp_checksum_complete_user(sk, skb))
2600 goto csum_error;
2603 * RFC1323: H1. Apply PAWS check first.
2605 if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
2606 tcp_paws_discard(tp, skb)) {
2607 if (!th->rst) {
2608 NET_INC_STATS_BH(PAWSEstabRejected);
2609 tcp_send_ack(sk);
2610 goto discard;
2612 /* Resets are accepted even if PAWS failed.
2614 ts_recent update must be made after we are sure
2615 that the packet is in window.
2620 * Standard slow path.
2623 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
2624 /* RFC793, page 37: "In all states except SYN-SENT, all reset
2625 * (RST) segments are validated by checking their SEQ-fields."
2626 * And page 69: "If an incoming segment is not acceptable,
2627 * an acknowledgment should be sent in reply (unless the RST bit
2628 * is set, if so drop the segment and return)".
2630 if (th->rst)
2631 goto discard;
2632 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
2633 SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
2634 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2635 tp->rcv_wup, tp->rcv_wnd);
2637 tcp_enter_quickack_mode(tp);
2638 tcp_send_ack(sk);
2639 NET_INC_STATS_BH(DelayedACKLost);
2640 goto discard;
2643 if(th->rst) {
2644 tcp_reset(sk);
2645 goto discard;
2648 if (tp->saw_tstamp) {
2649 tcp_replace_ts_recent(sk, tp,
2650 TCP_SKB_CB(skb)->seq);
2653 if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2654 SOCK_DEBUG(sk, "syn in established state\n");
2655 TCP_INC_STATS_BH(TcpInErrs);
2656 tcp_reset(sk);
2657 return 1;
2660 step5:
2661 if(th->ack)
2662 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
2664 /* Process urgent data. */
2665 tcp_urg(sk, th, len);
2667 /* step 7: process the segment text */
2668 tcp_data(skb, sk, len);
2670 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
2671 if(sk->state != TCP_CLOSE) {
2672 tcp_data_snd_check(sk);
2673 tcp_ack_snd_check(sk);
2674 if (tp->sorry)
2675 tcp_new_space(sk);
2678 return 0;
2680 csum_error:
2681 TCP_INC_STATS_BH(TcpInErrs);
2683 discard:
2684 kfree_skb(skb);
2685 return 0;
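/*
 * Illustrative sketch, not called anywhere: the fast-path PAWS test above
 * ("(s32)(tp->rcv_tsval - tp->ts_recent) < 0") relies on signed 32-bit
 * subtraction to order values in a wrapping space, the same trick used by
 * before()/after() for sequence numbers.  The helper name is hypothetical.
 */
static __inline__ int tcp_tsval_not_older_sketch(__u32 tsval, __u32 ts_recent)
{
	return (__s32)(tsval - ts_recent) >= 0;
}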
2689 /* This is not only more efficient than what we used to do, it eliminates
2690 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
2692 * Actually, we could avoid lots of memory writes here. The tp of the
2693 * listening socket contains all the necessary default parameters.
2695 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
2697 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
2699 if(newsk != NULL) {
2700 struct tcp_opt *newtp;
2701 #ifdef CONFIG_FILTER
2702 struct sk_filter *filter;
2703 #endif
2705 memcpy(newsk, sk, sizeof(*newsk));
2706 newsk->state = TCP_SYN_RECV;
2708 /* SANITY */
2709 newsk->pprev = NULL;
2710 newsk->prev = NULL;
2712 /* Clone the TCP header template */
2713 newsk->dport = req->rmt_port;
2715 sock_lock_init(newsk);
2716 bh_lock_sock(newsk);
2718 atomic_set(&newsk->rmem_alloc, 0);
2719 skb_queue_head_init(&newsk->receive_queue);
2720 atomic_set(&newsk->wmem_alloc, 0);
2721 skb_queue_head_init(&newsk->write_queue);
2722 atomic_set(&newsk->omem_alloc, 0);
2724 newsk->done = 0;
2725 newsk->proc = 0;
2726 newsk->backlog.head = newsk->backlog.tail = NULL;
2727 skb_queue_head_init(&newsk->error_queue);
2728 newsk->write_space = tcp_write_space;
2729 #ifdef CONFIG_FILTER
2730 if ((filter = newsk->filter) != NULL)
2731 sk_filter_charge(newsk, filter);
2732 #endif
2734 /* Now setup tcp_opt */
2735 newtp = &(newsk->tp_pinfo.af_tcp);
2736 newtp->pred_flags = 0;
2737 newtp->rcv_nxt = req->rcv_isn + 1;
2738 newtp->snd_nxt = req->snt_isn + 1;
2739 newtp->snd_una = req->snt_isn + 1;
2740 newtp->snd_sml = req->snt_isn + 1;
2742 tcp_delack_init(newtp);
2743 if (skb->len >= 536)
2744 newtp->ack.last_seg_size = skb->len;
2746 tcp_prequeue_init(newtp);
2748 newtp->snd_wl1 = req->rcv_isn;
2749 newtp->snd_wl2 = req->snt_isn;
2751 newtp->retransmits = 0;
2752 newtp->backoff = 0;
2753 newtp->srtt = 0;
2754 newtp->mdev = TCP_TIMEOUT_INIT;
2755 newtp->rto = TCP_TIMEOUT_INIT;
2757 newtp->packets_out = 0;
2758 newtp->fackets_out = 0;
2759 newtp->retrans_out = 0;
2760 newtp->snd_ssthresh = 0x7fffffff;
2762 /* So many TCP implementations out there (incorrectly) count the
2763 * initial SYN frame in their delayed-ACK and congestion control
2764 * algorithms that we must have the following bandaid to talk
2765 * efficiently to them. -DaveM
2767 newtp->snd_cwnd = 2;
2768 newtp->snd_cwnd_cnt = 0;
2769 newtp->high_seq = 0;
2771 newtp->dup_acks = 0;
2772 tcp_init_xmit_timers(newsk);
2773 skb_queue_head_init(&newtp->out_of_order_queue);
2774 newtp->send_head = newtp->retrans_head = NULL;
2775 newtp->rcv_wup = req->rcv_isn + 1;
2776 newtp->write_seq = req->snt_isn + 1;
2777 newtp->copied_seq = req->rcv_isn + 1;
2779 newtp->saw_tstamp = 0;
2781 newtp->probes_out = 0;
2782 newtp->num_sacks = 0;
2783 newtp->syn_seq = req->rcv_isn;
2784 newtp->fin_seq = req->rcv_isn;
2785 newtp->urg_data = 0;
2786 newtp->listen_opt = NULL;
2787 newtp->accept_queue = newtp->accept_queue_tail = NULL;
2788 /* Deinitialize syn_wait_lock to trap illegal accesses. */
2789 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
2791 /* Back to base struct sock members. */
2792 newsk->err = 0;
2793 newsk->priority = 0;
2794 atomic_set(&newsk->refcnt, 1);
2795 #ifdef INET_REFCNT_DEBUG
2796 atomic_inc(&inet_sock_nr);
2797 #endif
2799 if (newsk->keepopen)
2800 tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
2801 newsk->socket = NULL;
2802 newsk->sleep = NULL;
2804 newtp->tstamp_ok = req->tstamp_ok;
2805 if((newtp->sack_ok = req->sack_ok) != 0)
2806 newtp->num_sacks = 0;
2807 newtp->window_clamp = req->window_clamp;
2808 newtp->rcv_wnd = req->rcv_wnd;
2809 newtp->wscale_ok = req->wscale_ok;
2810 if (newtp->wscale_ok) {
2811 newtp->snd_wscale = req->snd_wscale;
2812 newtp->rcv_wscale = req->rcv_wscale;
2813 } else {
2814 newtp->snd_wscale = newtp->rcv_wscale = 0;
2815 newtp->window_clamp = min(newtp->window_clamp,65535);
2817 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale;
2818 newtp->max_window = newtp->snd_wnd;
2820 if (newtp->tstamp_ok) {
2821 newtp->ts_recent = req->ts_recent;
2822 newtp->ts_recent_stamp = xtime.tv_sec;
2823 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2824 } else {
2825 newtp->ts_recent_stamp = 0;
2826 newtp->tcp_header_len = sizeof(struct tcphdr);
2828 newtp->mss_clamp = req->mss;
2830 return newsk;
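/*
 * Illustrative sketch, not called anywhere: the effective send window
 * derived from an incoming header, as in the snd_wnd assignment above.
 * The advertised 16-bit value is left-shifted by the negotiated scale,
 * except that windows carried in SYN segments are never scaled (RFC 1323).
 * The helper name is hypothetical.
 */
static __inline__ u32 tcp_effective_wnd_sketch(struct tcphdr *th, int snd_wscale)
{
	u32 wnd = ntohs(th->window);

	if (!th->syn)
		wnd <<= snd_wscale;
	return wnd;
}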
2834 * Process an incoming packet for SYN_RECV sockets represented
2835 * as an open_request.
2838 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
2839 struct open_request *req,
2840 struct open_request **prev)
2842 struct tcphdr *th = skb->h.th;
2843 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2844 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
2845 int paws_reject = 0;
2846 struct tcp_opt ttp;
2847 struct sock *child;
2849 ttp.saw_tstamp = 0;
2850 if (th->doff > (sizeof(struct tcphdr)>>2)) {
2851 tcp_parse_options(NULL, th, &ttp, 0);
2853 if (ttp.saw_tstamp) {
2854 ttp.ts_recent = req->ts_recent;
2855 /* We do not store the true timestamp, but it is not required;
2856 * it can be estimated (approximately)
2857 * from other data.
2859 ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
2860 paws_reject = tcp_paws_check(&ttp, th->rst);
2864 /* Check for pure retransmitted SYN. */
2865 if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
2866 flg == TCP_FLAG_SYN &&
2867 !paws_reject) {
2869 * RFC793 draws (incorrectly! It was fixed in RFC1122)
2870 * this case in figure 6 and figure 8, but the formal
2871 * protocol description says NOTHING.
2872 * To be more exact, it says that we should send an ACK,
2873 * because this segment (at least, if it has no data)
2874 * is out of window.
2876 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
2877 * describe the SYN-RECV state. All of the description
2878 * is wrong; we cannot trust it and should
2879 * rely only on common sense and implementation
2880 * experience.
2882 * Enforce "SYN-ACK" according to figure 8, figure 6
2883 * of RFC793, fixed by RFC1122.
2885 req->class->rtx_syn_ack(sk, req, NULL);
2886 return NULL;
2889 /* Further reproduces section "SEGMENT ARRIVES"
2890 for state SYN-RECEIVED of RFC793.
2891 It is broken; however, it fails only
2892 when SYNs are crossed, which is impossible in our
2893 case.
2895 But generally, we should (the RFC lies!) accept an ACK
2896 from a SYNACK both here and in tcp_rcv_state_process().
2897 tcp_rcv_state_process() does not, hence we do not either.
2899 Note that the case is absolutely generic:
2900 we cannot optimize anything here without
2901 violating the protocol. All the checks must be made
2902 before the attempt to create a socket.
2905 /* RFC793: "first check sequence number". */
2907 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
2908 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
2909 /* Out of window: send ACK and drop. */
2910 if (!(flg & TCP_FLAG_RST))
2911 req->class->send_ack(skb, req);
2912 if (paws_reject)
2913 NET_INC_STATS_BH(PAWSEstabRejected);
2914 return NULL;
2917 /* In sequence, PAWS is OK. */
2919 if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
2920 req->ts_recent = ttp.rcv_tsval;
2922 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
2923 /* Truncate SYN, it is out of window starting
2924 at req->rcv_isn+1. */
2925 flg &= ~TCP_FLAG_SYN;
2928 /* RFC793: "second check the RST bit" and
2929 * "fourth, check the SYN bit"
2931 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
2932 goto embryonic_reset;
2934 /* RFC793: "fifth check the ACK field" */
2936 if (!(flg & TCP_FLAG_ACK))
2937 return NULL;
2939 /* Invalid ACK: reset will be sent by listening socket */
2940 if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)
2941 return sk;
2942 /* Also, it would not be such a bad idea to check rcv_tsecr, which
2943 * is essentially an ACK extension; too-early or too-late values
2944 * should cause a reset in unsynchronized states.
2947 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
2948 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
2949 req->acked = 1;
2950 return NULL;
2953 /* OK, ACK is valid, create big socket and
2954 * feed this segment to it. It will repeat all
2955 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
2956 * ESTABLISHED STATE. If it is dropped after the
2957 * socket is created, expect trouble.
2959 child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
2960 if (child == NULL)
2961 goto listen_overflow;
2963 tcp_synq_unlink(tp, req, prev);
2964 tcp_synq_removed(sk, req);
2966 tcp_acceptq_queue(sk, req, child);
2967 return child;
2969 listen_overflow:
2970 if (!sysctl_tcp_abort_on_overflow) {
2971 req->acked = 1;
2972 return NULL;
2975 embryonic_reset:
2976 NET_INC_STATS_BH(EmbryonicRsts);
2977 if (!(flg & TCP_FLAG_RST))
2978 req->class->send_reset(skb);
2980 tcp_synq_drop(sk, req, prev);
2981 return NULL;
2985 * Queue segment on the new socket if the new socket is active,
2986 * otherwise we just shortcircuit this and continue with
2987 * the new socket.
2990 int tcp_child_process(struct sock *parent, struct sock *child,
2991 struct sk_buff *skb)
2993 int ret = 0;
2994 int state = child->state;
2996 if (child->lock.users == 0) {
2997 ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
2999 /* Wakeup parent, send SIGIO */
3000 if (state == TCP_SYN_RECV && child->state != state)
3001 parent->data_ready(parent, 0);
3002 } else {
3003 /* Alas, it is possible again, because we do the lookup
3004 * in the main socket hash table and the lock on the listening
3005 * socket does not protect us any more.
3007 sk_add_backlog(child, skb);
3010 bh_unlock_sock(child);
3011 return ret;
3014 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3015 struct tcphdr *th, unsigned len)
3017 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
3019 tcp_parse_options(sk, th, tp, 0);
3021 if (th->ack) {
3022 /* rfc793:
3023 * "If the state is SYN-SENT then
3024 * first check the ACK bit
3025 * If the ACK bit is set
3026 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
3027 * a reset (unless the RST bit is set, if so drop
3028 * the segment and return)"
3030 * I cite this place to emphasize one essential
3031 * detail: this check is different from the one
3032 * in the established state: SND.UNA <= SEG.ACK <= SND.NXT.
3033 * SEG.ACK == SND.UNA == ISS is invalid in SYN-SENT,
3034 * because we have no previous data sent before SYN.
3035 * --ANK(990513)
3037 * We do not send data with SYN, so that RFC-correct
3038 * test reduces to:
3040 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
3041 return 1;
3043 /* This check is not from any RFC, but it is an evident consequence
3044 * of combining PAWS and the usual SYN-SENT logic: the ACK _is_
3045 * checked in SYN-SENT unlike in other states, hence
3046 * the echoed tstamp must be checked too.
3048 if (tp->saw_tstamp) {
3049 if (tp->rcv_tsecr == 0) {
3050 /* Workaround for bug in linux-2.1 and early
3051 * 2.2 kernels. Let's pretend that we did not
3052 * see such a timestamp, to avoid a bogus rtt value
3053 * calculated by tcp_ack().
3055 tp->saw_tstamp = 0;
3057 /* But do not forget to store peer's timestamp! */
3058 if (th->syn) {
3059 tp->ts_recent = tp->rcv_tsval;
3060 tp->ts_recent_stamp = xtime.tv_sec;
3062 } else if ((__s32)(tp->rcv_tsecr - tcp_time_stamp) > 0 ||
3063 (__s32)(tp->rcv_tsecr - tp->syn_stamp) < 0) {
3064 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: synsent reject.\n"));
3065 NET_INC_STATS_BH(PAWSActiveRejected);
3066 return 1;
3070 /* Now ACK is acceptable.
3072 * "If the RST bit is set
3073 * If the ACK was acceptable then signal the user "error:
3074 * connection reset", drop the segment, enter CLOSED state,
3075 * delete TCB, and return."
3078 if (th->rst) {
3079 tcp_reset(sk);
3080 goto discard;
3083 /* rfc793:
3084 * "fifth, if neither of the SYN or RST bits is set then
3085 * drop the segment and return."
3087 * See note below!
3088 * --ANK(990513)
3090 if (!th->syn)
3091 goto discard;
3093 /* rfc793:
3094 * "If the SYN bit is on ...
3095 * are acceptable then ...
3096 * (our SYN has been ACKed), change the connection
3097 * state to ESTABLISHED..."
3099 * Do you see? SYN-less ACKs in SYN-SENT state are
3100 * completely ignored.
3102 * The bug causing stalled SYN-SENT sockets
3103 * was here: tcp_ack advanced snd_una and canceled the
3104 * retransmit timer, so that a bare ACK received
3105 * in SYN-SENT state (even with an invalid ack==ISS,
3106 * because the tcp_ack check is too weak for SYN-SENT)
3107 * moved the socket to an invalid semi-SYN-SENT,
3108 * semi-ESTABLISHED state and the connection hung.
3109 * --ANK (990514)
3111 * A bare ACK is valid, however.
3112 * Actually, RFC793 requires sending such an ACK
3113 * in reply to any out of window packet.
3114 * It is wrong, but Linux also sends such
3115 * useless ACKs sometimes.
3116 * --ANK (990724)
3119 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
3120 tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
3121 TCP_SKB_CB(skb)->ack_seq, len);
3123 /* Ok.. it's good. Set up sequence numbers and
3124 * move to established.
3126 tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
3127 tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
3129 /* RFC1323: The window in SYN & SYN/ACK segments is
3130 * never scaled.
3132 tp->snd_wnd = ntohs(th->window);
3133 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
3134 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
3135 tp->fin_seq = TCP_SKB_CB(skb)->seq;
3137 tcp_set_state(sk, TCP_ESTABLISHED);
3139 if (tp->wscale_ok == 0) {
3140 tp->snd_wscale = tp->rcv_wscale = 0;
3141 tp->window_clamp = min(tp->window_clamp,65535);
3144 if (tp->tstamp_ok) {
3145 tp->tcp_header_len =
3146 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
3147 } else
3148 tp->tcp_header_len = sizeof(struct tcphdr);
3149 if (tp->saw_tstamp) {
3150 tp->ts_recent = tp->rcv_tsval;
3151 tp->ts_recent_stamp = xtime.tv_sec;
3153 tcp_sync_mss(sk, tp->pmtu_cookie);
3154 tcp_initialize_rcv_mss(sk);
3155 tcp_init_metrics(sk);
3156 tcp_init_buffer_space(sk);
3158 if (sk->keepopen)
3159 tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
3161 tp->copied_seq = tp->rcv_nxt;
3162 __tcp_fast_path_on(tp, tp->snd_wnd);
3164 if(!sk->dead) {
3165 sk->state_change(sk);
3166 sk_wake_async(sk, 0, POLL_OUT);
3169 if (tp->write_pending) {
3170 /* Save one ACK. Data will be ready after
3171 * several ticks, if write_pending is set.
3173 * It may be deleted, but with this feature tcpdumps
3174 * look so _wonderfully_ clever that I was not able
3175 * to resist the temptation 8) --ANK
3177 tp->ack.pending = 1;
3178 tp->ack.lrcvtime = tcp_time_stamp;
3179 tcp_enter_quickack_mode(tp);
3180 tp->ack.ato = TCP_ATO_MIN;
3181 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN);
3182 goto discard;
3183 } else {
3184 tcp_send_ack(sk);
3186 return -1;
3189 /* No ACK in the segment */
3191 if (th->rst) {
3192 /* rfc793:
3193 * "If the RST bit is set
3195 * Otherwise (no ACK) drop the segment and return."
3198 goto discard;
3201 /* PAWS check. */
3202 if (tp->ts_recent_stamp && tp->saw_tstamp && tcp_paws_check(tp, 0))
3203 goto discard;
3205 if (th->syn) {
3206 /* We see a SYN without an ACK. It is an attempt at a
3207 * simultaneous connect with crossed SYNs.
3209 * The previous version of the code
3210 * checked for "connecting to self"
3211 * here. That check is done now in
3212 * tcp_connect.
3214 * RED-PEN: BTW, it does not. 8)
3216 tcp_set_state(sk, TCP_SYN_RECV);
3217 if (tp->saw_tstamp) {
3218 tp->ts_recent = tp->rcv_tsval;
3219 tp->ts_recent_stamp = xtime.tv_sec;
3222 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
3223 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
3225 /* RFC1323: The window in SYN & SYN/ACK segments is
3226 * never scaled.
3228 tp->snd_wnd = ntohs(th->window);
3229 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
3230 tp->max_window = tp->snd_wnd;
3232 tcp_sync_mss(sk, tp->pmtu_cookie);
3233 tcp_initialize_rcv_mss(sk);
3235 tcp_send_synack(sk);
3236 #if 0
3237 /* Note, we could accept data and URG from this segment.
3238 * There are no obstacles to doing this.
3240 * However, if we ignore data in ACKless segments sometimes,
3241 * we have no reason to accept it at other times.
3242 * Also, it seems the code doing it in step6 of tcp_rcv_state_process
3243 * is not flawless. So, discard the packet for sanity.
3244 * Uncomment this return to process the data.
3246 return -1;
3247 #endif
3249 /* "fifth, if neither of the SYN or RST bits is set then
3250 * drop the segment and return."
3253 discard:
3254 kfree_skb(skb);
3255 return 0;
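/*
 * Illustrative sketch, not called anywhere: the RFC793 SYN-SENT ACK test
 * (reject SEG.ACK =< ISS or SEG.ACK > SND.NXT) collapses to a single
 * equality here because we never send data with our SYN, as the comment
 * above explains.  The helper name is hypothetical.
 */
static __inline__ int tcp_synsent_ack_ok_sketch(struct tcp_opt *tp, u32 ack_seq)
{
	return ack_seq == tp->snd_nxt;
}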
3260 * This function implements the receiving procedure of RFC 793 for
3261 * all states except ESTABLISHED and TIME_WAIT.
3262 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
3263 * address independent.
3266 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
3267 struct tcphdr *th, unsigned len)
3269 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
3270 int queued = 0;
3272 tp->saw_tstamp = 0;
3274 switch (sk->state) {
3275 case TCP_CLOSE:
3276 /* When state == CLOSED, hash lookup always fails.
3278 * But, there is a back door, the backlog queue.
3279 * If we have a sequence of packets in the backlog
3280 * during __release_sock() which have a sequence such
3281 * that:
3282 * packet X causes entry to TCP_CLOSE state
3283 * ...
3284 * packet X + N has FIN bit set
3286 * We report a (luckily) harmless error in this case.
3287 * The issue is that backlog queue processing bypasses
3288 * any hash lookups (we know which socket packets are for).
3289 * The correct behavior here is what 2.0.x did, since
3290 * a TCP_CLOSE socket does not exist. Drop the frame
3291 * and send a RST back to the other end.
3294 /* 1. The socket may be moved to TIME-WAIT state.
3295 2. While this socket was locked, another socket
3296 with the same identity could be created.
3297 3. To continue?
3299 CONCLUSION: discard and only discard!
3301 The alternative would be to relookup and recurse into tcp_v?_rcv
3302 (not *_do_rcv) to work with timewait and listen states
3303 correctly.
3305 goto discard;
3307 case TCP_LISTEN:
3308 if(th->ack)
3309 return 1;
3311 if(th->syn) {
3312 if(tp->af_specific->conn_request(sk, skb) < 0)
3313 return 1;
3315 /* Now we have several options: In theory there is
3316 * nothing else in the frame. KA9Q has an option to
3317 * send data with the syn, BSD accepts data with the
3318 * syn up to the [to be] advertised window and
3319 * Solaris 2.1 gives you a protocol error. For now
3320 * we just ignore it, that fits the spec precisely
3321 * and avoids incompatibilities. It would be nice in
3322 * future to drop through and process the data.
3324 * Now that TTCP is starting to be used we ought to
3325 * queue this data.
3326 * But, this leaves one open to an easy denial of
3327 * service attack, and SYN cookies can't defend
3328 * against this problem. So, we drop the data
3329 * in the interest of security over speed.
3331 goto discard;
3333 goto discard;
3335 case TCP_SYN_SENT:
3336 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
3337 if (queued >= 0)
3338 return queued;
3339 queued = 0;
3340 goto step6;
3343 /* Parse the tcp_options present on this header.
3344 * By this point we really only expect timestamps.
3345 * Note that this really has to be here and not later for PAWS
3346 * (RFC1323) to work.
3348 if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
3349 tcp_paws_discard(tp, skb)) {
3350 if (!th->rst) {
3351 tcp_send_ack(sk);
3352 goto discard;
3354 /* Reset is accepted even if it did not pass PAWS. */
3357 /* The silly FIN test here is necessary to see an advancing ACK in
3358 * retransmitted FIN frames properly. Consider the following sequence:
3360 * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
3361 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
3362 * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
3363 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
3365 * At this point the connection will deadlock with host1 believing
3366 * that its FIN is never ACK'd, and thus it will retransmit its FIN
3367 * forever. The following fix is from Taral (taral@taral.net).
3369 * RED-PEN. It seems the above is not true.
3370 * If at least one end is RFC compliant, it will send an ACK to
3371 * an out of window FIN and, hence, move the peer to TIME-WAIT.
3372 * I comment out this line. --ANK
3374 * RED-PEN. DANGER! The tcp_sequence check also rejects SYN-ACKs
3375 * received in SYN-RECV. The problem is that the description of
3376 * segment processing in SYN-RECV state in RFC793 is WRONG.
3377 * The correct check would accept the ACK from this SYN-ACK; see
3378 * figures 6 and 8 (fixed by RFC1122). Compare this
3379 * to the problem with FIN; they smell similar. --ANK
3382 /* step 1: check sequence number */
3383 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)
3384 #if 0
3385 && !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
3386 #endif
3388 if (!th->rst) {
3389 NET_INC_STATS_BH(DelayedACKLost);
3390 tcp_enter_quickack_mode(tp);
3391 tcp_send_ack(sk);
3393 goto discard;
3396 /* step 2: check RST bit */
3397 if(th->rst) {
3398 tcp_reset(sk);
3399 goto discard;
3402 if (tp->saw_tstamp) {
3403 tcp_replace_ts_recent(sk, tp,
3404 TCP_SKB_CB(skb)->seq);
3407 /* step 3: check security and precedence [ignored] */
3409 /* step 4:
3411 * Check for a SYN, and ensure it matches the SYN we were
3412 * first sent. We have to handle the rather unusual (but valid)
3413 * sequence that KA9Q derived products may generate of
3415 * SYN
3416 * SYN|ACK Data
3417 * ACK (lost)
3418 * SYN|ACK Data + More Data
3419 * .. we must ACK not RST...
3421 * We keep syn_seq as the sequence space occupied by the
3422 * original syn.
3425 if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
3426 tcp_reset(sk);
3427 return 1;
3430 /* step 5: check the ACK field */
3431 if (th->ack) {
3432 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
3433 TCP_SKB_CB(skb)->ack_seq, len);
3435 switch(sk->state) {
3436 case TCP_SYN_RECV:
3437 if (acceptable) {
3438 tcp_set_state(sk, TCP_ESTABLISHED);
3439 tp->copied_seq = tp->rcv_nxt;
3441 /* Note that this wakeup is only for the marginal
3442 * crossed SYN case. Passively opened sockets
3443 * are not woken up, because sk->sleep == NULL
3444 * and sk->socket == NULL.
3446 if (!sk->dead) {
3447 sk->state_change(sk);
3448 sk_wake_async(sk,0,POLL_OUT);
3451 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
3452 tp->snd_wnd = ntohs(th->window) << tp->snd_wscale;
3453 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
3454 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
3456 /* tcp_ack considers this ACK as duplicate
3457 * and does not calculate rtt.
3458 * Fix it at least with timestamps.
3460 if (tp->saw_tstamp && !tp->srtt)
3461 tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED);
3463 tcp_init_metrics(sk);
3464 tcp_fast_path_on(tp);
3465 } else {
3466 SOCK_DEBUG(sk, "bad ack\n");
3467 return 1;
3469 break;
3471 case TCP_FIN_WAIT1:
3472 if (tp->snd_una == tp->write_seq) {
3473 tcp_set_state(sk, TCP_FIN_WAIT2);
3474 sk->shutdown |= SEND_SHUTDOWN;
3475 dst_confirm(sk->dst_cache);
3477 if (!sk->dead) {
3478 /* Wake up lingering close() */
3479 sk->state_change(sk);
3480 } else {
3481 int tmo;
3483 if (tp->linger2 < 0 ||
3484 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
3485 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
3486 tcp_done(sk);
3487 return 1;
3490 tmo = tcp_fin_time(tp);
3491 if (tmo > TCP_TIMEWAIT_LEN) {
3492 tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
3493 } else if (th->fin || sk->lock.users) {
3494 /* Bad case. We could lose such FIN otherwise.
3495 * It is not a big problem, but it looks confusing
3496 * and is not so rare an event. We can still lose it now,
3497 * if it spins in bh_lock_sock(), but that is a really
3498 * marginal case.
3500 tcp_reset_keepalive_timer(sk, tmo);
3501 } else {
3502 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
3503 goto discard;
3507 break;
3509 case TCP_CLOSING:
3510 if (tp->snd_una == tp->write_seq) {
3511 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
3512 goto discard;
3514 break;
3516 case TCP_LAST_ACK:
3517 if (tp->snd_una == tp->write_seq) {
3518 tcp_update_metrics(sk);
3519 tcp_done(sk);
3520 goto discard;
3522 break;
3524 } else
3525 goto discard;
3527 step6:
3528 /* step 6: check the URG bit */
3529 tcp_urg(sk, th, len);
3531 /* step 7: process the segment text */
3532 switch (sk->state) {
3533 case TCP_CLOSE_WAIT:
3534 case TCP_CLOSING:
3535 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
3536 break;
3537 case TCP_FIN_WAIT1:
3538 case TCP_FIN_WAIT2:
3539 /* RFC 793 says to queue data in these states,
3540 * RFC 1122 says we MUST send a reset.
3541 * BSD 4.4 also does reset.
3543 if (sk->shutdown & RCV_SHUTDOWN) {
3544 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
3545 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
3546 tcp_reset(sk);
3547 return 1;
3550 /* Fall through */
3551 case TCP_ESTABLISHED:
3552 tcp_data(skb, sk, len);
3553 queued = 1;
3554 break;
3557 /* tcp_data could move socket to TIME-WAIT */
3558 if (sk->state != TCP_CLOSE) {
3559 tcp_data_snd_check(sk);
3560 tcp_ack_snd_check(sk);
3561 if (tp->sorry)
3562 tcp_new_space(sk);
3565 if (!queued) {
3566 discard:
3567 kfree_skb(skb);
3569 return 0;