[davej-history.git] / net / ipv4 / tcp_input.c (blob 1ebcf7f48de7d4f82d7398fd9ac0e6fb50b94531, pre-2.3.4)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.165 1999/05/14 23:10:08 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
60 #include <linux/config.h>
61 #include <linux/mm.h>
62 #include <linux/sysctl.h>
63 #include <net/tcp.h>
64 #include <linux/ipsec.h>
66 #ifdef CONFIG_SYSCTL
67 #define SYNC_INIT 0 /* let the user enable it */
68 #else
69 #define SYNC_INIT 1
70 #endif
72 extern int sysctl_tcp_fin_timeout;
74 /* These are on by default so the code paths get tested.
75 * For the final 2.2 this may be undone at our discretion. -DaveM
77 int sysctl_tcp_timestamps = 1;
78 int sysctl_tcp_window_scaling = 1;
79 int sysctl_tcp_sack = 1;
81 int sysctl_tcp_syncookies = SYNC_INIT;
82 int sysctl_tcp_stdurg;
83 int sysctl_tcp_rfc1337;
85 static int prune_queue(struct sock *sk);
87 /* There is something which you must keep in mind when you analyze the
88 * behavior of the tp->ato delayed ack timeout interval. When a
89 * connection starts up, we want to ack as quickly as possible. The
90 * problem is that "good" TCP's do slow start at the beginning of data
91 * transmission. This means that until we send the first few ACK's the
92 * sender will sit on his end and only queue most of his data, because
93 * he can only send snd_cwnd unacked packets at any given time. For
94 * each ACK we send, he increments snd_cwnd and transmits more of his
95 * queue. -DaveM
97 static void tcp_delack_estimator(struct tcp_opt *tp)
99 if(tp->ato == 0) {
100 tp->lrcvtime = tcp_time_stamp;
102 /* Help sender leave slow start quickly,
103 * and also makes sure we do not take this
104 * branch ever again for this connection.
106 tp->ato = 1;
107 tcp_enter_quickack_mode(tp);
108 } else {
109 int m = tcp_time_stamp - tp->lrcvtime;
111 tp->lrcvtime = tcp_time_stamp;
112 if(m <= 0)
113 m = 1;
114 if(m > tp->rto)
115 tp->ato = tp->rto;
116 else {
117 /* This funny shift makes sure we
118 * clear the "quick ack mode" bit.
120 tp->ato = ((tp->ato << 1) >> 2) + m;
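/* Editor's note, not part of the original file: a minimal stand-alone
 * sketch of the ato update above, assuming the "quick ack mode" flag is
 * kept in bit 31 of ato (as the 0x80000000/0x7fffffff masks in
 * tcp_remember_ack() below suggest).
 */
#if 0
static unsigned int ato_update_sketch(unsigned int ato, unsigned int m)
{
	/* (ato << 1) shifts the flag bit out, the >> 2 then halves what is
	 * left, so the new interval is roughly ato/2 + m, an exponentially
	 * weighted average of the observed inter-arrival time m.
	 */
	return ((ato << 1) >> 2) + m;
}
#endif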
126 * Remember to send an ACK later.
128 static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
129 struct sk_buff *skb)
131 tp->delayed_acks++;
133 /* Tiny-grams with PSH set artificially deflate our
134 * ato measurement, but with a lower bound.
136 if(th->psh && (skb->len < (tp->mss_cache >> 1))) {
137 /* Preserve the quickack state. */
138 if((tp->ato & 0x7fffffff) > HZ/50)
139 tp->ato = ((tp->ato & 0x80000000) |
140 (HZ/50));
144 /* Called to compute a smoothed rtt estimate. The data fed to this
145 * routine either comes from timestamps, or from segments that were
146 * known _not_ to have been retransmitted [see Karn/Partridge
147 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
148 * piece by Van Jacobson.
149 * NOTE: the next three routines used to be one big routine.
150 * To save cycles in the RFC 1323 implementation it was better to break
151 * it up into three procedures. -- erics
154 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
156 long m = mrtt; /* RTT */
158 /* The following amusing code comes from Jacobson's
159 * article in SIGCOMM '88. Note that rtt and mdev
160 * are scaled versions of rtt and mean deviation.
161 * This is designed to be as fast as possible
162 * m stands for "measurement".
164 * In a 1990 paper the rto value is changed to:
165 * RTO = rtt + 4 * mdev
167 if(m == 0)
168 m = 1;
169 if (tp->srtt != 0) {
170 m -= (tp->srtt >> 3); /* m is now error in rtt est */
171 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
172 if (m < 0)
173 m = -m; /* m is now abs(error) */
174 m -= (tp->mdev >> 2); /* similar update on mdev */
175 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
176 } else {
177 /* no previous measure. */
178 tp->srtt = m<<3; /* take the measured time to be rtt */
179 tp->mdev = m<<2; /* make sure rto = 3*rtt */
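/* Editor's note, not part of the original file: the same update written as
 * a stand-alone sketch, to make the fixed-point scaling explicit.  srtt is
 * stored as 8*RTT and mdev as 4*mdev, which is why a first measurement m
 * is recorded as m<<3 and m<<2.
 */
#if 0
static void rtt_estimator_sketch(long *srtt, long *mdev, long m)
{
	if (*srtt != 0) {
		m -= (*srtt >> 3);	/* error vs. the scaled average      */
		*srtt += m;		/* unscaled: rtt = 7/8 rtt + 1/8 new */
		if (m < 0)
			m = -m;
		m -= (*mdev >> 2);
		*mdev += m;		/* unscaled: mdev = 3/4 mdev + 1/4 err */
	} else {
		*srtt = m << 3;		/* scaled by 8 */
		*mdev = m << 2;		/* scaled by 4 */
	}
}
#endif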
183 /* Calculate rto without backoff. This is the second half of Van Jacobson's
184 * routine referred to above.
187 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
189 tp->rto = (tp->srtt >> 3) + tp->mdev;
190 tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
194 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
195 * on packet lifetime in the internet. We need the HZ/5 lower
196 * bound to behave correctly against BSD stacks with a fixed
197 * delayed ack.
198 * FIXME: It's not entirely clear this lower bound is the best
199 * way to avoid the problem. Is it possible to drop the lower
200 * bound and still avoid trouble with BSD stacks? Perhaps
201 * some modification to the RTO calculation that takes delayed
202 * ack bias into account? This needs serious thought. -- erics
204 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
206 if (tp->rto > 120*HZ)
207 tp->rto = 120*HZ;
208 if (tp->rto < HZ/5)
209 tp->rto = HZ/5;
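/* Editor's note, not part of the original file: how the three routines
 * above combine, under the same scaling (srtt = 8*RTT, mdev = 4*mdev).
 * For example, with HZ = 100, srtt = 240 and mdev = 100 the base value is
 * srtt/8 + mdev = 30 + 100 = 130 ticks; tcp_set_rto() then adds rto/4 and
 * rto >> (snd_cwnd - 1), and tcp_bound_rto() clamps the result to the
 * range [HZ/5, 120*HZ].
 */
#if 0
static unsigned long rto_sketch(unsigned long srtt, unsigned long mdev,
				unsigned long snd_cwnd)
{
	unsigned long rto = (srtt >> 3) + mdev;

	rto += (rto >> 2) + (rto >> (snd_cwnd - 1));
	if (rto > 120 * HZ)
		rto = 120 * HZ;
	if (rto < HZ / 5)
		rto = HZ / 5;
	return rto;
}
#endif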
212 /* WARNING: this must not be called if tp->saw_timestamp was false. */
213 extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
214 __u32 start_seq, __u32 end_seq)
216 /* From draft-ietf-tcplw-high-performance: the correct
217 * test is last_ack_sent <= end_seq.
218 * (RFC1323 stated last_ack_sent < end_seq.)
220 * HOWEVER: The current check contradicts the draft statements.
221 * It has been done for good reasons.
222 * The implemented check improves security and eliminates
223 * unnecessary RTT overestimation.
224 * 1998/06/27 Andrey V. Savochkin <saw@msu.ru>
226 if (!before(end_seq, tp->last_ack_sent - sk->rcvbuf) &&
227 !after(start_seq, tp->rcv_wup + tp->rcv_wnd)) {
228 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
229 * extra check below makes sure this can only happen
230 * for pure ACK frames. -DaveM
232 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
233 tp->ts_recent = tp->rcv_tsval;
234 tp->ts_recent_stamp = tcp_time_stamp;
239 #define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
241 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
243 /* ts_recent must be younger than 24 days */
244 return (((s32)(tcp_time_stamp - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
245 (((s32)(tp->rcv_tsval - tp->ts_recent) < 0) &&
246 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
247 (len != (th->doff * 4))));
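/* Editor's note, not part of the original file: the (s32) casts above are
 * what make the timestamp comparisons safe across 32-bit wrap-around, in
 * the usual serial-number-arithmetic way.
 */
#if 0
static int ts_before_sketch(__u32 a, __u32 b)
{
	/* Negative difference (taken as signed) means a is "before" b,
	 * valid while the two values are less than 2^31 apart; e.g.
	 * 0x00000001 counts as after 0xffffffff.
	 */
	return (s32)(a - b) < 0;
}
#endif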
251 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
253 u32 end_window = tp->rcv_wup + tp->rcv_wnd;
255 if (tp->rcv_wnd &&
256 after(end_seq, tp->rcv_nxt) &&
257 before(seq, end_window))
258 return 1;
259 if (seq != end_window)
260 return 0;
261 return (seq == end_seq);
264 /* This function checks to see if the tcp header is actually acceptable. */
265 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
267 if (seq == tp->rcv_nxt)
268 return (tp->rcv_wnd || (end_seq == seq));
270 return __tcp_sequence(tp, seq, end_seq);
273 /* When we get a reset we do this. */
274 static void tcp_reset(struct sock *sk)
276 sk->zapped = 1;
278 /* We want the right error as BSD sees it (and indeed as we do). */
279 switch (sk->state) {
280 case TCP_SYN_SENT:
281 sk->err = ECONNREFUSED;
282 break;
283 case TCP_CLOSE_WAIT:
284 sk->err = EPIPE;
285 break;
286 default:
287 sk->err = ECONNRESET;
289 tcp_set_state(sk, TCP_CLOSE);
290 sk->shutdown = SHUTDOWN_MASK;
291 if (!sk->dead)
292 sk->state_change(sk);
295 /* This tags the retransmission queue when SACKs arrive. */
296 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
298 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
299 int i = nsacks;
301 while(i--) {
302 struct sk_buff *skb = skb_peek(&sk->write_queue);
303 __u32 start_seq = ntohl(sp->start_seq);
304 __u32 end_seq = ntohl(sp->end_seq);
305 int fack_count = 0;
307 while((skb != NULL) &&
308 (skb != tp->send_head) &&
309 (skb != (struct sk_buff *)&sk->write_queue)) {
310 /* The retransmission queue is always in order, so
311 * we can short-circuit the walk early.
313 if(after(TCP_SKB_CB(skb)->seq, end_seq))
314 break;
316 /* We play conservative, we don't allow SACKS to partially
317 * tag a sequence space.
319 fack_count++;
320 if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
321 !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
322 /* If this was a retransmitted frame, account for it. */
323 if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
324 tp->retrans_out)
325 tp->retrans_out--;
326 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
328 /* RULE: All new SACKs will either decrease retrans_out
329 * or advance fackets_out.
331 if(fack_count > tp->fackets_out)
332 tp->fackets_out = fack_count;
334 skb = skb->next;
336 sp++; /* Move on to the next SACK block. */
340 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
341 * But, this can also be called on packets in the established flow when
342 * the fast version below fails.
344 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
346 unsigned char *ptr;
347 int length=(th->doff*4)-sizeof(struct tcphdr);
348 int saw_mss = 0;
350 ptr = (unsigned char *)(th + 1);
351 tp->saw_tstamp = 0;
353 while(length>0) {
354 int opcode=*ptr++;
355 int opsize;
357 switch (opcode) {
358 case TCPOPT_EOL:
359 return;
360 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
361 length--;
362 continue;
363 default:
364 opsize=*ptr++;
365 if (opsize < 2) /* "silly options" */
366 return;
367 if (opsize > length)
368 break; /* don't parse partial options */
369 switch(opcode) {
370 case TCPOPT_MSS:
371 if(opsize==TCPOLEN_MSS && th->syn) {
372 u16 in_mss = ntohs(*(__u16 *)ptr);
373 if (in_mss == 0)
374 in_mss = 536;
375 if (tp->mss_clamp > in_mss)
376 tp->mss_clamp = in_mss;
377 saw_mss = 1;
379 break;
380 case TCPOPT_WINDOW:
381 if(opsize==TCPOLEN_WINDOW && th->syn)
382 if (!no_fancy && sysctl_tcp_window_scaling) {
383 tp->wscale_ok = 1;
384 tp->snd_wscale = *(__u8 *)ptr;
385 if(tp->snd_wscale > 14) {
386 if(net_ratelimit())
387 printk("tcp_parse_options: Illegal window "
388 "scaling value %d >14 received.",
389 tp->snd_wscale);
390 tp->snd_wscale = 14;
393 break;
394 case TCPOPT_TIMESTAMP:
395 if(opsize==TCPOLEN_TIMESTAMP) {
396 if (sysctl_tcp_timestamps && !no_fancy) {
397 tp->tstamp_ok = 1;
398 tp->saw_tstamp = 1;
399 tp->rcv_tsval = ntohl(*(__u32 *)ptr);
400 tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
403 break;
404 case TCPOPT_SACK_PERM:
405 if(opsize==TCPOLEN_SACK_PERM && th->syn) {
406 if (sysctl_tcp_sack && !no_fancy) {
407 tp->sack_ok = 1;
408 tp->num_sacks = 0;
411 break;
413 case TCPOPT_SACK:
414 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
415 sysctl_tcp_sack && (sk != NULL) && !th->syn) {
416 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
418 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
419 int num_sacks = sack_bytes >> 3;
420 struct tcp_sack_block *sackp;
422 sackp = (struct tcp_sack_block *)ptr;
423 tcp_sacktag_write_queue(sk, sackp, num_sacks);
427 ptr+=opsize-2;
428 length-=opsize;
431 if(th->syn && saw_mss == 0)
432 tp->mss_clamp = 536;
435 /* Fast parse options. This hopes to only see timestamps.
436 * If it is wrong it falls back on tcp_parse_options().
438 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
440 /* If we didn't send out any options ignore them all. */
441 if (tp->tcp_header_len == sizeof(struct tcphdr))
442 return 0;
443 if (th->doff == sizeof(struct tcphdr)>>2) {
444 tp->saw_tstamp = 0;
445 return 0;
446 } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
447 __u32 *ptr = (__u32 *)(th + 1);
448 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
449 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
450 tp->saw_tstamp = 1;
451 tp->rcv_tsval = ntohl(*++ptr);
452 tp->rcv_tsecr = ntohl(*++ptr);
453 return 1;
456 tcp_parse_options(sk, th, tp, 0);
457 return 1;
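/* Editor's note, not part of the original file: what the aligned-timestamp
 * comparison above matches.  With the usual option values (NOP = 1,
 * TIMESTAMP = 8, TCPOLEN_TIMESTAMP = 10) the option area of such a segment
 * begins with the bytes 01 01 08 0a on the wire, followed by TSval and
 * TSecr:
 *
 *	+--------+--------+--------+--------+
 *	|  NOP   |  NOP   | kind=8 | len=10 |
 *	+--------+--------+--------+--------+
 *	|               TSval               |
 *	+--------+--------+--------+--------+
 *	|               TSecr               |
 *	+--------+--------+--------+--------+
 */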
460 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
461 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
462 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
463 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
465 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
467 if (tp->dup_acks > 3)
468 tp->snd_cwnd = (tp->snd_ssthresh);
470 tp->dup_acks = 0;
473 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
474 * retransmit timer fires.
476 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
478 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
480 /* Note: If not_dup is set this implies we got a
481 * data carrying packet or a window update.
482 * This carries no new information about possible
483 * lost packets, so we have to ignore it for the purposes
484 * of counting duplicate acks. Ideally this does not imply we
485 * should stop our fast retransmit phase, more acks may come
486 * later without data to help us. Unfortunately this would make
487 * the code below much more complex. For now if I see such
488 * a packet I clear the fast retransmit phase.
490 if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
491 /* This is the standard reno style fast retransmit branch. */
493 /* 1. When the third duplicate ack is received, set ssthresh
494 * to one half the current congestion window, but no less
495 * than two segments. Retransmit the missing segment.
497 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
498 tp->dup_acks++;
499 if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
500 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
501 tp->snd_cwnd = (tp->snd_ssthresh + 3);
502 tp->high_seq = tp->snd_nxt;
503 if(!tp->fackets_out)
504 tcp_retransmit_skb(sk,
505 skb_peek(&sk->write_queue));
506 else
507 tcp_fack_retransmit(sk);
508 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
510 } else if (++tp->dup_acks > 3) {
511 /* 2. Each time another duplicate ACK arrives, increment
512 * cwnd by the segment size. [...] Transmit a packet...
514 * Packet transmission will be done on normal flow processing
515 * since we're not in "retransmit mode". We do not use
516 * duplicate ACKs to artificially inflate the congestion
517 * window when doing FACK.
519 if(!tp->fackets_out) {
520 tp->snd_cwnd++;
521 } else {
522 /* Fill any further holes which may have
523 * appeared.
525 * We may want to change this to run every
526 * further multiple-of-3 dup ack increments,
527 * to be more robust against out-of-order
528 * packet delivery. -DaveM
530 tcp_fack_retransmit(sk);
533 } else if (tp->high_seq != 0) {
534 /* In this branch we deal with clearing the Floyd style
535 * block on duplicate fast retransmits, and if requested
536 * we do Hoe style secondary fast retransmits.
538 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
539 /* Once we have acked all the packets up to high_seq
540 * we are done with this fast retransmit phase.
541 * Alternatively data arrived. In this case we
542 * have to abort the fast retransmit attempt.
543 * Note that we do want to accept a window
544 * update since this is expected with Hoe's algorithm.
546 clear_fast_retransmit(tp);
548 /* After we have cleared up to high_seq we can
549 * clear the Floyd style block.
551 if (!before(ack, tp->high_seq)) {
552 tp->high_seq = 0;
553 tp->fackets_out = 0;
555 } else if (tp->dup_acks >= 3) {
556 if (!tp->fackets_out) {
557 /* Hoe Style. We didn't ack the whole
558 * window. Take this as a cue that
559 * another packet was lost and retransmit it.
560 * Don't muck with the congestion window here.
561 * Note that we have to be careful not to
562 * act if this was a window update and it
563 * didn't ack new data, since this does
564 * not indicate a packet left the system.
565 * We can test this by just checking
566 * if ack changed from snd_una, since
567 * the only way to get here without advancing
568 * from snd_una is if this was a window update.
570 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
571 tcp_retransmit_skb(sk,
572 skb_peek(&sk->write_queue));
573 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
575 } else {
576 /* FACK style, fill any remaining holes in
577 * receiver's queue.
579 tcp_fack_retransmit(sk);
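/* Editor's note, not part of the original file: the plain Reno shape of the
 * above, ignoring the FACK and Hoe variations and taking the "one half the
 * current congestion window" comment at face value.  With snd_cwnd = 10 at
 * the third duplicate ACK, ssthresh becomes 5, snd_cwnd becomes 5 + 3 = 8
 * and the head of the write queue is retransmitted; each further duplicate
 * ACK inflates snd_cwnd to 9, 10, ...; once an ACK covers high_seq,
 * clear_fast_retransmit() deflates snd_cwnd back to ssthresh = 5.
 */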
585 /* This is Jacobson's slow start and congestion avoidance.
586 * SIGCOMM '88, p. 328.
588 static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
590 if (tp->snd_cwnd <= tp->snd_ssthresh) {
591 /* In "safe" area, increase. */
592 tp->snd_cwnd++;
593 } else {
594 /* In dangerous area, increase slowly.
595 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
597 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
598 tp->snd_cwnd++;
599 tp->snd_cwnd_cnt=0;
600 } else
601 tp->snd_cwnd_cnt++;
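/* Editor's note, not part of the original file: a stand-alone sketch of the
 * counter trick above.  Incrementing snd_cwnd only once per snd_cwnd ACKs
 * approximates "cwnd += 1/cwnd per ACK", i.e. about one extra segment per
 * round trip while in congestion avoidance.
 */
#if 0
static void cong_avoid_sketch(unsigned long *cwnd, unsigned long *cwnd_cnt,
			      unsigned long ssthresh)
{
	if (*cwnd <= ssthresh) {
		(*cwnd)++;			/* slow start: grow per ACK */
	} else if (*cwnd_cnt >= *cwnd) {
		(*cwnd)++;			/* about once per RTT */
		*cwnd_cnt = 0;
	} else {
		(*cwnd_cnt)++;
	}
}
#endif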
605 /* Remove acknowledged frames from the retransmission queue. */
606 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
607 __u32 *seq, __u32 *seq_rtt)
609 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
610 struct sk_buff *skb;
611 __u32 now = tcp_time_stamp;
612 int acked = 0;
614 /* If we are retransmitting, and this ACK clears up to
615 * the retransmit head, or further, then clear our state.
617 if (tp->retrans_head != NULL &&
618 !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
619 tp->retrans_head = NULL;
621 while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
622 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
623 __u8 sacked = scb->sacked;
625 /* If our packet is before the ack sequence we can
626 * discard it as it's confirmed to have arrived at
627 * the other end.
629 if (after(scb->end_seq, ack))
630 break;
632 /* Initial outgoing SYN's get put onto the write_queue
633 * just like anything else we transmit. It is not
634 * true data, and if we misinform our callers that
635 * this ACK acks real data, we will erroneously exit
636 * connection startup slow start one packet too
637 * quickly. This is severely frowned upon behavior.
639 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
640 tp->retrans_out--;
641 if(!(scb->flags & TCPCB_FLAG_SYN)) {
642 acked |= FLAG_DATA_ACKED;
643 if(sacked & TCPCB_SACKED_RETRANS)
644 acked |= FLAG_RETRANS_DATA_ACKED;
645 if(tp->fackets_out)
646 tp->fackets_out--;
647 } else {
648 /* This is pure paranoia. */
649 tp->retrans_head = NULL;
651 tp->packets_out--;
652 *seq = scb->seq;
653 *seq_rtt = now - scb->when;
654 __skb_unlink(skb, skb->list);
655 kfree_skb(skb);
657 return acked;
660 static void tcp_ack_probe(struct sock *sk, __u32 ack)
662 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
664 /* Our probe was answered. */
665 tp->probes_out = 0;
667 /* Was it a usable window open? */
669 /* should always be non-null */
670 if (tp->send_head != NULL &&
671 !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
672 tp->backoff = 0;
673 tp->pending = 0;
674 tcp_clear_xmit_timer(sk, TIME_PROBE0);
675 } else {
676 tcp_reset_xmit_timer(sk, TIME_PROBE0,
677 min(tp->rto << tp->backoff, 120*HZ));
681 /* Should we open up the congestion window? */
682 static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
684 /* Data must have been acked. */
685 if ((flag & FLAG_DATA_ACKED) == 0)
686 return 0;
688 /* Some of the data acked was retransmitted somehow? */
689 if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
690 /* We advance in all cases except during
691 * non-FACK fast retransmit/recovery.
693 if (tp->fackets_out != 0 ||
694 tp->retransmits != 0)
695 return 1;
697 /* Non-FACK fast retransmit does its own
698 * congestion window management, don't get
699 * in the way.
701 return 0;
704 /* New non-retransmitted data acked, always advance. */
705 return 1;
708 /* Read draft-ietf-tcplw-high-performance before mucking
709 * with this code. (Supersedes RFC1323)
711 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
712 u32 seq, u32 ack, int flag)
714 __u32 seq_rtt;
716 /* RTTM Rule: A TSecr value received in a segment is used to
717 * update the averaged RTT measurement only if the segment
718 * acknowledges some new data, i.e., only if it advances the
719 * left edge of the send window.
721 * See draft-ietf-tcplw-high-performance-00, section 3.3.
722 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
724 if (!(flag & FLAG_DATA_ACKED))
725 return;
727 seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
728 tcp_rtt_estimator(tp, seq_rtt);
729 if (tp->retransmits) {
730 if (tp->packets_out == 0) {
731 tp->retransmits = 0;
732 tp->fackets_out = 0;
733 tp->retrans_out = 0;
734 tp->backoff = 0;
735 tcp_set_rto(tp);
736 } else {
737 /* Still retransmitting, use backoff */
738 tcp_set_rto(tp);
739 tp->rto = tp->rto << tp->backoff;
741 } else {
742 tcp_set_rto(tp);
745 tcp_bound_rto(tp);
748 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
750 struct sk_buff *skb = skb_peek(&sk->write_queue);
752 /* Some data was ACK'd, if still retransmitting (due to a
753 * timeout), resend more of the retransmit queue. The
754 * congestion window is handled properly by that code.
756 if (tp->retransmits) {
757 tcp_xmit_retransmit_queue(sk);
758 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
759 } else {
760 __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
761 if ((__s32)when < 0)
762 when = 1;
763 tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
767 /* This routine deals with incoming acks, but not outgoing ones. */
768 static int tcp_ack(struct sock *sk, struct tcphdr *th,
769 u32 ack_seq, u32 ack, int len)
771 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
772 int flag = 0;
773 u32 seq = 0;
774 u32 seq_rtt = 0;
776 if(sk->zapped)
777 return(1); /* Dead, can't ack any more so why bother */
779 if (tp->pending == TIME_KEEPOPEN)
780 tp->probes_out = 0;
782 tp->rcv_tstamp = tcp_time_stamp;
784 /* If the ack is newer than sent or older than previous acks
785 * then we can probably ignore it.
787 if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
788 goto uninteresting_ack;
790 /* If there is data set flag 1 */
791 if (len != th->doff*4) {
792 flag |= FLAG_DATA;
793 tcp_delack_estimator(tp);
796 /* Update our send window. */
798 /* This is the window update code as per RFC 793
799 * snd_wl{1,2} are used to prevent unordered
800 * segments from shrinking the window
802 if (before(tp->snd_wl1, ack_seq) ||
803 (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
804 u32 nwin = ntohs(th->window) << tp->snd_wscale;
806 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
807 flag |= FLAG_WIN_UPDATE;
808 tp->snd_wnd = nwin;
810 tp->snd_wl1 = ack_seq;
811 tp->snd_wl2 = ack;
813 if (nwin > tp->max_window)
814 tp->max_window = nwin;
818 /* We passed data and got it acked, remove any soft error
819 * log. Something worked...
821 sk->err_soft = 0;
823 /* If this ack opens up a zero window, clear backoff. It was
824 * being used to time the probes, and is probably far higher than
825 * it needs to be for normal retransmission.
827 if (tp->pending == TIME_PROBE0)
828 tcp_ack_probe(sk, ack);
830 /* See if we can take anything off of the retransmit queue. */
831 flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
833 /* We must do this here, before code below clears out important
834 * state contained in tp->fackets_out and tp->retransmits. -DaveM
836 if (should_advance_cwnd(tp, flag))
837 tcp_cong_avoid(tp);
839 /* If we have a timestamp, we always do rtt estimates. */
840 if (tp->saw_tstamp) {
841 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
842 } else {
843 /* If we were retransmitting don't count rtt estimate. */
844 if (tp->retransmits) {
845 if (tp->packets_out == 0) {
846 tp->retransmits = 0;
847 tp->fackets_out = 0;
848 tp->retrans_out = 0;
850 } else {
851 /* We don't have a timestamp. Can only use
852 * packets that are not retransmitted to determine
853 * rtt estimates. Also, we must not reset the
854 * backoff for rto until we get a non-retransmitted
855 * packet. This allows us to deal with a situation
856 * where the network delay has increased suddenly.
857 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
859 if (flag & FLAG_DATA_ACKED) {
860 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
861 tp->backoff = 0;
862 tcp_rtt_estimator(tp, seq_rtt);
863 tcp_set_rto(tp);
864 tcp_bound_rto(tp);
870 if (tp->packets_out) {
871 if (flag & FLAG_DATA_ACKED)
872 tcp_ack_packets_out(sk, tp);
873 } else {
874 tcp_clear_xmit_timer(sk, TIME_RETRANS);
877 flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
878 if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
879 (tp->high_seq != 0)) {
880 tcp_fast_retrans(sk, ack, flag);
881 } else {
882 /* Clear any aborted fast retransmit starts. */
883 tp->dup_acks = 0;
885 /* It is not a brain fart, I thought a bit now. 8)
887 * Forward progress is indicated, if:
888 * 1. the ack acknowledges new data.
889 * 2. or the ack is duplicate, but it is caused by new segment
890 * arrival. This case is filtered by:
891 * - it contains no data, syn or fin.
892 * - it does not update window.
893 * 3. or new SACK. It is difficult to check, so that we ignore it.
895 * Forward progress is also indicated by arrival of new data,
896 * which was caused by window open from our side. This case is more
897 * difficult and it is made (alas, incorrectly) in tcp_data_queue().
898 * --ANK (990513)
900 if (ack != tp->snd_una || (flag == 0 && !th->fin))
901 dst_confirm(sk->dst_cache);
903 /* Remember the highest ack received. */
904 tp->snd_una = ack;
905 return 1;
907 uninteresting_ack:
908 SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
909 return 0;
912 /* New-style handling of TIME_WAIT sockets. */
913 extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
914 extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
915 extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
917 /* Must be called only from BH context. */
918 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
920 SOCKHASH_LOCK_WRITE_BH();
922 /* Unlink from various places. */
923 if(tw->bind_next)
924 tw->bind_next->bind_pprev = tw->bind_pprev;
925 *(tw->bind_pprev) = tw->bind_next;
926 if(tw->tb->owners == NULL)
927 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
929 if(tw->next)
930 tw->next->pprev = tw->pprev;
931 *tw->pprev = tw->next;
933 /* We decremented the prot->inuse count when we entered TIME_WAIT
934 * and the sock from which this came was destroyed.
936 tw->sklist_next->sklist_prev = tw->sklist_prev;
937 tw->sklist_prev->sklist_next = tw->sklist_next;
939 SOCKHASH_UNLOCK_WRITE_BH();
941 /* Ok, now free it up. */
942 kmem_cache_free(tcp_timewait_cachep, tw);
945 /* We come here as a special case from the AF specific TCP input processing,
946 * and the SKB has no owner. Essentially handling this is very simple,
947 * we just keep silently eating rx'd packets until none show up for the
948 * entire timeout period. The only special cases are for BSD TIME_WAIT
949 * reconnects and SYN/RST bits being set in the TCP header.
951 int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
952 struct tcphdr *th, unsigned len)
954 /* RFC 1122:
955 * "When a connection is [...] on TIME-WAIT state [...]
956 * [a TCP] MAY accept a new SYN from the remote TCP to
957 * reopen the connection directly, if it:
959 * (1) assigns its initial sequence number for the new
960 * connection to be larger than the largest sequence
961 * number it used on the previous connection incarnation,
962 * and
964 * (2) returns to TIME-WAIT state if the SYN turns out
965 * to be an old duplicate".
967 if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
968 struct sock *sk;
969 struct tcp_func *af_specific = tw->af_specific;
970 __u32 isn;
971 int ret;
973 isn = tw->rcv_nxt + 128000;
974 if(isn == 0)
975 isn++;
976 tcp_tw_deschedule(tw);
977 tcp_timewait_kill(tw);
978 sk = af_specific->get_sock(skb, th);
979 if(sk == NULL ||
980 !ipsec_sk_policy(sk,skb))
981 return 0;
983 bh_lock_sock(sk);
985 /* Default is to discard the frame. */
986 ret = 0;
988 if(sk->lock.users)
989 goto out_unlock;
991 skb_set_owner_r(skb, sk);
992 af_specific = sk->tp_pinfo.af_tcp.af_specific;
994 if(af_specific->conn_request(sk, skb, isn) < 0)
995 ret = 1; /* Toss a reset back. */
996 out_unlock:
997 bh_unlock_sock(sk);
998 return ret;
1001 /* Check RST or SYN */
1002 if(th->rst || th->syn) {
1003 /* This is TIME_WAIT assassination, in two flavors.
1004 * Oh well... nobody has a sufficient solution to this
1005 * protocol bug yet.
1007 if(sysctl_tcp_rfc1337 == 0) {
1008 tcp_tw_deschedule(tw);
1009 tcp_timewait_kill(tw);
1011 if(!th->rst)
1012 return 1; /* toss a reset back */
1013 } else {
1014 /* In this case we must reset the TIMEWAIT timer. */
1015 if(th->ack)
1016 tcp_tw_reschedule(tw);
1018 return 0; /* Discard the frame. */
1021 /* Enter the time wait state. This is always called from BH
1022 * context. Essentially we whip up a timewait bucket, copy the
1023 * relevant info into it from the SK, and mess with hash chains
1024 * and list linkage.
1026 static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
1028 struct sock **head, *sktw;
1030 /* Step 1: Remove SK from established hash. */
1031 if(sk->next)
1032 sk->next->pprev = sk->pprev;
1033 *sk->pprev = sk->next;
1034 sk->pprev = NULL;
1035 tcp_reg_zap(sk);
1037 /* Step 2: Put TW into bind hash where SK was. */
1038 tw->tb = (struct tcp_bind_bucket *)sk->prev;
1039 if((tw->bind_next = sk->bind_next) != NULL)
1040 sk->bind_next->bind_pprev = &tw->bind_next;
1041 tw->bind_pprev = sk->bind_pprev;
1042 *sk->bind_pprev = (struct sock *)tw;
1044 /* Step 3: Same for the protocol sklist. */
1045 (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
1046 (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
1047 sk->sklist_next = NULL;
1048 sk->prot->inuse--;
1050 /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
1051 head = &tcp_ehash[sk->hashent + (tcp_ehash_size >> 1)];
1052 sktw = (struct sock *)tw;
1053 if((sktw->next = *head) != NULL)
1054 (*head)->pprev = &sktw->next;
1055 *head = sktw;
1056 sktw->pprev = head;
1059 void tcp_time_wait(struct sock *sk)
1061 struct tcp_tw_bucket *tw;
1063 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
1064 if(tw != NULL) {
1065 /* Give us an identity. */
1066 tw->daddr = sk->daddr;
1067 tw->rcv_saddr = sk->rcv_saddr;
1068 tw->bound_dev_if= sk->bound_dev_if;
1069 tw->num = sk->num;
1070 tw->state = TCP_TIME_WAIT;
1071 tw->sport = sk->sport;
1072 tw->dport = sk->dport;
1073 tw->family = sk->family;
1074 tw->reuse = sk->reuse;
1075 tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
1076 tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
1078 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1079 if(tw->family == PF_INET6) {
1080 memcpy(&tw->v6_daddr,
1081 &sk->net_pinfo.af_inet6.daddr,
1082 sizeof(struct in6_addr));
1083 memcpy(&tw->v6_rcv_saddr,
1084 &sk->net_pinfo.af_inet6.rcv_saddr,
1085 sizeof(struct in6_addr));
1087 #endif
1088 /* Linkage updates. */
1089 SOCKHASH_LOCK_WRITE();
1090 tcp_tw_hashdance(sk, tw);
1091 SOCKHASH_UNLOCK_WRITE();
1093 /* Get the TIME_WAIT timeout firing. */
1094 tcp_tw_schedule(tw);
1096 /* CLOSE the SK. */
1097 if(sk->state == TCP_ESTABLISHED)
1098 tcp_statistics.TcpCurrEstab--;
1099 sk->state = TCP_CLOSE;
1100 net_reset_timer(sk, TIME_DONE,
1101 min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
1102 } else {
1103 /* Sorry, we're out of memory, just CLOSE this
1104 * socket up. We've got bigger problems than
1105 * non-graceful socket closings.
1107 tcp_set_state(sk, TCP_CLOSE);
1110 /* Prevent rcvmsg/sndmsg calls, and wake people up. */
1111 sk->shutdown = SHUTDOWN_MASK;
1112 if(!sk->dead)
1113 sk->state_change(sk);
1117 * Process the FIN bit. This now behaves as it is supposed to work
1118 * and the FIN takes effect when it is validly part of sequence
1119 * space. Not before when we get holes.
1121 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1122 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1123 * TIME-WAIT)
1125 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1126 * close and we go into CLOSING (and later onto TIME-WAIT)
1128 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1131 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1133 sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1135 tcp_send_ack(sk);
1137 if (!sk->dead) {
1138 sk->state_change(sk);
1139 sock_wake_async(sk->socket, 1);
1142 switch(sk->state) {
1143 case TCP_SYN_RECV:
1144 case TCP_ESTABLISHED:
1145 /* Move to CLOSE_WAIT */
1146 tcp_set_state(sk, TCP_CLOSE_WAIT);
1147 if (th->rst)
1148 sk->shutdown = SHUTDOWN_MASK;
1149 break;
1151 case TCP_CLOSE_WAIT:
1152 case TCP_CLOSING:
1153 /* Received a retransmission of the FIN, do
1154 * nothing.
1156 break;
1157 case TCP_LAST_ACK:
1158 /* RFC793: Remain in the LAST-ACK state. */
1159 break;
1161 case TCP_FIN_WAIT1:
1162 /* This case occurs when a simultaneous close
1163 * happens, we must ack the received FIN and
1164 * enter the CLOSING state.
1166 * This causes a WRITE timeout, which will either
1167 * move on to TIME_WAIT when we timeout, or resend
1168 * the FIN properly (maybe we get rid of that annoying
1169 * FIN lost hang). The TIME_WRITE code is already
1170 * correct for handling this timeout.
1172 tcp_set_state(sk, TCP_CLOSING);
1173 break;
1174 case TCP_FIN_WAIT2:
1175 /* Received a FIN -- send ACK and enter TIME_WAIT. */
1176 tcp_time_wait(sk);
1177 break;
1178 default:
1179 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1180 * cases we should never reach this piece of code.
1182 printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1183 break;
1187 /* These routines update the SACK block as out-of-order packets arrive or
1188 * in-order packets close up the sequence space.
1190 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1192 int this_sack, num_sacks = tp->num_sacks;
1193 struct tcp_sack_block *swalk = &tp->selective_acks[0];
1195 /* If more than one SACK block, see if the recent change to SP eats into
1196 * or hits the sequence space of other SACK blocks, if so coalesce.
1198 if(num_sacks != 1) {
1199 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1200 if(swalk == sp)
1201 continue;
1203 /* First case, bottom of SP moves into top of the
1204 * sequence space of SWALK.
1206 if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1207 sp->start_seq = swalk->start_seq;
1208 goto coalesce;
1210 /* Second case, top of SP moves into bottom of the
1211 * sequence space of SWALK.
1213 if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1214 sp->end_seq = swalk->end_seq;
1215 goto coalesce;
1219 /* SP is the only SACK, or no coalescing cases found. */
1220 return;
1222 coalesce:
1223 /* Zap SWALK, by moving every further SACK up by one slot.
1224 * Decrease num_sacks.
1226 for(; this_sack < num_sacks-1; this_sack++, swalk++) {
1227 struct tcp_sack_block *next = (swalk + 1);
1228 swalk->start_seq = next->start_seq;
1229 swalk->end_seq = next->end_seq;
1231 tp->num_sacks--;
1234 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1236 __u32 tmp;
1238 tmp = sack1->start_seq;
1239 sack1->start_seq = sack2->start_seq;
1240 sack2->start_seq = tmp;
1242 tmp = sack1->end_seq;
1243 sack1->end_seq = sack2->end_seq;
1244 sack2->end_seq = tmp;
1247 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1249 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1250 struct tcp_sack_block *sp = &tp->selective_acks[0];
1251 int cur_sacks = tp->num_sacks;
1253 if (!cur_sacks)
1254 goto new_sack;
1256 /* Optimize for the common case, new ofo frames arrive
1257 * "in order". ;-) This also satisfies the requirements
1258 * of RFC2018 about ordering of SACKs.
1260 if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1261 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1262 tcp_sack_maybe_coalesce(tp, sp);
1263 } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1264 /* Re-ordered arrival, in this case, can be optimized
1265 * as well.
1267 sp->start_seq = TCP_SKB_CB(skb)->seq;
1268 tcp_sack_maybe_coalesce(tp, sp);
1269 } else {
1270 struct tcp_sack_block *swap = sp + 1;
1271 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1273 /* Oh well, we have to move things around.
1274 * Try to find a SACK we can tack this onto.
1277 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1278 if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1279 (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1280 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1281 swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1282 else
1283 swap->start_seq = TCP_SKB_CB(skb)->seq;
1284 tcp_sack_swap(sp, swap);
1285 tcp_sack_maybe_coalesce(tp, sp);
1286 return;
1290 /* Could not find an adjacent existing SACK, build a new one,
1291 * put it at the front, and shift everyone else down. We
1292 * always know there is at least one SACK present already here.
1294 * If the sack array is full, forget about the last one.
1296 if (cur_sacks >= max_sacks) {
1297 cur_sacks--;
1298 tp->num_sacks--;
1300 while(cur_sacks >= 1) {
1301 struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1302 struct tcp_sack_block *prev = (this - 1);
1303 this->start_seq = prev->start_seq;
1304 this->end_seq = prev->end_seq;
1305 cur_sacks--;
1308 new_sack:
1309 /* Build the new head SACK, and we're done. */
1310 sp->start_seq = TCP_SKB_CB(skb)->seq;
1311 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1312 tp->num_sacks++;
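/* Editor's note, not part of the original file: a worked example of the
 * SACK block handling above.  Suppose rcv_nxt is 1000 and segments arrive
 * out of order:
 *
 *	seg 2000-2500	-> selective_acks[0] = {2000, 2500}
 *	seg 2500-3000	-> head block grows to {2000, 3000}
 *	seg 4000-4500	-> new head {4000, 4500}, old block shifts to slot 1
 *	seg 3000-4000	-> head grows to {3000, 4500}, then coalesces with
 *			   {2000, 3000} into a single block {2000, 4500}
 *
 * At most 4 blocks are kept (3 when timestamps are in use); the oldest one
 * is dropped when the array is full.
 */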
1316 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1318 struct tcp_sack_block *sp = &tp->selective_acks[0];
1319 int num_sacks = tp->num_sacks;
1320 int this_sack;
1322 /* This is an in order data segment _or_ an out-of-order SKB being
1323 * moved to the receive queue, so we know this removed SKB will eat
1324 * from the front of a SACK.
1326 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1327 /* Check if the start of the sack is covered by skb. */
1328 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1329 before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1330 break;
1333 /* This should only happen if so many SACKs get built that some get
1334 * pushed out before we get here, or we eat some in sequence packets
1335 * which are before the first SACK block.
1337 if(this_sack >= num_sacks)
1338 return;
1340 sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1341 if(!before(sp->start_seq, sp->end_seq)) {
1342 /* Zap this SACK, by moving forward any other SACKS. */
1343 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1344 struct tcp_sack_block *next = (sp + 1);
1345 sp->start_seq = next->start_seq;
1346 sp->end_seq = next->end_seq;
1348 tp->num_sacks--;
1352 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1354 struct tcp_sack_block *sp = &tp->selective_acks[0];
1355 int num_sacks = tp->num_sacks;
1356 int this_sack;
1358 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1359 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1360 break;
1362 if(this_sack >= num_sacks)
1363 return;
1364 sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1367 /* This one checks to see if we can put data from the
1368 * out_of_order queue into the receive_queue.
1370 static void tcp_ofo_queue(struct sock *sk)
1372 struct sk_buff *skb;
1373 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1375 while ((skb = skb_peek(&tp->out_of_order_queue))) {
1376 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1377 break;
1379 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1380 SOCK_DEBUG(sk, "ofo packet was already received \n");
1381 __skb_unlink(skb, skb->list);
1382 kfree_skb(skb);
1383 continue;
1385 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1386 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1387 TCP_SKB_CB(skb)->end_seq);
1389 if(tp->sack_ok)
1390 tcp_sack_remove_skb(tp, skb);
1391 __skb_unlink(skb, skb->list);
1392 __skb_queue_tail(&sk->receive_queue, skb);
1393 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1394 if(skb->h.th->fin)
1395 tcp_fin(skb, sk, skb->h.th);
1399 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1401 struct sk_buff *skb1;
1402 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1404 /* Queue data for delivery to the user.
1405 * Packets in sequence go to the receive queue.
1406 * Out of sequence packets to the out_of_order_queue.
1408 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1409 /* Ok. In sequence. */
1410 queue_and_out:
1411 dst_confirm(sk->dst_cache);
1412 __skb_queue_tail(&sk->receive_queue, skb);
1413 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1414 if(skb->h.th->fin) {
1415 tcp_fin(skb, sk, skb->h.th);
1416 } else {
1417 tcp_remember_ack(tp, skb->h.th, skb);
1419 /* This may have eaten into a SACK block. */
1420 if(tp->sack_ok && tp->num_sacks)
1421 tcp_sack_remove_skb(tp, skb);
1422 tcp_ofo_queue(sk);
1424 /* Turn on fast path. */
1425 if (skb_queue_len(&tp->out_of_order_queue) == 0)
1426 tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1427 (0x10 << 16) |
1428 tp->snd_wnd);
1429 return;
1432 /* An old packet, either a retransmit or some packet got lost. */
1433 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1434 /* A retransmit, 2nd most common case. Force an immediate ack. */
1435 SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1436 tcp_enter_quickack_mode(tp);
1437 kfree_skb(skb);
1438 return;
1441 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1442 /* Partial packet, seq < rcv_next < end_seq */
1443 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1444 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1445 TCP_SKB_CB(skb)->end_seq);
1447 goto queue_and_out;
1450 /* Ok. This is an out_of_order segment, force an ack. */
1451 tp->delayed_acks++;
1452 tcp_enter_quickack_mode(tp);
1454 /* Disable header prediction. */
1455 tp->pred_flags = 0;
1457 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1458 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1460 if (skb_peek(&tp->out_of_order_queue) == NULL) {
1461 /* Initial out of order segment, build 1 SACK. */
1462 if(tp->sack_ok) {
1463 tp->num_sacks = 1;
1464 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1465 tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1467 __skb_queue_head(&tp->out_of_order_queue,skb);
1468 } else {
1469 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1470 /* Already there. */
1471 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1472 if (skb->len >= skb1->len) {
1473 if(tp->sack_ok)
1474 tcp_sack_extend(tp, skb1, skb);
1475 __skb_append(skb1, skb);
1476 __skb_unlink(skb1, skb1->list);
1477 kfree_skb(skb1);
1478 } else {
1479 /* A duplicate, smaller than what is in the
1480 * out-of-order queue right now, toss it.
1482 kfree_skb(skb);
1484 break;
1487 if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1488 __skb_append(skb1, skb);
1489 if(tp->sack_ok)
1490 tcp_sack_new_ofo_skb(sk, skb);
1491 break;
1494 /* See if we've hit the start. If so insert. */
1495 if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1496 __skb_queue_head(&tp->out_of_order_queue,skb);
1497 if(tp->sack_ok)
1498 tcp_sack_new_ofo_skb(sk, skb);
1499 break;
1507 * This routine handles the data. If there is room in the buffer,
1508 * it will already have been moved into it. If there is no
1509 * room, then we will just have to discard the packet.
1512 static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1514 struct tcphdr *th;
1515 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1517 th = skb->h.th;
1518 skb_pull(skb, th->doff*4);
1519 skb_trim(skb, len - (th->doff*4));
1521 if (skb->len == 0 && !th->fin)
1522 return(0);
1525 * If our receive queue has grown past its limits shrink it.
1526 * Make sure to do this before moving snd_nxt, otherwise
1527 * data might be acked that we don't have enough room for.
1529 if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
1530 if (prune_queue(sk) < 0) {
1531 /* Still not enough room. That can happen when
1532 * skb->truesize differs significantly from skb->len.
1534 return 0;
1538 tcp_data_queue(sk, skb);
1540 if (before(tp->rcv_nxt, tp->copied_seq)) {
1541 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1542 tp->rcv_nxt = tp->copied_seq;
1545 /* Above, tcp_data_queue() increments delayed_acks appropriately.
1546 * Now tell the user we may have some data.
1548 if (!sk->dead) {
1549 SOCK_DEBUG(sk, "Data wakeup.\n");
1550 sk->data_ready(sk,0);
1552 return(1);
1555 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1557 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1559 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1560 tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1561 /* Put more data onto the wire. */
1562 tcp_write_xmit(sk);
1563 } else if (tp->packets_out == 0 && !tp->pending) {
1564 /* Start probing the receivers window. */
1565 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1569 static __inline__ void tcp_data_snd_check(struct sock *sk)
1571 struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1573 if (skb != NULL)
1574 __tcp_data_snd_check(sk, skb);
1578 * Adapt the MSS value used to make delayed ack decision to the
1579 * real world.
1581 static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1583 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1584 unsigned int len = skb->len, lss;
1586 if (len > tp->rcv_mss)
1587 tp->rcv_mss = len;
1588 lss = tp->last_seg_size;
1589 tp->last_seg_size = 0;
1590 if (len >= 536) {
1591 if (len == lss)
1592 tp->rcv_mss = len;
1593 tp->last_seg_size = len;
1598 * Check if sending an ack is needed.
1600 static __inline__ void __tcp_ack_snd_check(struct sock *sk)
1602 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1604 /* This also takes care of updating the window.
1605 * This if statement needs to be simplified.
1607 * Rules for delaying an ack:
1608 * - delay time <= 0.5 HZ
1609 * - we don't have a window update to send
1610 * - must send at least every 2 full sized packets
1611 * - must send an ACK if we have any out of order data
1613 * With an extra heuristic to handle loss of packet
1614 * situations and also helping the sender leave slow
1615 * start in an expedient manner.
1618 /* Two full frames received or... */
1619 if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1620 /* We will update the window "significantly" or... */
1621 tcp_raise_window(sk) ||
1622 /* We entered "quick ACK" mode or... */
1623 tcp_in_quickack_mode(tp) ||
1624 /* We have out of order data */
1625 (skb_peek(&tp->out_of_order_queue) != NULL)) {
1626 /* Then ack it now */
1627 tcp_send_ack(sk);
1628 } else {
1629 /* Else, send delayed ack. */
1630 tcp_send_delayed_ack(tp, HZ/2);
1634 static __inline__ void tcp_ack_snd_check(struct sock *sk)
1636 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1637 if (tp->delayed_acks == 0) {
1638 /* We sent a data segment already. */
1639 return;
1641 __tcp_ack_snd_check(sk);
1646 * This routine is only called when we have urgent data
1647 * signalled. It's the 'slow' part of tcp_urg. It could be
1648 * moved inline now as tcp_urg is only called from one
1649 * place. We handle URGent data wrong. We have to - as
1650 * BSD still doesn't use the correction from RFC961.
1651 * For 1003.1g we should support a new option TCP_STDURG to permit
1652 * either form (or just set the sysctl tcp_stdurg).
1655 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1657 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1658 u32 ptr = ntohs(th->urg_ptr);
1660 if (ptr && !sysctl_tcp_stdurg)
1661 ptr--;
1662 ptr += ntohl(th->seq);
1664 /* Ignore urgent data that we've already seen and read. */
1665 if (after(tp->copied_seq, ptr))
1666 return;
1668 /* Do we already have a newer (or duplicate) urgent pointer? */
1669 if (tp->urg_data && !after(ptr, tp->urg_seq))
1670 return;
1672 /* Tell the world about our new urgent pointer. */
1673 if (sk->proc != 0) {
1674 if (sk->proc > 0)
1675 kill_proc(sk->proc, SIGURG, 1);
1676 else
1677 kill_pg(-sk->proc, SIGURG, 1);
1680 /* We may be adding urgent data when the last byte read was
1681 * urgent. To do this requires some care. We cannot just ignore
1682 * tp->copied_seq since we would read the last urgent byte again
1683 * as data, nor can we alter copied_seq until this data arrives
1684 * or we break the semantics of SIOCATMARK (and thus sockatmark())
1686 if (tp->urg_seq == tp->copied_seq)
1687 tp->copied_seq++; /* Move the copied sequence on correctly */
1688 tp->urg_data = URG_NOTYET;
1689 tp->urg_seq = ptr;
1691 /* Disable header prediction. */
1692 tp->pred_flags = 0;
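/* Editor's note, not part of the original file: a worked example of the
 * off-by-one handling above.  For a segment with seq = 1000 and
 * urg_ptr = 4:
 *
 *	sysctl_tcp_stdurg = 0 (BSD style, the default): the pointer names
 *	the byte following the urgent byte, so urg_seq = 1000 + 4 - 1 = 1003.
 *	sysctl_tcp_stdurg = 1 (RFC 1122 style): the pointer names the urgent
 *	byte itself, so urg_seq = 1000 + 4 = 1004.
 */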
1695 /* This is the 'fast' part of urgent handling. */
1696 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1698 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1700 /* Check if we get a new urgent pointer - normally not. */
1701 if (th->urg)
1702 tcp_check_urg(sk,th);
1704 /* Do we wait for any urgent data? - normally not... */
1705 if (tp->urg_data == URG_NOTYET) {
1706 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
1708 /* Is the urgent pointer pointing into this packet? */
1709 if (ptr < len) {
1710 tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1711 if (!sk->dead)
1712 sk->data_ready(sk,0);
1717 /* Clean the out_of_order queue if we can, trying to get
1718 * the socket within its memory limits again.
1720 * Return less than zero if we should start dropping frames
1721 * until the socket owning process reads some of the data
1722 * to stabilize the situation.
1724 static int prune_queue(struct sock *sk)
1726 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1727 struct sk_buff * skb;
1729 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
1731 net_statistics.PruneCalled++;
1733 /* First, purge the out_of_order queue. */
1734 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
1735 if(skb != NULL) {
1736 /* Free it all. */
1737 do { net_statistics.OfoPruned += skb->len;
1738 kfree_skb(skb);
1739 skb = __skb_dequeue_tail(&tp->out_of_order_queue);
1740 } while(skb != NULL);
1742 /* Reset SACK state. A conforming SACK implementation will
1743 * do the same at a timeout based retransmit. When a connection
1744 * is in a sad state like this, we care only about integrity
1745 * of the connection, not performance.
1747 if(tp->sack_ok)
1748 tp->num_sacks = 0;
1751 /* If we are really being abused, tell the caller to silently
1752 * drop receive data on the floor. It will get retransmitted
1753 * and hopefully then we'll have sufficient space.
1755 * We used to try to purge the in-order packets too, but that
1756 * turns out to be deadly and fraught with races. Consider:
1758 * 1) If we acked the data, we absolutely cannot drop the
1759 * packet. This data would then never be retransmitted.
1760 * 2) It is possible, with a proper sequence of events involving
1761 * delayed acks and backlog queue handling, to have the user
1762 * read the data before it gets acked. The previous code
1763 * here got this wrong, and it led to data corruption.
1764 * 3) Too much state changes happen when the FIN arrives, so once
1765 * we've seen that we can't remove any in-order data safely.
1767 * The net result is that removing in-order receive data is too
1768 * complex for anyone's sanity. So we don't do it anymore. But
1769 * if we are really having our buffer space abused we stop accepting
1770 * new receive data.
1772 if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
1773 return 0;
1775 /* Massive buffer overcommit. */
1776 return -1;
1780 * TCP receive function for the ESTABLISHED state.
1782 * It is split into a fast path and a slow path. The fast path is
1783 * disabled when:
1784 * - A zero window was announced from us - zero window probing
1785 * is only handled properly in the slow path.
1786 * - Out of order segments arrived.
1787 * - Urgent data is expected.
1788 * - There is no buffer space left
1789 * - Unexpected TCP flags/window values/header lengths are received
1790 * (detected by checking the TCP header against pred_flags)
1791 * - Data is sent in both directions. Fast path only supports pure senders
1792 * or pure receivers (this means either the sequence number or the ack
1793 * value must stay constant)
1795 * When these conditions are not satisfied it drops into a standard
1796 * receive procedure patterned after RFC793 to handle all cases.
1797 * The first three cases are guaranteed by proper pred_flags setting,
1798 * the rest is checked inline. Fast processing is turned on in
1799 * tcp_data_queue when everything is OK.
1801 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
1802 struct tcphdr *th, unsigned len)
1804 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1805 int queued;
1806 u32 flg;
1809 * Header prediction.
1810 * The code follows the one in the famous
1811 * "30 instruction TCP receive" Van Jacobson mail.
1813 * Van's trick is to deposit buffers into socket queue
1814 * on a device interrupt, to call tcp_recv function
1815 * on the receive process context and checksum and copy
1816 * the buffer to user space. smart...
1818 * Our current scheme is not silly either but we take the
1819 * extra cost of the net_bh soft interrupt processing...
1820 * We do checksum and copy also but from device to kernel.
1824 * RFC1323: H1. Apply PAWS check first.
1826 if (tcp_fast_parse_options(sk, th, tp)) {
1827 if (tp->saw_tstamp) {
1828 if (tcp_paws_discard(tp, th, len)) {
1829 tcp_statistics.TcpInErrs++;
1830 if (!th->rst) {
1831 tcp_send_ack(sk);
1832 goto discard;
1835 tcp_replace_ts_recent(sk, tp,
1836 TCP_SKB_CB(skb)->seq,
1837 TCP_SKB_CB(skb)->end_seq);
1841 flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16);
1843 /* pred_flags is 0xS?10 << 16 + snd_wnd
1844 * if header_predition is to be made
1845 * 'S' will always be tp->tcp_header_len >> 2
1846 * '?' will be 0 else it will be !0
1847 * (when there are holes in the receive
1848 * space for instance)
1849 * PSH flag is ignored.
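 *
 * Editor's note, not part of the original file: for a connection that
 * negotiated timestamps, tcp_header_len is 32 bytes, so 'S' is 8 and,
 * with only the ACK bit (0x10) set in the flags byte, pred_flags works
 * out to htonl((0x8010 << 16) | tp->snd_wnd).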
1852 if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1853 if (len <= th->doff*4) {
1854 /* Bulk data transfer: sender */
1855 if (len == th->doff*4) {
1856 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
1857 TCP_SKB_CB(skb)->ack_seq, len);
1858 kfree_skb(skb);
1859 tcp_data_snd_check(sk);
1860 return 0;
1861 } else { /* Header too small */
1862 tcp_statistics.TcpInErrs++;
1863 goto discard;
1864 }
1865 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
1866 atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
1867 /* Bulk data transfer: receiver */
1868 __skb_pull(skb,th->doff*4);
1870 tcp_measure_rcv_mss(sk, skb);
1872 /* DO NOT notify forward progress here.
1873 * It saves dozens of CPU instructions in the fast path. --ANK
1874 */
1875 __skb_queue_tail(&sk->receive_queue, skb);
1876 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1878 /* FIN bit check is not done since if FIN is set in
1879 * this frame, the pred_flags won't match up. -DaveM
1880 */
1881 sk->data_ready(sk, 0);
1882 tcp_delack_estimator(tp);
1884 tcp_remember_ack(tp, th, skb);
1886 __tcp_ack_snd_check(sk);
1887 return 0;
1888 }
1889 }
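/* Fast path summary: the first branch handled a segment with no payload
 * (len <= header length), i.e. we are the bulk sender and this was a pure
 * ACK; the second handled in-order data that acknowledges nothing new
 * (ack_seq == snd_una), i.e. we are the bulk receiver and the skb went
 * straight onto sk->receive_queue.
 */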
1891 /*
1892 * Standard slow path.
1893 */
1895 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
1896 /* RFC793, page 37: "In all states except SYN-SENT, all reset
1897 * (RST) segments are validated by checking their SEQ-fields."
1898 * And page 69: "If an incoming segment is not acceptable,
1899 * an acknowledgment should be sent in reply (unless the RST bit
1900 * is set, if so drop the segment and return)".
1901 */
1902 if (th->rst)
1903 goto discard;
1904 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1905 SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
1906 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
1907 tp->rcv_wup, tp->rcv_wnd);
1908 }
1909 tcp_send_ack(sk);
1910 goto discard;
1911 }
1913 if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
1914 SOCK_DEBUG(sk, "syn in established state\n");
1915 tcp_statistics.TcpInErrs++;
1916 tcp_reset(sk);
1917 return 1;
1918 }
1920 if(th->rst) {
1921 tcp_reset(sk);
1922 goto discard;
1923 }
1925 if(th->ack)
1926 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
1928 /* Process urgent data. */
1929 tcp_urg(sk, th, len);
1931 /* step 7: process the segment text */
1932 queued = tcp_data(skb, sk, len);
1934 /* This must be after tcp_data() does the skb_pull() to
1935 * remove the header size from skb->len.
1937 * Dave!!! Phrase above (and all about rcv_mss) has
1938 * nothing to do with reality. rcv_mss must measure TOTAL
1939 * size, including sacks, IP options etc. Hence, measure_rcv_mss
1940 * must occur before pulling etc., otherwise it will flap
1941 * like hell. Even putting it before tcp_data is wrong,
1942 * it should use skb->tail - skb->nh.raw instead.
1943 * --ANK (980805)
1945 * BTW I broke it. Now all TCP options are handled equally
1946 * in mss_clamp calculations (i.e. ignored, rfc1122),
1947 * and mss_cache does include all of them (i.e. tstamps)
1948 * except for sacks, to calculate effective mss faster.
1949 * --ANK (980805)
1950 */
1951 tcp_measure_rcv_mss(sk, skb);
1953 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
1954 if(sk->state != TCP_CLOSE) {
1955 tcp_data_snd_check(sk);
1956 tcp_ack_snd_check(sk);
1957 }
1959 if (!queued) {
1960 discard:
1961 kfree_skb(skb);
1962 }
1964 return 0;
1965 }
1967 /*
1968 * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented
1969 * as an open_request.
1970 */
1972 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
1973 struct open_request *req)
1974 {
1975 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1976 u32 flg;
1978 /* assumption: the socket is not in use.
1979 * as we checked the user count on tcp_rcv and we're
1980 * running from a soft interrupt.
1981 */
1983 /* Check for syn retransmission */
1984 flg = *(((u32 *)skb->h.th) + 3);
1986 flg &= __constant_htonl(0x00170000);
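/* The mask keeps only FIN, SYN, RST and ACK from the flags byte of this
 * header word (0x17 == FIN|SYN|RST|ACK); PSH and URG are deliberately
 * ignored by the comparison below.
 */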
1987 /* Only SYN set? */
1988 if (flg == __constant_htonl(0x00020000)) {
1989 if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
1990 /* retransmitted syn.
1991 */
1992 req->class->rtx_syn_ack(sk, req);
1993 return NULL;
1994 } else {
1995 return sk; /* Pass new SYN to the listen socket. */
1996 }
1997 }
1999 /* We know it's an ACK here */
2000 if (req->sk) {
2001 /* socket already created but not
2002 * yet accepted()...
2003 */
2004 sk = req->sk;
2005 } else {
2006 /* In theory the packet could be for a cookie, but
2007 * TIME_WAIT should guard us against this.
2008 * XXX: Nevertheless check for cookies?
2009 * This sequence number check is done again later,
2010 * but we do it here to prevent syn flood attackers
2011 * from creating big SYN_RECV sockets.
2012 */
2013 if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
2014 !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
2015 req->rcv_isn+1+req->rcv_wnd)) {
2016 req->class->send_reset(skb);
2017 return NULL;
2018 }
2020 sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
2021 tcp_dec_slow_timer(TCP_SLT_SYNACK);
2022 if (sk == NULL)
2023 return NULL;
2025 req->expires = 0UL;
2026 req->sk = sk;
2027 }
2028 skb_orphan(skb);
2029 skb_set_owner_r(skb, sk);
2030 return sk;
2031 }
2033 /*
2034 * This function implements the receiving procedure of RFC 793 for
2035 * all states except ESTABLISHED and TIME_WAIT.
2036 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
2037 * address independent.
2038 */
2040 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
2041 struct tcphdr *th, unsigned len)
2042 {
2043 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2044 int queued = 0;
2046 switch (sk->state) {
2047 case TCP_CLOSE:
2048 /* When state == CLOSED, hash lookup always fails.
2050 * But, there is a back door, the backlog queue.
2051 * If we have a sequence of packets in the backlog
2052 * during __release_sock() which have a sequence such
2053 * that:
2054 * packet X causes entry to TCP_CLOSE state
2055 * ...
2056 * packet X + N has FIN bit set
2058 * We report a (luckily) harmless error in this case.
2059 * The issue is that backlog queue processing bypasses
2060 * any hash lookups (we know which socket packets are for).
2061 * The correct behavior here is what 2.0.x did, since
2062 * a TCP_CLOSE socket does not exist. Drop the frame
2063 * and send a RST back to the other end.
2064 */
2065 return 1;
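/* A non-zero return value makes the caller (the IPv4/IPv6 receive path)
 * answer with a RST, which is exactly what the comment above asks for.
 */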
2067 case TCP_LISTEN:
2068 /* These use the socket TOS..
2069 * might want to be the received TOS
2070 */
2071 if(th->ack)
2072 return 1;
2074 if(th->syn) {
2075 if(tp->af_specific->conn_request(sk, skb, 0) < 0)
2076 return 1;
2078 /* Now we have several options: In theory there is
2079 * nothing else in the frame. KA9Q has an option to
2080 * send data with the syn, BSD accepts data with the
2081 * syn up to the [to be] advertised window and
2082 * Solaris 2.1 gives you a protocol error. For now
2083 * we just ignore it, that fits the spec precisely
2084 * and avoids incompatibilities. It would be nice in
2085 * future to drop through and process the data.
2087 * Now that TTCP is starting to be used we ought to
2088 * queue this data.
2089 * But, this leaves one open to an easy denial of
2090 * service attack, and SYN cookies can't defend
2091 * against this problem. So, we drop the data
2092 * in the interest of security over speed.
2093 */
2094 goto discard;
2095 }
2097 goto discard;
2098 break;
2100 case TCP_SYN_SENT:
2101 /* SYN sent means we have to look for a suitable ack and
2102 * either reset for bad matches or go to connected.
2103 * The SYN_SENT case is unusual and should
2104 * not be in line code. [AC]
2105 */
2106 if(th->ack) {
2107 /* rfc793:
2108 * "If the state is SYN-SENT then
2109 * first check the ACK bit
2110 * If the ACK bit is set
2111 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
2112 * a reset (unless the RST bit is set, if so drop
2113 * the segment and return)"
2115 * I cite this place to emphasize one essential
2116 * detail, this check is different from the one
2117 * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
2118 * SEG.ACK == SND.UNA == ISS is invalid in SYN-SENT,
2119 * because we have no previous data sent before SYN.
2120 * --ANK(990513)
2122 * We do not send data with SYN, so that RFC-correct
2123 * test reduces to:
2124 */
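/* The SYN consumes exactly one sequence number and nothing else has been
 * sent, so SND.NXT == ISS+1 here and the only acceptable ack is
 * tp->snd_nxt.
 */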
2125 if (sk->zapped ||
2126 TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
2127 return 1;
2129 /* Now ACK is acceptable.
2131 * "If the RST bit is set
2132 * If the ACK was acceptable then signal the user "error:
2133 * connection reset", drop the segment, enter CLOSED state,
2134 * delete TCB, and return."
2135 */
2137 if (th->rst) {
2138 tcp_reset(sk);
2139 goto discard;
2140 }
2142 /* rfc793:
2143 * "fifth, if neither of the SYN or RST bits is set then
2144 * drop the segment and return."
2146 * See note below!
2147 * --ANK(990513)
2148 */
2150 if (!th->syn)
2151 goto discard;
2153 /* rfc793:
2154 * "If the SYN bit is on ...
2155 * are acceptable then ...
2156 * (our SYN has been ACKed), change the connection
2157 * state to ESTABLISHED..."
2159 * Do you see? SYN-less ACKs in SYN-SENT state are
2160 * completely ignored.
2162 * The bug causing stalled SYN-SENT sockets
2163 * was here: tcp_ack advanced snd_una and canceled
2164 * retransmit timer, so that bare ACK received
2165 * in SYN-SENT state (even with invalid ack==ISS,
2166 * because tcp_ack check is too weak for SYN-SENT)
2167 * causes moving socket to invalid semi-SYN-SENT,
2168 * semi-ESTABLISHED state and connection hangs.
2170 * There exist buggy stacks, which really send
2171 * such ACKs: e.g. 202.226.91.94 (okigate.oki.co.jp)
2172 * Actually, if this host did not try to get something
2173 * from ftp.inr.ac.ru I'd never find this bug 8)
2175 * --ANK (990514)
2176 */
2178 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2179 tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
2180 TCP_SKB_CB(skb)->ack_seq, len);
2182 /* Ok.. it's good. Set up sequence numbers and
2183 * move to established.
2184 */
2185 tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2186 tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2188 /* RFC1323: The window in SYN & SYN/ACK segments is
2189 * never scaled.
2190 */
2191 tp->snd_wnd = htons(th->window);
2192 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2193 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2194 tp->fin_seq = TCP_SKB_CB(skb)->seq;
2196 tcp_set_state(sk, TCP_ESTABLISHED);
2197 tcp_parse_options(sk, th, tp, 0);
2199 if (tp->wscale_ok == 0) {
2200 tp->snd_wscale = tp->rcv_wscale = 0;
2201 tp->window_clamp = min(tp->window_clamp,65535);
2202 }
2204 if (tp->tstamp_ok) {
2205 tp->tcp_header_len =
2206 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2207 } else
2208 tp->tcp_header_len = sizeof(struct tcphdr);
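/* The timestamp option adds TCPOLEN_TSTAMP_ALIGNED (12) bytes to every
 * segment, growing the header from 20 to 32 bytes.
 */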
2209 if (tp->saw_tstamp) {
2210 tp->ts_recent = tp->rcv_tsval;
2211 tp->ts_recent_stamp = tcp_time_stamp;
2212 }
2214 /* Can't be earlier, doff would be wrong. */
2215 tcp_send_ack(sk);
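/* "Earlier" presumably means before tcp_header_len above is final: an ACK
 * built before the option setup would carry a data offset that does not
 * account for the timestamp option.
 */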
2217 sk->dport = th->source;
2218 tp->copied_seq = tp->rcv_nxt;
2220 if(!sk->dead) {
2221 sk->state_change(sk);
2222 sock_wake_async(sk->socket, 0);
2223 }
2224 } else {
2225 if(th->syn && !th->rst) {
2226 /* The previous version of the code
2227 * checked for "connecting to self"
2228 * here. that check is done now in
2229 * tcp_connect.
2230 */
2231 tcp_set_state(sk, TCP_SYN_RECV);
2232 tcp_parse_options(sk, th, tp, 0);
2233 if (tp->saw_tstamp) {
2234 tp->ts_recent = tp->rcv_tsval;
2235 tp->ts_recent_stamp = tcp_time_stamp;
2236 }
2238 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2239 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2241 /* RFC1323: The window in SYN & SYN/ACK segments is
2242 * never scaled.
2243 */
2244 tp->snd_wnd = htons(th->window);
2245 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2247 tcp_send_synack(sk);
2248 } else
2249 break;
2250 }
2252 /* tp->tcp_header_len and tp->mss_clamp
2253 probably changed, synchronize mss.
2254 */
2255 tcp_sync_mss(sk, tp->pmtu_cookie);
2256 tp->rcv_mss = tp->mss_cache;
2258 if (sk->state == TCP_SYN_RECV)
2259 goto discard;
2261 goto step6;
2262 }
2264 /* Parse the tcp_options present on this header.
2265 * By this point we really only expect timestamps.
2266 * Note that this really has to be here and not later for PAWS
2267 * (RFC1323) to work.
2268 */
2269 if (tcp_fast_parse_options(sk, th, tp)) {
2270 /* NOTE: assumes saw_tstamp is never set if we didn't
2271 * negotiate the option. tcp_fast_parse_options() must
2272 * guarantee this.
2273 */
2274 if (tp->saw_tstamp) {
2275 if (tcp_paws_discard(tp, th, len)) {
2276 tcp_statistics.TcpInErrs++;
2277 if (!th->rst) {
2278 tcp_send_ack(sk);
2279 goto discard;
2280 }
2281 }
2282 tcp_replace_ts_recent(sk, tp,
2283 TCP_SKB_CB(skb)->seq,
2284 TCP_SKB_CB(skb)->end_seq);
2285 }
2286 }
2288 /* The silly FIN test here is necessary to see an advancing ACK in
2289 * retransmitted FIN frames properly. Consider the following sequence:
2291 * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
2292 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
2293 * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
2294 * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
2296 * At this point the connection will deadlock with host1 believing
2297 * that its FIN is never ACK'd, and thus it will retransmit its FIN
2298 * forever. The following fix is from Taral (taral@taral.net).
2299 */
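/* The fix is the extra "th->fin && end_seq == rcv_nxt" clause below: a
 * retransmitted FIN ending exactly at rcv_nxt is let through even though
 * tcp_sequence() rejects it, so its advancing ACK still gets processed.
 */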
2301 /* step 1: check sequence number */
2302 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) &&
2303 !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) {
2304 if (!th->rst) {
2305 tcp_send_ack(sk);
2306 goto discard;
2307 }
2308 }
2310 /* step 2: check RST bit */
2311 if(th->rst) {
2312 tcp_reset(sk);
2313 goto discard;
2314 }
2316 /* step 3: check security and precedence [ignored] */
2318 /* step 4:
2320 * Check for a SYN, and ensure it matches the SYN we were
2321 * first sent. We have to handle the rather unusual (but valid)
2322 * sequence that KA9Q derived products may generate of
2324 * SYN
2325 * SYN|ACK Data
2326 * ACK (lost)
2327 * SYN|ACK Data + More Data
2328 * .. we must ACK not RST...
2330 * We keep syn_seq as the sequence space occupied by the
2331 * original syn.
2332 */
2334 if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2335 tcp_reset(sk);
2336 return 1;
2337 }
2339 /* step 5: check the ACK field */
2340 if (th->ack) {
2341 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2342 TCP_SKB_CB(skb)->ack_seq, len);
2344 switch(sk->state) {
2345 case TCP_SYN_RECV:
2346 if (acceptable) {
2347 tcp_set_state(sk, TCP_ESTABLISHED);
2348 sk->dport = th->source;
2349 tp->copied_seq = tp->rcv_nxt;
2351 if(!sk->dead)
2352 sk->state_change(sk);
2354 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
2355 tp->snd_wnd = htons(th->window) << tp->snd_wscale;
2356 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2357 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2359 } else {
2360 SOCK_DEBUG(sk, "bad ack\n");
2361 return 1;
2362 }
2363 break;
2365 case TCP_FIN_WAIT1:
2366 if (tp->snd_una == tp->write_seq) {
2367 sk->shutdown |= SEND_SHUTDOWN;
2368 tcp_set_state(sk, TCP_FIN_WAIT2);
2369 if (!sk->dead)
2370 sk->state_change(sk);
2371 else
2372 tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
2373 }
2374 break;
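/* Our FIN is fully acknowledged once snd_una catches up with write_seq.
 * For a dead (no user attached) socket a TIME_CLOSE timer is armed
 * instead of a wakeup, apparently so an orphaned socket cannot sit in
 * FIN_WAIT2 forever (see sysctl_tcp_fin_timeout).
 */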
2376 case TCP_CLOSING:
2377 if (tp->snd_una == tp->write_seq) {
2378 tcp_time_wait(sk);
2379 goto discard;
2380 }
2381 break;
2383 case TCP_LAST_ACK:
2384 if (tp->snd_una == tp->write_seq) {
2385 sk->shutdown = SHUTDOWN_MASK;
2386 tcp_set_state(sk,TCP_CLOSE);
2387 if (!sk->dead)
2388 sk->state_change(sk);
2389 goto discard;
2390 }
2391 break;
2392 }
2393 } else
2394 goto discard;
2396 step6:
2397 /* step 6: check the URG bit */
2398 tcp_urg(sk, th, len);
2400 /* step 7: process the segment text */
2401 switch (sk->state) {
2402 case TCP_CLOSE_WAIT:
2403 case TCP_CLOSING:
2404 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
2405 break;
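/* Intentional fall-through: in CLOSE_WAIT/CLOSING, data lying before
 * fin_seq is not dropped here but continues into the FIN_WAIT checks and
 * finally into tcp_data() below.
 */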
2407 case TCP_FIN_WAIT1:
2408 case TCP_FIN_WAIT2:
2409 /* RFC 793 says to queue data in these states,
2410 * RFC 1122 says we MUST send a reset.
2411 * BSD 4.4 also does reset.
2412 */
2413 if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
2414 if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
2415 tcp_reset(sk);
2416 return 1;
2417 }
2418 }
2420 case TCP_ESTABLISHED:
2421 queued = tcp_data(skb, sk, len);
2423 /* This must be after tcp_data() does the skb_pull() to
2424 * remove the header size from skb->len.
2425 */
2426 tcp_measure_rcv_mss(sk, skb);
2427 break;
2428 }
2430 tcp_data_snd_check(sk);
2431 tcp_ack_snd_check(sk);
2433 if (!queued) {
2434 discard:
2435 kfree_skb(skb);
2436 }
2437 return 0;