Linux 2.1.127: net/ipv4/tcp_input.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_input.c,v 1.136 1998/11/07 14:36:18 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data for which there is not
51 * enough room. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
60 #include <linux/config.h>
61 #include <linux/mm.h>
62 #include <linux/sysctl.h>
63 #include <net/tcp.h>
64 #include <linux/ipsec.h>
66 #ifdef CONFIG_SYSCTL
67 #define SYNC_INIT 0 /* let the user enable it */
68 #else
69 #define SYNC_INIT 1
70 #endif
72 extern int sysctl_tcp_fin_timeout;
74 /* These are on by default so the code paths get tested.
75 * For the final 2.2 this may be undone at our discretion. -DaveM
77 int sysctl_tcp_timestamps = 1;
78 int sysctl_tcp_window_scaling = 1;
79 int sysctl_tcp_sack = 1;
81 int sysctl_tcp_syncookies = SYNC_INIT;
82 int sysctl_tcp_stdurg;
83 int sysctl_tcp_rfc1337;
85 static int prune_queue(struct sock *sk);
87 /* There is something which you must keep in mind when you analyze the
88 * behavior of the tp->ato delayed ack timeout interval. When a
89 * connection starts up, we want to ack as quickly as possible. The
90 * problem is that "good" TCP's do slow start at the beginning of data
91 * transmission. This means that until we send the first few ACK's the
92 * sender will sit on his end and only queue most of his data, because
93 * he can only send snd_cwnd unacked packets at any given time. For
94 * each ACK we send, he increments snd_cwnd and transmits more of his
95 * queue. -DaveM
97 static void tcp_delack_estimator(struct tcp_opt *tp)
99 if(tp->ato == 0) {
100 tp->lrcvtime = jiffies;
102 /* Help the sender leave slow start quickly;
103 * this sets our initial ato value.
105 tcp_enter_quickack_mode(tp);
106 } else {
107 int m = jiffies - tp->lrcvtime;
109 tp->lrcvtime = jiffies;
110 if(m <= 0)
111 m = 1;
112 if(m > tp->rto)
113 tp->ato = tp->rto;
114 else
115 tp->ato = (tp->ato >> 1) + m;
117 /* We are not in "quick ack" mode. */
118 if(tp->ato <= (HZ/100))
119 tp->ato = ((HZ/100)*2);
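/* For illustration, assuming HZ=100: a previous ato of 40 jiffies and
 * an inter-arrival gap m of 10 jiffies gives ato = (40 >> 1) + 10 = 30,
 * while the final check above keeps ato from dropping below 2 jiffies
 * (20ms) when the measured gaps become very small.
 */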
124 * Remember to send an ACK later.
126 static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
127 struct sk_buff *skb)
129 tp->delayed_acks++;
130 /* Tiny-grams with PSH set make us ACK quickly. */
131 if(th->psh && (skb->len < (tp->mss_cache >> 1)))
132 tp->ato = HZ/50;
135 /* Called to compute a smoothed rtt estimate. The data fed to this
136 * routine either comes from timestamps, or from segments that were
137 * known _not_ to have been retransmitted [see Karn/Partridge
138 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
139 * piece by Van Jacobson.
140 * NOTE: the next three routines used to be one big routine.
141 * To save cycles in the RFC 1323 implementation it was better to break
142 * it up into three procedures. -- erics
145 static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
147 long m = mrtt; /* RTT */
149 /* The following amusing code comes from Jacobson's
150 * article in SIGCOMM '88. Note that rtt and mdev
151 * are scaled versions of rtt and mean deviation.
152 * This is designed to be as fast as possible
153 * m stands for "measurement".
155 * In a 1990 paper the rto value is changed to:
156 * RTO = rtt + 4 * mdev
158 if(m == 0)
159 m = 1;
160 if (tp->srtt != 0) {
161 m -= (tp->srtt >> 3); /* m is now error in rtt est */
162 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
163 if (m < 0)
164 m = -m; /* m is now abs(error) */
165 m -= (tp->mdev >> 2); /* similar update on mdev */
166 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
167 } else {
168 /* no previous measure. */
169 tp->srtt = m<<3; /* take the measured time to be rtt */
170 tp->mdev = m<<2; /* make sure rto = 3*rtt */
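/* In unscaled terms the above is the usual pair of filters:
 *	err = m - srtt;  srtt += err/8;  mdev += (|err| - mdev)/4;
 * with srtt kept as 8*srtt and mdev as 4*mdev so the divides become
 * shifts. Illustrative numbers: a first sample of 40 jiffies stores
 * srtt = 320 and mdev = 160; a second sample of 56 jiffies then moves
 * srtt to 336 (i.e. 42 in unscaled jiffies).
 */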
174 /* Calculate rto without backoff. This is the second half of Van Jacobson's
175 * routine referred to above.
178 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
180 tp->rto = (tp->srtt >> 3) + tp->mdev;
181 tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
185 /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
186 * on packet lifetime in the internet. We need the HZ/5 lower
187 * bound to behave correctly against BSD stacks with a fixed
188 * delayed ack.
189 * FIXME: It's not entirely clear this lower bound is the best
190 * way to avoid the problem. Is it possible to drop the lower
191 * bound and still avoid trouble with BSD stacks? Perhaps
192 * some modification to the RTO calculation that takes delayed
193 * ack bias into account? This needs serious thought. -- erics
195 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
197 if (tp->rto > 120*HZ)
198 tp->rto = 120*HZ;
199 if (tp->rto < HZ/5)
200 tp->rto = HZ/5;
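/* With HZ=100 this clamps rto to the range [20, 12000] jiffies, i.e.
 * between 200ms and the 120 second bound on packet lifetime mentioned
 * above.
 */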
203 /* WARNING: this must not be called if tp->saw_timestamp was false. */
204 extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
205 __u32 start_seq, __u32 end_seq)
207 /* From draft-ietf-tcplw-high-performance: the correct
208 * test is last_ack_sent <= end_seq.
209 * (RFC1323 stated last_ack_sent < end_seq.)
211 * HOWEVER: The current check contradicts the draft statements.
212 * It has been done for good reasons.
213 * The implemented check improves security and eliminates
214 * unnecessary RTT overestimation.
215 * 1998/06/27 Andrey V. Savochkin <saw@msu.ru>
217 if (!before(end_seq, tp->last_ack_sent - sk->rcvbuf) &&
218 !after(start_seq, tp->rcv_wup + tp->rcv_wnd)) {
219 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
220 * extra check below makes sure this can only happen
221 * for pure ACK frames. -DaveM
223 if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
224 tp->ts_recent = tp->rcv_tsval;
225 tp->ts_recent_stamp = jiffies;
230 #define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
232 extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
234 /* ts_recent must be younger than 24 days */
235 return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
236 (((s32)(tp->rcv_tsval-tp->ts_recent) < 0) &&
237 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
238 (len != (th->doff * 4))));
242 static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
244 u32 end_window = tp->rcv_wup + tp->rcv_wnd;
246 if (tp->rcv_wnd &&
247 after(end_seq, tp->rcv_nxt) &&
248 before(seq, end_window))
249 return 1;
250 if (seq != end_window)
251 return 0;
252 return (seq == end_seq);
255 /* This function checks to see if the tcp header is actually acceptable. */
256 extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
258 if (seq == tp->rcv_nxt)
259 return (tp->rcv_wnd || (end_seq == seq));
261 return __tcp_sequence(tp, seq, end_seq);
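/* Taken together, the two routines above are a compact form of the
 * RFC793 (page 69) acceptability test: a segment starting exactly at
 * rcv_nxt is acceptable whenever the window is open (or it carries no
 * data); otherwise it must end beyond rcv_nxt and start before
 * rcv_wup + rcv_wnd, with the edge cases handled in __tcp_sequence().
 */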
264 /* When we get a reset we do this. */
265 static void tcp_reset(struct sock *sk, struct sk_buff *skb)
267 sk->zapped = 1;
269 /* We want the right error as BSD sees it (and indeed as we do). */
270 switch (sk->state) {
271 case TCP_SYN_SENT:
272 sk->err = ECONNREFUSED;
273 break;
274 case TCP_CLOSE_WAIT:
275 sk->err = EPIPE;
276 break;
277 default:
278 sk->err = ECONNRESET;
280 tcp_set_state(sk,TCP_CLOSE);
281 sk->shutdown = SHUTDOWN_MASK;
282 if (!sk->dead)
283 sk->state_change(sk);
286 /* This tags the retransmission queue when SACKs arrive. */
287 static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
289 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
290 int i = nsacks;
292 while(i--) {
293 struct sk_buff *skb = skb_peek(&sk->write_queue);
294 __u32 start_seq = ntohl(sp->start_seq);
295 __u32 end_seq = ntohl(sp->end_seq);
296 int fack_count = 0;
298 while((skb != NULL) &&
299 (skb != tp->send_head) &&
300 (skb != (struct sk_buff *)&sk->write_queue)) {
301 /* The retransmission queue is always in order, so
302 * we can short-circuit the walk early.
304 if(!before(start_seq, TCP_SKB_CB(skb)->end_seq))
305 break;
307 /* We play it conservative; we don't allow SACKs to partially
308 * tag a sequence space.
310 fack_count++;
311 if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
312 !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
313 /* If this was a retransmitted frame, account for it. */
314 if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
315 tp->retrans_out--;
316 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
318 /* RULE: All new SACKs will either decrease retrans_out
319 * or advance fackets_out.
321 if(fack_count > tp->fackets_out)
322 tp->fackets_out = fack_count;
324 skb = skb->next;
326 sp++; /* Move on to the next SACK block. */
330 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
331 * But, this can also be called on packets in the established flow when
332 * the fast version below fails.
334 void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
336 unsigned char *ptr;
337 int length=(th->doff*4)-sizeof(struct tcphdr);
339 ptr = (unsigned char *)(th + 1);
340 tp->saw_tstamp = 0;
342 while(length>0) {
343 int opcode=*ptr++;
344 int opsize;
346 switch (opcode) {
347 case TCPOPT_EOL:
348 return;
349 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
350 length--;
351 continue;
352 default:
353 opsize=*ptr++;
354 if (opsize < 2) /* "silly options" */
355 return;
356 if (opsize > length)
357 break; /* don't parse partial options */
358 switch(opcode) {
359 case TCPOPT_MSS:
360 if(opsize==TCPOLEN_MSS && th->syn) {
361 u16 in_mss = ntohs(*(__u16 *)ptr);
362 if (in_mss == 0)
363 in_mss = 536;
364 if (tp->mss_clamp > in_mss)
365 tp->mss_clamp = in_mss;
367 break;
368 case TCPOPT_WINDOW:
369 if(opsize==TCPOLEN_WINDOW && th->syn)
370 if (!no_fancy && sysctl_tcp_window_scaling) {
371 tp->wscale_ok = 1;
372 tp->snd_wscale = *(__u8 *)ptr;
373 if(tp->snd_wscale > 14) {
374 if(net_ratelimit())
375 printk("tcp_parse_options: Illegal window "
376 "scaling value %d >14 received.",
377 tp->snd_wscale);
378 tp->snd_wscale = 14;
381 break;
382 case TCPOPT_TIMESTAMP:
383 if(opsize==TCPOLEN_TIMESTAMP) {
384 if (sysctl_tcp_timestamps && !no_fancy) {
385 tp->tstamp_ok = 1;
386 tp->saw_tstamp = 1;
387 tp->rcv_tsval = ntohl(*(__u32 *)ptr);
388 tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
391 break;
392 case TCPOPT_SACK_PERM:
393 if(opsize==TCPOLEN_SACK_PERM && th->syn) {
394 if (sysctl_tcp_sack && !no_fancy) {
395 tp->sack_ok = 1;
396 tp->num_sacks = 0;
399 break;
401 case TCPOPT_SACK:
402 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
403 sysctl_tcp_sack && (sk != NULL) && !th->syn) {
404 int sack_bytes = opsize - TCPOLEN_SACK_BASE;
406 if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
407 int num_sacks = sack_bytes >> 3;
408 struct tcp_sack_block *sackp;
410 sackp = (struct tcp_sack_block *)ptr;
411 tcp_sacktag_write_queue(sk, sackp, num_sacks);
415 ptr+=opsize-2;
416 length-=opsize;
421 /* Fast parse options. This hopes to only see timestamps.
422 * If it is wrong it falls back on tcp_parse_options().
424 static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
426 /* If we didn't send out any options ignore them all. */
427 if (tp->tcp_header_len == sizeof(struct tcphdr))
428 return 0;
429 if (th->doff == sizeof(struct tcphdr)>>2) {
430 tp->saw_tstamp = 0;
431 return 0;
432 } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
433 __u32 *ptr = (__u32 *)(th + 1);
434 if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
435 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
436 tp->saw_tstamp = 1;
437 tp->rcv_tsval = ntohl(*++ptr);
438 tp->rcv_tsecr = ntohl(*++ptr);
439 return 1;
442 tcp_parse_options(sk, th, tp, 0);
443 return 1;
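/* The constant tested above corresponds to the option bytes
 * 01 01 08 0a (NOP, NOP, kind 8, length 10), the timestamp layout
 * recommended in RFC1323 appendix A, sitting immediately after the
 * basic 20 byte header.
 */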
446 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
447 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
448 #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
449 #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
451 static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
453 if (tp->dup_acks > 3)
454 tp->snd_cwnd = (tp->snd_ssthresh);
456 tp->dup_acks = 0;
459 /* NOTE: This code assumes that tp->dup_acks gets cleared when a
460 * retransmit timer fires.
462 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
464 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
466 /* Note: If not_dup is set this implies we got a
467 * data carrying packet or a window update.
468 * This carries no new information about possible
469 * lost packets, so we have to ignore it for the purposes
470 * of counting duplicate acks. Ideally this does not imply we
471 * should stop our fast retransmit phase; more acks may come
472 * later without data to help us. Unfortunately this would make
473 * the code below much more complex. For now if I see such
474 * a packet I clear the fast retransmit phase.
476 if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
477 /* This is the standard reno style fast retransmit branch. */
479 /* 1. When the third duplicate ack is received, set ssthresh
480 * to one half the current congestion window, but no less
481 * than two segments. Retransmit the missing segment.
483 if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
484 tp->dup_acks++;
485 if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
486 tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
487 tp->snd_cwnd = (tp->snd_ssthresh + 3);
488 tp->high_seq = tp->snd_nxt;
489 if(!tp->fackets_out)
490 tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
491 else
492 tcp_fack_retransmit(sk);
493 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
497 /* 2. Each time another duplicate ACK arrives, increment
498 * cwnd by the segment size. [...] Transmit a packet...
500 * Packet transmission will be done on normal flow processing
501 * since we're not in "retransmit mode". We do not use duplicate
502 * ACKs to artificially inflate the congestion window when
503 * doing FACK.
505 if (tp->dup_acks > 3) {
506 if(!tp->fackets_out) {
507 tp->snd_cwnd++;
508 } else {
509 /* Fill any further holes which may have appeared.
510 * We may want to change this to run every further
511 * multiple-of-3 dup ack increments, to be more robust
512 * against out-of-order packet delivery. -DaveM
514 tcp_fack_retransmit(sk);
517 } else if (tp->high_seq != 0) {
518 /* In this branch we deal with clearing the Floyd style
519 * block on duplicate fast retransmits, and if requested
520 * we do Hoe style secondary fast retransmits.
522 if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
523 /* Once we have acked all the packets up to high_seq
524 * we are done with this fast retransmit phase.
525 * Alternatively data arrived. In this case we
526 * have to abort the fast retransmit attempt.
527 * Note that we do want to accept a window
528 * update since this is expected with Hoe's algorithm.
530 clear_fast_retransmit(tp);
532 /* After we have cleared up to high_seq we can
533 * clear the Floyd style block.
535 if (!before(ack, tp->high_seq)) {
536 tp->high_seq = 0;
537 tp->fackets_out = 0;
539 } else if (tp->dup_acks >= 3) {
540 if (!tp->fackets_out) {
541 /* Hoe Style. We didn't ack the whole
542 * window. Take this as a cue that
543 * another packet was lost and retransmit it.
544 * Don't muck with the congestion window here.
545 * Note that we have to be careful not to
546 * act if this was a window update and it
547 * didn't ack new data, since this does
548 * not indicate a packet left the system.
549 * We can test this by just checking
550 * if ack changed from snd_una, since
551 * the only way to get here without advancing
552 * from snd_una is if this was a window update.
554 if (ack != tp->snd_una && before(ack, tp->high_seq)) {
555 tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
556 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
558 } else {
559 /* FACK style, fill any remaining holes in
560 * receiver's queue.
562 tcp_fack_retransmit(sk);
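/* Illustrative Reno numbers for the code above: if snd_cwnd is 16 when
 * the third duplicate ACK arrives, snd_ssthresh becomes 8 and snd_cwnd
 * is set to 8 + 3 = 11; each further duplicate ACK (when not doing
 * FACK) inflates snd_cwnd by one segment, and clear_fast_retransmit()
 * later deflates it back to snd_ssthresh.
 */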
568 /* This is Jacobson's slow start and congestion avoidance.
569 * SIGCOMM '88, p. 328.
571 static void tcp_cong_avoid(struct tcp_opt *tp)
573 if (tp->snd_cwnd <= tp->snd_ssthresh) {
574 /* In "safe" area, increase. */
575 tp->snd_cwnd++;
576 } else {
577 /* In dangerous area, increase slowly.
578 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
580 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
581 tp->snd_cwnd++;
582 tp->snd_cwnd_cnt=0;
583 } else
584 tp->snd_cwnd_cnt++;
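/* In the congestion avoidance branch this is roughly one extra segment
 * per window of ACKs: snd_cwnd_cnt must count up to snd_cwnd before
 * snd_cwnd grows by one, approximating cwnd += 1/cwnd per ACK.
 */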
588 /* Remove acknowledged frames from the retransmission queue. */
589 static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
590 __u32 *seq, __u32 *seq_rtt)
592 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
593 struct sk_buff *skb;
594 unsigned long now = jiffies;
595 int acked = 0;
597 while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
598 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
599 __u8 sacked = scb->sacked;
601 /* If our packet is before the ack sequence we can
602 * discard it as it's confirmed to have arrived at
603 * the other end.
605 if (after(scb->end_seq, ack))
606 break;
608 /* Initial outgoing SYN's get put onto the write_queue
609 * just like anything else we transmit. It is not
610 * true data, and if we misinform our callers that
611 * this ACK acks real data, we will erroneously exit
612 * connection startup slow start one packet too
613 * quickly. This is severely frowned upon behavior.
615 if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
616 tp->retrans_out--;
617 if(!(scb->flags & TCPCB_FLAG_SYN)) {
618 acked |= FLAG_DATA_ACKED;
619 if(sacked & TCPCB_SACKED_RETRANS)
620 acked |= FLAG_RETRANS_DATA_ACKED;
621 if(tp->fackets_out)
622 tp->fackets_out--;
623 } else {
624 tp->retrans_head = NULL;
626 tp->packets_out--;
627 *seq = scb->seq;
628 *seq_rtt = now - scb->when;
629 __skb_unlink(skb, skb->list);
630 kfree_skb(skb);
633 if (acked)
634 tp->retrans_head = NULL;
635 return acked;
638 static void tcp_ack_probe(struct sock *sk, __u32 ack)
640 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
642 /* Our probe was answered. */
643 tp->probes_out = 0;
645 /* Did the ACK open a usable window? */
647 /* should always be non-null */
648 if (tp->send_head != NULL &&
649 !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
650 tp->backoff = 0;
651 tp->pending = 0;
652 tcp_clear_xmit_timer(sk, TIME_PROBE0);
653 } else {
654 tcp_reset_xmit_timer(sk, TIME_PROBE0,
655 min(tp->rto << tp->backoff, 120*HZ));
659 /* Read draft-ietf-tcplw-high-performance before mucking
660 * with this code. (Supersedes RFC1323)
662 static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
663 u32 seq, u32 ack, int flag)
665 __u32 seq_rtt;
667 /* RTTM Rule: A TSecr value received in a segment is used to
668 * update the averaged RTT measurement only if the segment
669 * acknowledges some new data, i.e., only if it advances the
670 * left edge of the send window.
672 * See draft-ietf-tcplw-high-performance-00, section 3.3.
673 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
675 if (!(flag & FLAG_DATA_ACKED))
676 return;
678 seq_rtt = jiffies-tp->rcv_tsecr;
679 tcp_rtt_estimator(tp, seq_rtt);
680 if (tp->retransmits) {
681 if (tp->packets_out == 0) {
682 tp->retransmits = 0;
683 tp->fackets_out = 0;
684 tp->retrans_out = 0;
685 tp->backoff = 0;
686 tcp_set_rto(tp);
687 } else {
688 /* Still retransmitting, use backoff */
689 tcp_set_rto(tp);
690 tp->rto = tp->rto << tp->backoff;
692 } else {
693 tcp_set_rto(tp);
694 tcp_cong_avoid(tp);
696 /* NOTE: safe here so long as cong_ctl doesn't use rto */
697 tcp_bound_rto(tp);
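/* Note that the seq_rtt computation above assumes tsval was stamped
 * with jiffies on transmit, so the echoed rcv_tsecr is the send time
 * of the newly acknowledged data and jiffies - rcv_tsecr is an RTT
 * sample measured in jiffies.
 */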
700 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
702 struct sk_buff *skb = skb_peek(&sk->write_queue);
703 long when = tp->rto - (jiffies - TCP_SKB_CB(skb)->when);
705 /* Some data was ACK'd, if still retransmitting (due to a
706 * timeout), resend more of the retransmit queue. The
707 * congestion window is handled properly by that code.
709 if (tp->retransmits) {
710 tp->retrans_head = NULL;
711 tcp_xmit_retransmit_queue(sk);
712 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
713 } else {
714 tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
718 /* This routine deals with incoming acks, but not outgoing ones. */
719 static int tcp_ack(struct sock *sk, struct tcphdr *th,
720 u32 ack_seq, u32 ack, int len)
722 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
723 int flag = 0;
724 u32 seq = 0;
725 u32 seq_rtt = 0;
727 if(sk->zapped)
728 return(1); /* Dead, can't ack any more so why bother */
730 if (tp->pending == TIME_KEEPOPEN)
731 tp->probes_out = 0;
733 tp->rcv_tstamp = jiffies;
735 /* If the ack is newer than sent or older than previous acks
736 * then we can probably ignore it.
738 if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
739 goto uninteresting_ack;
741 dst_confirm(sk->dst_cache);
743 /* If the segment carries data, set FLAG_DATA. */
744 if (len != th->doff*4) {
745 flag |= FLAG_DATA;
746 tcp_delack_estimator(tp);
749 /* Update our send window. */
751 /* This is the window update code as per RFC 793
752 * snd_wl{1,2} are used to prevent unordered
753 * segments from shrinking the window
755 if (before(tp->snd_wl1, ack_seq) ||
756 (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
757 u32 nwin = ntohs(th->window) << tp->snd_wscale;
759 if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
760 flag |= FLAG_WIN_UPDATE;
761 tp->snd_wnd = nwin;
763 tp->snd_wl1 = ack_seq;
764 tp->snd_wl2 = ack;
766 if (nwin > tp->max_window)
767 tp->max_window = nwin;
771 /* We passed data and got it acked, remove any soft error
772 * log. Something worked...
774 sk->err_soft = 0;
776 /* If this ack opens up a zero window, clear backoff. It was
777 * being used to time the probes, and is probably far higher than
778 * it needs to be for normal retransmission.
780 if (tp->pending == TIME_PROBE0)
781 tcp_ack_probe(sk, ack);
783 /* See if we can take anything off of the retransmit queue. */
784 flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
786 /* If we have a timestamp, we always do rtt estimates. */
787 if (tp->saw_tstamp) {
788 tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
789 } else {
790 /* If we were retransmitting, don't count the rtt estimate. */
791 if (tp->retransmits) {
792 if (tp->packets_out == 0) {
793 tp->retransmits = 0;
794 tp->fackets_out = 0;
795 tp->retrans_out = 0;
797 } else {
798 /* We don't have a timestamp. Can only use
799 * packets that are not retransmitted to determine
800 * rtt estimates. Also, we must not reset the
801 * backoff for rto until we get a non-retransmitted
802 * packet. This allows us to deal with a situation
803 * where the network delay has increased suddenly.
804 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
806 if (flag & FLAG_DATA_ACKED) {
807 if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
808 tp->backoff = 0;
809 tcp_rtt_estimator(tp, seq_rtt);
810 tcp_set_rto(tp);
811 tcp_bound_rto(tp);
813 tcp_cong_avoid(tp);
818 if (tp->packets_out) {
819 if (flag & FLAG_DATA_ACKED)
820 tcp_ack_packets_out(sk, tp);
821 } else {
822 tcp_clear_xmit_timer(sk, TIME_RETRANS);
825 flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
826 if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
827 (tp->high_seq != 0)) {
828 tcp_fast_retrans(sk, ack, flag);
829 } else {
830 /* Clear any aborted fast retransmit starts. */
831 tp->dup_acks = 0;
833 /* Remember the highest ack received. */
834 tp->snd_una = ack;
835 return 1;
837 uninteresting_ack:
838 SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
839 return 0;
842 /* New-style handling of TIME_WAIT sockets. */
843 extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
844 extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
845 extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
847 void tcp_timewait_kill(struct tcp_tw_bucket *tw)
849 /* Unlink from various places. */
850 if(tw->bind_next)
851 tw->bind_next->bind_pprev = tw->bind_pprev;
852 *(tw->bind_pprev) = tw->bind_next;
853 if(tw->tb->owners == NULL)
854 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
856 if(tw->next)
857 tw->next->pprev = tw->pprev;
858 *tw->pprev = tw->next;
860 /* We decremented the prot->inuse count when we entered TIME_WAIT
861 * and the sock from which this came was destroyed.
863 tw->sklist_next->sklist_prev = tw->sklist_prev;
864 tw->sklist_prev->sklist_next = tw->sklist_next;
866 /* Ok, now free it up. */
867 kmem_cache_free(tcp_timewait_cachep, tw);
870 /* We come here as a special case from the AF specific TCP input processing,
871 * and the SKB has no owner. Essentially handling this is very simple,
872 * we just keep silently eating rx'd packets until none show up for the
873 * entire timeout period. The only special cases are for BSD TIME_WAIT
874 * reconnects and SYN/RST bits being set in the TCP header.
876 int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
877 struct tcphdr *th, unsigned len)
879 /* RFC 1122:
880 * "When a connection is [...] on TIME-WAIT state [...]
881 * [a TCP] MAY accept a new SYN from the remote TCP to
882 * reopen the connection directly, if it:
884 * (1) assigns its initial sequence number for the new
885 * connection to be larger than the largest sequence
886 * number it used on the previous connection incarnation,
887 * and
889 * (2) returns to TIME-WAIT state if the SYN turns out
890 * to be an old duplicate".
892 if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
893 struct sock *sk;
894 struct tcp_func *af_specific = tw->af_specific;
895 __u32 isn;
897 isn = tw->rcv_nxt + 128000;
898 if(isn == 0)
899 isn++;
900 tcp_tw_deschedule(tw);
901 tcp_timewait_kill(tw);
902 sk = af_specific->get_sock(skb, th);
903 if(sk == NULL || !ipsec_sk_policy(sk,skb))
904 return 0;
905 skb_set_owner_r(skb, sk);
906 af_specific = sk->tp_pinfo.af_tcp.af_specific;
907 if(af_specific->conn_request(sk, skb, isn) < 0)
908 return 1; /* Toss a reset back. */
909 return 0; /* Discard the frame. */
912 /* Check RST or SYN */
913 if(th->rst || th->syn) {
914 /* This is TIME_WAIT assassination, in two flavors.
915 * Oh well... nobody has a sufficient solution to this
916 * protocol bug yet.
918 if(sysctl_tcp_rfc1337 == 0) {
919 tcp_tw_deschedule(tw);
920 tcp_timewait_kill(tw);
922 if(!th->rst)
923 return 1; /* toss a reset back */
924 } else {
925 /* In this case we must reset the TIMEWAIT timer. */
926 if(th->ack)
927 tcp_tw_reschedule(tw);
929 return 0; /* Discard the frame. */
932 /* Enter the time wait state. This is always called from BH
933 * context. Essentially we whip up a timewait bucket, copy the
934 * relevant info into it from the SK, and mess with hash chains
935 * and list linkage.
937 static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
939 struct sock **head, *sktw;
941 /* Step 1: Remove SK from established hash. */
942 if(sk->next)
943 sk->next->pprev = sk->pprev;
944 *sk->pprev = sk->next;
945 sk->pprev = NULL;
946 tcp_reg_zap(sk);
948 /* Step 2: Put TW into bind hash where SK was. */
949 tw->tb = (struct tcp_bind_bucket *)sk->prev;
950 if((tw->bind_next = sk->bind_next) != NULL)
951 sk->bind_next->bind_pprev = &tw->bind_next;
952 tw->bind_pprev = sk->bind_pprev;
953 *sk->bind_pprev = (struct sock *)tw;
955 /* Step 3: Same for the protocol sklist. */
956 (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
957 (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
958 sk->sklist_next = NULL;
959 sk->prot->inuse--;
961 /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
962 head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)];
963 sktw = (struct sock *)tw;
964 if((sktw->next = *head) != NULL)
965 (*head)->pprev = &sktw->next;
966 *head = sktw;
967 sktw->pprev = head;
970 void tcp_time_wait(struct sock *sk)
972 struct tcp_tw_bucket *tw;
974 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
975 if(tw != NULL) {
976 /* Give us an identity. */
977 tw->daddr = sk->daddr;
978 tw->rcv_saddr = sk->rcv_saddr;
979 tw->bound_dev_if= sk->bound_dev_if;
980 tw->num = sk->num;
981 tw->state = TCP_TIME_WAIT;
982 tw->sport = sk->sport;
983 tw->dport = sk->dport;
984 tw->family = sk->family;
985 tw->reuse = sk->reuse;
986 tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
987 tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
989 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
990 if(tw->family == PF_INET6) {
991 memcpy(&tw->v6_daddr,
992 &sk->net_pinfo.af_inet6.daddr,
993 sizeof(struct in6_addr));
994 memcpy(&tw->v6_rcv_saddr,
995 &sk->net_pinfo.af_inet6.rcv_saddr,
996 sizeof(struct in6_addr));
998 #endif
999 /* Linkage updates. */
1000 tcp_tw_hashdance(sk, tw);
1002 /* Get the TIME_WAIT timeout firing. */
1003 tcp_tw_schedule(tw);
1005 /* CLOSE the SK. */
1006 if(sk->state == TCP_ESTABLISHED)
1007 tcp_statistics.TcpCurrEstab--;
1008 sk->state = TCP_CLOSE;
1009 net_reset_timer(sk, TIME_DONE,
1010 min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
1011 } else {
1012 /* Sorry, we're out of memory, just CLOSE this
1013 * socket up. We've got bigger problems than
1014 * non-graceful socket closings.
1016 tcp_set_state(sk, TCP_CLOSE);
1019 /* Prevent rcvmsg/sndmsg calls, and wake people up. */
1020 sk->shutdown = SHUTDOWN_MASK;
1021 if(!sk->dead)
1022 sk->state_change(sk);
1026 * Process the FIN bit. This now behaves as it is supposed to work
1027 * and the FIN takes effect when it is validly part of sequence
1028 * space. Not before when we get holes.
1030 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1031 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1032 * TIME-WAIT)
1034 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1035 * close and we go into CLOSING (and later onto TIME-WAIT)
1037 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1040 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1042 sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1044 tcp_send_ack(sk);
1046 if (!sk->dead) {
1047 sk->state_change(sk);
1048 sock_wake_async(sk->socket, 1);
1051 switch(sk->state) {
1052 case TCP_SYN_RECV:
1053 case TCP_ESTABLISHED:
1054 /* Move to CLOSE_WAIT */
1055 tcp_set_state(sk, TCP_CLOSE_WAIT);
1056 if (th->rst)
1057 sk->shutdown = SHUTDOWN_MASK;
1058 break;
1060 case TCP_CLOSE_WAIT:
1061 case TCP_CLOSING:
1062 /* Received a retransmission of the FIN, do
1063 * nothing.
1065 break;
1066 case TCP_LAST_ACK:
1067 /* RFC793: Remain in the LAST-ACK state. */
1068 break;
1070 case TCP_FIN_WAIT1:
1071 /* This case occurs when a simultaneous close
1072 * happens, we must ack the received FIN and
1073 * enter the CLOSING state.
1075 * This causes a WRITE timeout, which will either
1076 * move on to TIME_WAIT when we timeout, or resend
1077 * the FIN properly (maybe we get rid of that annoying
1078 * FIN lost hang). The TIME_WRITE code is already
1079 * correct for handling this timeout.
1081 tcp_set_state(sk, TCP_CLOSING);
1082 break;
1083 case TCP_FIN_WAIT2:
1084 /* Received a FIN -- send ACK and enter TIME_WAIT. */
1085 tcp_time_wait(sk);
1086 break;
1087 default:
1088 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1089 * cases we should never reach this piece of code.
1091 printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1092 break;
1096 /* These routines update the SACK block as out-of-order packets arrive or
1097 * in-order packets close up the sequence space.
1099 static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1101 int this_sack, num_sacks = tp->num_sacks;
1102 struct tcp_sack_block *swalk = &tp->selective_acks[0];
1104 /* If more than one SACK block, see if the recent change to SP eats into
1105 * or hits the sequence space of other SACK blocks; if so, coalesce.
1107 if(num_sacks != 1) {
1108 for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1109 if(swalk == sp)
1110 continue;
1112 /* First case, bottom of SP moves into top of the
1113 * sequence space of SWALK.
1115 if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1116 sp->start_seq = swalk->start_seq;
1117 goto coalesce;
1119 /* Second case, top of SP moves into bottom of the
1120 * sequence space of SWALK.
1122 if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1123 sp->end_seq = swalk->end_seq;
1124 goto coalesce;
1128 /* SP is the only SACK, or no coalescing cases found. */
1129 return;
1131 coalesce:
1132 /* Zap SWALK, by moving every further SACK up by one slot.
1133 * Decrease num_sacks.
1135 for(this_sack += 1; this_sack < num_sacks-1; this_sack++, swalk++) {
1136 struct tcp_sack_block *next = (swalk + 1);
1137 swalk->start_seq = next->start_seq;
1138 swalk->end_seq = next->end_seq;
1140 tp->num_sacks--;
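/* Example of the coalescing above (illustrative sequence numbers): if
 * SP was just extended to cover 150-300 and SWALK covers 100-200, then
 * SP's start falls inside SWALK, SP grows to 100-300, and SWALK is
 * removed by the slot-shifting loop.
 */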
1143 static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1145 __u32 tmp;
1147 tmp = sack1->start_seq;
1148 sack1->start_seq = sack2->start_seq;
1149 sack2->start_seq = tmp;
1151 tmp = sack1->end_seq;
1152 sack1->end_seq = sack2->end_seq;
1153 sack2->end_seq = tmp;
1156 static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1158 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1159 struct tcp_sack_block *sp = &tp->selective_acks[0];
1160 int cur_sacks = tp->num_sacks;
1162 if (!cur_sacks)
1163 goto new_sack;
1165 /* Optimize for the common case, new ofo frames arrive
1166 * "in order". ;-) This also satisfies the requirements
1167 * of RFC2018 about ordering of SACKs.
1169 if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1170 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1171 tcp_sack_maybe_coalesce(tp, sp);
1172 } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1173 /* Re-ordered arrival, in this case, can be optimized
1174 * as well.
1176 sp->start_seq = TCP_SKB_CB(skb)->seq;
1177 tcp_sack_maybe_coalesce(tp, sp);
1178 } else {
1179 struct tcp_sack_block *swap = sp + 1;
1180 int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1182 /* Oh well, we have to move things around.
1183 * Try to find a SACK we can tack this onto.
1186 for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1187 if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1188 (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1189 if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1190 swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1191 else
1192 swap->start_seq = TCP_SKB_CB(skb)->seq;
1193 tcp_sack_swap(sp, swap);
1194 tcp_sack_maybe_coalesce(tp, sp);
1195 return;
1199 /* Could not find an adjacent existing SACK, build a new one,
1200 * put it at the front, and shift everyone else down. We
1201 * always know there is at least one SACK present already here.
1203 * If the sack array is full, forget about the last one.
1205 if (cur_sacks >= max_sacks) {
1206 cur_sacks--;
1207 tp->num_sacks--;
1209 while(cur_sacks >= 1) {
1210 struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1211 struct tcp_sack_block *prev = (this - 1);
1212 this->start_seq = prev->start_seq;
1213 this->end_seq = prev->end_seq;
1214 cur_sacks--;
1217 new_sack:
1218 /* Build the new head SACK, and we're done. */
1219 sp->start_seq = TCP_SKB_CB(skb)->seq;
1220 sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1221 tp->num_sacks++;
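/* max_sacks above is 3 when timestamps are in use, presumably because
 * a SACK option takes 2 + 8*n bytes and must share the 40 byte option
 * space with the 12 byte timestamp block: three blocks (26 bytes) fit
 * in the remaining 28 bytes, a fourth would not.
 */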
1225 static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1227 struct tcp_sack_block *sp = &tp->selective_acks[0];
1228 int num_sacks = tp->num_sacks;
1229 int this_sack;
1231 /* This is an in order data segment _or_ an out-of-order SKB being
1232 * moved to the receive queue, so we know this removed SKB will eat
1233 * from the front of a SACK.
1235 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1236 /* Check if the start of the sack is covered by skb. */
1237 if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1238 before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1239 break;
1242 /* This should only happen if so many SACKs get built that some get
1243 * pushed out before we get here, or we eat some in sequence packets
1244 * which are before the first SACK block.
1246 if(this_sack >= num_sacks)
1247 return;
1249 sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1250 if(!before(sp->start_seq, sp->end_seq)) {
1251 /* Zap this SACK, by moving forward any other SACKS. */
1252 for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1253 struct tcp_sack_block *next = (sp + 1);
1254 sp->start_seq = next->start_seq;
1255 sp->end_seq = next->end_seq;
1257 tp->num_sacks--;
1261 static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1263 struct tcp_sack_block *sp = &tp->selective_acks[0];
1264 int num_sacks = tp->num_sacks;
1265 int this_sack;
1267 for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1268 if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1269 break;
1271 if(this_sack >= num_sacks)
1272 return;
1273 sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1276 /* This one checks to see if we can put data from the
1277 * out_of_order queue into the receive_queue.
1279 static void tcp_ofo_queue(struct sock *sk)
1281 struct sk_buff *skb;
1282 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1284 while ((skb = skb_peek(&tp->out_of_order_queue))) {
1285 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1286 break;
1288 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1289 SOCK_DEBUG(sk, "ofo packet was already received \n");
1290 __skb_unlink(skb, skb->list);
1291 kfree_skb(skb);
1292 continue;
1294 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1295 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1296 TCP_SKB_CB(skb)->end_seq);
1298 if(tp->sack_ok)
1299 tcp_sack_remove_skb(tp, skb);
1300 __skb_unlink(skb, skb->list);
1301 __skb_queue_tail(&sk->receive_queue, skb);
1302 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1303 if(skb->h.th->fin)
1304 tcp_fin(skb, sk, skb->h.th);
1308 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1310 struct sk_buff *skb1;
1311 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1313 /* Queue data for delivery to the user.
1314 * Packets in sequence go to the receive queue.
1315 * Out of sequence packets to out_of_order_queue.
1317 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1318 /* Ok. In sequence. */
1319 queue_and_out:
1320 dst_confirm(sk->dst_cache);
1321 __skb_queue_tail(&sk->receive_queue, skb);
1322 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1323 if(skb->h.th->fin) {
1324 tcp_fin(skb, sk, skb->h.th);
1325 } else {
1326 tcp_remember_ack(tp, skb->h.th, skb);
1328 /* This may have eaten into a SACK block. */
1329 if(tp->sack_ok && tp->num_sacks)
1330 tcp_sack_remove_skb(tp, skb);
1331 tcp_ofo_queue(sk);
1333 /* Turn on fast path. */
1334 if (skb_queue_len(&tp->out_of_order_queue) == 0)
1335 tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1336 (0x10 << 16) |
1337 tp->snd_wnd);
1338 return;
1341 /* An old packet, either a retransmit or some packet got lost. */
1342 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1343 /* A retransmit, 2nd most common case. Force an immediate ack. */
1344 SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1345 tcp_enter_quickack_mode(tp);
1346 kfree_skb(skb);
1347 return;
1350 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1351 /* Partial packet, seq < rcv_next < end_seq */
1352 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1353 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1354 TCP_SKB_CB(skb)->end_seq);
1356 goto queue_and_out;
1359 /* Ok. This is an out_of_order segment, force an ack. */
1360 tp->delayed_acks++;
1361 tcp_enter_quickack_mode(tp);
1363 /* Disable header prediction. */
1364 tp->pred_flags = 0;
1366 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1367 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1369 if (skb_peek(&tp->out_of_order_queue) == NULL) {
1370 /* Initial out of order segment, build 1 SACK. */
1371 if(tp->sack_ok) {
1372 tp->num_sacks = 1;
1373 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1374 tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1376 __skb_queue_head(&tp->out_of_order_queue,skb);
1377 } else {
1378 for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1379 /* Already there. */
1380 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1381 if (skb->len >= skb1->len) {
1382 if(tp->sack_ok)
1383 tcp_sack_extend(tp, skb1, skb);
1384 __skb_append(skb1, skb);
1385 __skb_unlink(skb1, skb1->list);
1386 kfree_skb(skb1);
1387 } else {
1388 /* A duplicate, smaller than what is in the
1389 * out-of-order queue right now, toss it.
1391 kfree_skb(skb);
1393 break;
1396 if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1397 __skb_append(skb1, skb);
1398 if(tp->sack_ok)
1399 tcp_sack_new_ofo_skb(sk, skb);
1400 break;
1403 /* See if we've hit the start. If so insert. */
1404 if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1405 __skb_queue_head(&tp->out_of_order_queue,skb);
1406 if(tp->sack_ok)
1407 tcp_sack_new_ofo_skb(sk, skb);
1408 break;
1416 * This routine handles the data. If there is room in the buffer,
1417 * it will already have been moved into it. If there is no
1418 * room, then we will just have to discard the packet.
1421 static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1423 struct tcphdr *th;
1424 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1426 th = skb->h.th;
1427 skb_pull(skb, th->doff*4);
1428 skb_trim(skb, len - (th->doff*4));
1430 if (skb->len == 0 && !th->fin)
1431 return(0);
1434 * If our receive queue has grown past its limits shrink it.
1435 * Make sure to do this before moving snd_nxt, otherwise
1436 * data might be acked that we don't have enough room for.
1438 if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
1439 if (prune_queue(sk) < 0) {
1440 /* Still not enough room. That can happen when
1441 * skb->true_size differs significantly from skb->len.
1443 return 0;
1447 tcp_data_queue(sk, skb);
1449 if (before(tp->rcv_nxt, tp->copied_seq)) {
1450 printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1451 tp->rcv_nxt = tp->copied_seq;
1454 /* Above, tcp_data_queue() increments delayed_acks appropriately.
1455 * Now tell the user we may have some data.
1457 if (!sk->dead) {
1458 SOCK_DEBUG(sk, "Data wakeup.\n");
1459 sk->data_ready(sk,0);
1461 return(1);
1464 static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1466 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1468 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1469 tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1470 /* Put more data onto the wire. */
1471 tcp_write_xmit(sk);
1472 } else if (tp->packets_out == 0 && !tp->pending) {
1473 /* Start probing the receivers window. */
1474 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1478 static __inline__ void tcp_data_snd_check(struct sock *sk)
1480 struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1482 if (skb != NULL)
1483 __tcp_data_snd_check(sk, skb);
1487 * Adapt the MSS value used to make delayed ack decision to the
1488 * real world.
1490 static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1492 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1493 unsigned int len = skb->len, lss;
1495 if (len > tp->rcv_mss)
1496 tp->rcv_mss = len;
1497 lss = tp->last_seg_size;
1498 tp->last_seg_size = 0;
1499 if (len >= 536) {
1500 if (len == lss)
1501 tp->rcv_mss = len;
1502 tp->last_seg_size = len;
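/* The net effect appears to be: rcv_mss ratchets up to the largest
 * segment seen, and two consecutive segments of the same size (at
 * least 536 bytes) reset it to that size, so the delayed ACK decision
 * can also adapt downwards if the peer starts sending smaller
 * full-sized segments.
 */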
1507 * Check if sending an ack is needed.
1509 static __inline__ void __tcp_ack_snd_check(struct sock *sk)
1511 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1513 /* This also takes care of updating the window.
1514 * This if statement needs to be simplified.
1516 * Rules for delaying an ack:
1517 * - delay time <= 0.5 HZ
1518 * - we don't have a window update to send
1519 * - must send at least every 2 full sized packets
1520 * - must send an ACK if we have any out of order data
1522 * With an extra heuristic to handle loss of packet
1523 * situations and also helping the sender leave slow
1524 * start in an expedient manner.
1527 /* Two full frames received or... */
1528 if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1529 /* We will update the window "significantly" or... */
1530 tcp_raise_window(sk) ||
1531 /* We entered "quick ACK" mode or... */
1532 tcp_in_quickack_mode(tp) ||
1533 /* We have out of order data */
1534 (skb_peek(&tp->out_of_order_queue) != NULL)) {
1535 /* Then ack it now */
1536 tcp_send_ack(sk);
1537 } else {
1538 /* Else, send delayed ack. */
1539 tcp_send_delayed_ack(tp, HZ/2);
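/* For example, with rcv_mss around 1460 the first test above forces an
 * immediate ACK once roughly 2 * 1460 bytes are unacknowledged (the
 * "every 2 full sized packets" rule listed above); otherwise the ACK
 * is delayed by at most HZ/2.
 */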
1543 static __inline__ void tcp_ack_snd_check(struct sock *sk)
1545 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1546 if (tp->delayed_acks == 0) {
1547 /* We sent a data segment already. */
1548 return;
1550 __tcp_ack_snd_check(sk);
1555 * This routine is only called when we have urgent data
1556 * signalled. It's the 'slow' part of tcp_urg. It could be
1557 * moved inline now as tcp_urg is only called from one
1558 * place. We handle URGent data wrong. We have to - as
1559 * BSD still doesn't use the correction from RFC961.
1560 * For 1003.1g we should support a new option TCP_STDURG to permit
1561 * either form (or just set the sysctl tcp_stdurg).
1564 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1566 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1567 u32 ptr = ntohs(th->urg_ptr);
1569 if (ptr && !sysctl_tcp_stdurg)
1570 ptr--;
1571 ptr += ntohl(th->seq);
1573 /* Ignore urgent data that we've already seen and read. */
1574 if (after(tp->copied_seq, ptr))
1575 return;
1577 /* Do we already have a newer (or duplicate) urgent pointer? */
1578 if (tp->urg_data && !after(ptr, tp->urg_seq))
1579 return;
1581 /* Tell the world about our new urgent pointer. */
1582 if (sk->proc != 0) {
1583 if (sk->proc > 0)
1584 kill_proc(sk->proc, SIGURG, 1);
1585 else
1586 kill_pg(-sk->proc, SIGURG, 1);
1589 /* We may be adding urgent data when the last byte read was
1590 * urgent. To do this requires some care. We cannot just ignore
1591 * tp->copied_seq since we would read the last urgent byte again
1592 * as data, nor can we alter copied_seq until this data arrives
1593 * or we break the semantics of SIOCATMARK (and thus sockatmark())
1595 if (tp->urg_seq == tp->copied_seq)
1596 tp->copied_seq++; /* Move the copied sequence on correctly */
1597 tp->urg_data = URG_NOTYET;
1598 tp->urg_seq = ptr;
1600 /* Disable header prediction. */
1601 tp->pred_flags = 0;
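/* Note on the ptr-- above: with sysctl_tcp_stdurg left at 0 the urgent
 * pointer is taken in the BSD sense, pointing just past the urgent
 * byte, so decrementing it makes urg_seq the sequence number of the
 * urgent byte itself; with the sysctl set, the pointer is assumed to
 * already point at the last urgent byte.
 */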
1604 /* This is the 'fast' part of urgent handling. */
1605 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1607 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1609 /* Check if we get a new urgent pointer - normally not. */
1610 if (th->urg)
1611 tcp_check_urg(sk,th);
1613 /* Do we wait for any urgent data? - normally not... */
1614 if (tp->urg_data == URG_NOTYET) {
1615 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
1617 /* Is the urgent pointer pointing into this packet? */
1618 if (ptr < len) {
1619 tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1620 if (!sk->dead)
1621 sk->data_ready(sk,0);
1627 * Clean first the out_of_order queue, then the receive queue until
1628 * the socket is in its memory limits again.
1630 static int prune_queue(struct sock *sk)
1632 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1633 struct sk_buff * skb;
1635 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
1637 net_statistics.PruneCalled++;
1639 /* First Clean the out_of_order queue. */
1640 /* Start with the end because those are probably the least
1641 * useful packets (crossing fingers).
1643 while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) {
1644 net_statistics.OfoPruned += skb->len;
1645 kfree_skb(skb);
1646 if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
1647 return 0;
1650 /* Now continue with the receive queue if it wasn't enough.
1651 * But only do this if we are really being abused.
1653 while ((atomic_read(&sk->rmem_alloc) >= (sk->rcvbuf * 2)) &&
1654 (skb = skb_peek_tail(&sk->receive_queue))) {
1655 /* Never toss anything when we've seen the FIN.
1656 * It's just too complex to recover from it.
1658 if(skb->h.th->fin)
1659 break;
1661 /* Never remove packets that have already been acked */
1662 if (before(TCP_SKB_CB(skb)->end_seq, tp->last_ack_sent+1)) {
1663 SOCK_DEBUG(sk, "prune_queue: hit acked data c=%x,%x,%x\n",
1664 tp->copied_seq, TCP_SKB_CB(skb)->end_seq,
1665 tp->last_ack_sent);
1666 return -1;
1669 net_statistics.RcvPruned += skb->len;
1671 __skb_unlink(skb, skb->list);
1672 tp->rcv_nxt = TCP_SKB_CB(skb)->seq;
1673 SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n",
1674 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
1675 tp->copied_seq);
1676 kfree_skb(skb);
1678 return 0;
1682 * TCP receive function for the ESTABLISHED state.
1684 * It is split into a fast path and a slow path. The fast path is
1685 * disabled when:
1686 * - A zero window was announced from us - zero window probing
1687 * is only handled properly in the slow path.
1688 * - Out of order segments arrived.
1689 * - Urgent data is expected.
1690 * - There is no buffer space left
1691 * - Unexpected TCP flags/window values/header lengths are received
1692 * (detected by checking the TCP header against pred_flags)
1693 * - Data is sent in both directions. Fast path only supports pure senders
1694 * or pure receivers (this means either the sequence number or the ack
1695 * value must stay constant)
1697 * When these conditions are not satisfied it drops into a standard
1698 * receive procedure patterned after RFC793 to handle all cases.
1699 * The first three cases are guaranteed by proper pred_flags setting,
1700 * the rest is checked inline. Fast processing is turned on in
1701 * tcp_data_queue when everything is OK.
1703 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
1704 struct tcphdr *th, unsigned len)
1706 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1707 int queued;
1708 u32 flg;
1711 * Header prediction.
1712 * The code follows the one in the famous
1713 * "30 instruction TCP receive" Van Jacobson mail.
1715 * Van's trick is to deposit buffers into socket queue
1716 * on a device interrupt, to call tcp_recv function
1717 * on the receive process context and checksum and copy
1718 * the buffer to user space. smart...
1720 * Our current scheme is not silly either but we take the
1721 * extra cost of the net_bh soft interrupt processing...
1722 * We do checksum and copy also but from device to kernel.
1726 * RFC1323: H1. Apply PAWS check first.
1728 if (tcp_fast_parse_options(sk, th, tp)) {
1729 if (tp->saw_tstamp) {
1730 if (tcp_paws_discard(tp, th, len)) {
1731 if (!th->rst) {
1732 tcp_send_ack(sk);
1733 goto discard;
1736 tcp_replace_ts_recent(sk, tp,
1737 TCP_SKB_CB(skb)->seq,
1738 TCP_SKB_CB(skb)->end_seq);
1742 flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16);
1744 /* pred_flags is 0xS?10 << 16 + snd_wnd
1746 * if header prediction is to be made
1746 * 'S' will always be tp->tcp_header_len >> 2
1747 * '?' will be 0 else it will be !0
1748 * (when there are holes in the receive
1749 * space for instance)
1750 * PSH flag is ignored.
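/* For example, on a connection using timestamps tcp_header_len is 32,
 * so doff is 8 and pred_flags is htonl(0x80100000 | snd_wnd): header
 * length nibble 8, only the ACK bit (0x10) in the flags byte, and the
 * expected window in the low 16 bits.
 */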
1753 if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1754 if (len <= th->doff*4) {
1755 /* Bulk data transfer: sender */
1756 if (len == th->doff*4) {
1757 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
1758 TCP_SKB_CB(skb)->ack_seq, len);
1759 kfree_skb(skb);
1760 tcp_data_snd_check(sk);
1761 return 0;
1762 } else { /* Header too small */
1763 tcp_statistics.TcpInErrs++;
1764 goto discard;
1766 } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
1767 atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
1768 /* Bulk data transfer: receiver */
1769 __skb_pull(skb,th->doff*4);
1771 tcp_measure_rcv_mss(sk, skb);
1773 /* DO NOT notify forward progress here.
1774 * It saves a dozen CPU instructions in the fast path. --ANK
1776 __skb_queue_tail(&sk->receive_queue, skb);
1777 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1779 /* FIN bit check is not done since if FIN is set in
1780 * this frame, the pred_flags won't match up. -DaveM
1782 sk->data_ready(sk, 0);
1783 tcp_delack_estimator(tp);
1785 tcp_remember_ack(tp, th, skb);
1787 __tcp_ack_snd_check(sk);
1788 return 0;
1793 * Standard slow path.
1796 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
1797 /* RFC793, page 37: "In all states except SYN-SENT, all reset
1798 * (RST) segments are validated by checking their SEQ-fields."
1799 * And page 69: "If an incoming segment is not acceptable,
1800 * an acknowledgment should be sent in reply (unless the RST bit
1801 * is set, if so drop the segment and return)".
1803 if (th->rst)
1804 goto discard;
1805 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1806 SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
1807 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
1808 tp->rcv_wup, tp->rcv_wnd);
1810 tcp_send_ack(sk);
1811 goto discard;
1814 if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
1815 SOCK_DEBUG(sk, "syn in established state\n");
1816 tcp_statistics.TcpInErrs++;
1817 tcp_reset(sk, skb);
1818 return 1;
1821 if(th->rst) {
1822 tcp_reset(sk,skb);
1823 goto discard;
1826 if(th->ack)
1827 tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
1829 /* Process urgent data. */
1830 tcp_urg(sk, th, len);
1832 /* step 7: process the segment text */
1833 queued = tcp_data(skb, sk, len);
1835 /* This must be after tcp_data() does the skb_pull() to
1836 * remove the header size from skb->len.
1838 * Dave!!! Phrase above (and all about rcv_mss) has
1839 * nothing to do with reality. rcv_mss must measure TOTAL
1840 * size, including sacks, IP options etc. Hence, measure_rcv_mss
1841 * must occur before pulling etc, otherwise it will flap
1842 * like hell. Even putting it before tcp_data is wrong,
1843 * it should use skb->tail - skb->nh.raw instead.
1844 * --ANK (980805)
1846 * BTW I broke it. Now all TCP options are handled equally
1847 * in mss_clamp calculations (i.e. ignored, rfc1122),
1848 * and mss_cache does include all of them (i.e. tstamps)
1849 * except for sacks, to calculate effective mss faster.
1850 * --ANK (980805)
1852 tcp_measure_rcv_mss(sk, skb);
1854 /* Be careful, tcp_data() may have put this into TIME_WAIT. */
1855 if(sk->state != TCP_CLOSE) {
1856 tcp_data_snd_check(sk);
1857 tcp_ack_snd_check(sk);
1860 if (!queued) {
1861 discard:
1862 kfree_skb(skb);
1865 return 0;
1869 * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented
1870 * as an open_request.
1873 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
1874 struct open_request *req)
1876 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1877 u32 flg;
1879 /* assumption: the socket is not in use.
1880 * as we checked the user count on tcp_rcv and we're
1881 * running from a soft interrupt.
1884 /* Check for syn retransmission */
1885 flg = *(((u32 *)skb->h.th) + 3);
1887 flg &= __constant_htonl(0x00170000);
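/* The mask keeps only FIN|SYN|RST|ACK (0x17 = 0x01|0x02|0x04|0x10)
 * from the flags byte; the comparison below against 0x00020000
 * therefore tests for a bare SYN.
 */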
1888 /* Only SYN set? */
1889 if (flg == __constant_htonl(0x00020000)) {
1890 if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
1891 /* retransmitted syn.
1893 req->class->rtx_syn_ack(sk, req);
1894 return NULL;
1895 } else {
1896 return sk; /* Pass new SYN to the listen socket. */
1900 /* We know it's an ACK here */
1901 if (req->sk) {
1902 /* socket already created but not
1903 * yet accepted()...
1905 sk = req->sk;
1906 } else {
1907 /* In theory the packet could be for a cookie, but
1908 * TIME_WAIT should guard us against this.
1909 * XXX: Nevertheless check for cookies?
1910 * This sequence number check is done again later,
1911 * but we do it here to prevent syn flood attackers
1912 * from creating big SYN_RECV sockets.
1914 if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
1915 !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
1916 req->rcv_isn+1+req->rcv_wnd)) {
1917 req->class->send_reset(skb);
1918 return NULL;
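/* The ACK is in range: it acknowledges nothing beyond our SYN
 * (at most snt_isn+1) and its sequence number falls inside the
 * window we advertised for this request, so let the address-family
 * code build the child socket.
 */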
1921 sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
1922 tcp_dec_slow_timer(TCP_SLT_SYNACK);
1923 if (sk == NULL)
1924 return NULL;
1926 req->expires = 0UL;
1927 req->sk = sk;
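/* Drop whatever owned this buffer before and charge it to the
 * newly created socket instead.
 */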
1929 skb_orphan(skb);
1930 skb_set_owner_r(skb, sk);
1931 return sk;
1935 * This function implements the receiving procedure of RFC 793 for
1936 * all states except ESTABLISHED and TIME_WAIT.
1937 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
1938 * address independent.
1941 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
1942 struct tcphdr *th, unsigned len)
1944 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1945 int queued = 0;
1947 /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */
1948 switch (sk->state) {
1949 case TCP_LISTEN:
1950 /* These use the socket TOS;
1951 * they might want to use the received TOS instead.
1953 if(th->ack)
1954 return 1;
1956 if(th->syn) {
1957 if(tp->af_specific->conn_request(sk, skb, 0) < 0)
1958 return 1;
1960 /* Now we have several options: In theory there is
1961 * nothing else in the frame. KA9Q has an option to
1962 * send data with the syn, BSD accepts data with the
1963 * syn up to the [to be] advertised window and
1964 * Solaris 2.1 gives you a protocol error. For now
1965 * we just ignore it, which fits the spec precisely
1966 * and avoids incompatibilities. It would be nice in
1967 * future to drop through and process the data.
1969 * Now that TTCP is starting to be used we ought to
1970 * queue this data.
1971 * But, this leaves one open to an easy denial of
1972 * service attack, and SYN cookies can't defend
1973 * against this problem. So, we drop the data
1974 * in the interest of security over speed.
1976 goto discard;
1979 goto discard;
1980 break;
1982 case TCP_SYN_SENT:
1983 /* SYN sent means we have to look for a suitable ack and
1984 * either reset for bad matches or go to connected.
1985 * The SYN_SENT case is unusual and should
1986 * not be in the in-line code. [AC]
1988 if(th->ack) {
1989 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
1991 /* We got an ack, but it's not a good ack. */
1992 if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
1993 TCP_SKB_CB(skb)->ack_seq, len)) {
1994 sk->err = ECONNRESET;
1995 sk->state_change(sk);
1996 tcp_statistics.TcpAttemptFails++;
1997 return 1;
2000 if(th->rst) {
2001 tcp_reset(sk,skb);
2002 goto discard;
2005 if(!th->syn) {
2006 /* A valid ack, but from a different connection
2007 * attempt. Shouldn't happen, but cover it.
2009 sk->err = ECONNRESET;
2010 sk->state_change(sk);
2011 tcp_statistics.TcpAttemptFails++;
2012 return 1;
2015 /* Ok.. it's good. Set up sequence numbers and
2016 * move to established.
2018 tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2019 tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2021 /* RFC1323: The window in SYN & SYN/ACK segments is
2022 * never scaled.
2024 tp->snd_wnd = htons(th->window);
2025 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2026 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
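/* snd_wl1/snd_wl2 record the seq/ack of the segment that last
 * updated snd_wnd (SND.WL1/SND.WL2 in RFC 793); later ACK
 * processing uses them to decide when the window may be updated.
 */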
2027 tp->fin_seq = TCP_SKB_CB(skb)->seq;
2029 tcp_set_state(sk, TCP_ESTABLISHED);
2030 tcp_parse_options(sk, th, tp, 0);
2032 if (tp->wscale_ok == 0) {
2033 tp->snd_wscale = tp->rcv_wscale = 0;
2034 tp->window_clamp = min(tp->window_clamp,65535);
2037 if (tp->tstamp_ok) {
2038 tp->tcp_header_len =
2039 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2040 } else
2041 tp->tcp_header_len = sizeof(struct tcphdr);
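/* When timestamps were negotiated, every later segment carries the
 * 12-byte aligned (NOP,NOP,TIMESTAMP) option of RFC 1323, which is
 * what the larger expected header length above accounts for.
 */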
2042 if (tp->saw_tstamp) {
2043 tp->ts_recent = tp->rcv_tsval;
2044 tp->ts_recent_stamp = jiffies;
2047 /* Can't be earlier, doff would be wrong. */
2048 tcp_send_ack(sk);
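/* (The ACK's data offset presumably comes from tp->tcp_header_len,
 * which was only set just above.)
 */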
2050 sk->dport = th->source;
2051 tp->copied_seq = tp->rcv_nxt;
2053 if(!sk->dead) {
2054 sk->state_change(sk);
2055 sock_wake_async(sk->socket, 0);
2057 } else {
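/* No ACK, but a SYN: this is a simultaneous open (RFC 793).
 * Record the peer's ISN, move to SYN_RECV and answer with our
 * own SYN-ACK.
 */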
2058 if(th->syn && !th->rst) {
2059 /* The previous version of the code
2060 * checked for "connecting to self"
2061 * here. That check is now done in
2062 * tcp_connect.
2064 tcp_set_state(sk, TCP_SYN_RECV);
2065 tcp_parse_options(sk, th, tp, 0);
2066 if (tp->saw_tstamp) {
2067 tp->ts_recent = tp->rcv_tsval;
2068 tp->ts_recent_stamp = jiffies;
2071 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2072 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2074 /* RFC1323: The window in SYN & SYN/ACK segments is
2075 * never scaled.
2077 tp->snd_wnd = htons(th->window);
2078 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2080 tcp_send_synack(sk);
2081 } else
2082 break;
2085 /* tp->tcp_header_len and tp->mss_clamp
2086 probably changed; synchronize the mss.
2088 tcp_sync_mss(sk, tp->pmtu_cookie);
2089 tp->rcv_mss = tp->mss_cache;
2091 if (sk->state == TCP_SYN_RECV)
2092 goto discard;
2094 goto step6;
2097 /* Parse the tcp_options present on this header.
2098 * By this point we really only expect timestamps.
2099 * Note that this really has to be here and not later for PAWS
2100 * (RFC1323) to work.
2102 if (tcp_fast_parse_options(sk, th, tp)) {
2103 /* NOTE: assumes saw_tstamp is never set if we didn't
2104 * negotiate the option. tcp_fast_parse_options() must
2105 * guarantee this.
2107 if (tp->saw_tstamp) {
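/* PAWS (RFC 1323): a segment whose timestamp is older than
 * ts_recent is treated as an old duplicate; drop it, but still
 * ACK it unless it carries a RST. Otherwise ts_recent may be
 * advanced below.
 */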
2108 if (tcp_paws_discard(tp, th, len)) {
2109 if (!th->rst) {
2110 tcp_send_ack(sk);
2111 goto discard;
2114 tcp_replace_ts_recent(sk, tp,
2115 TCP_SKB_CB(skb)->seq,
2116 TCP_SKB_CB(skb)->end_seq);
2120 /* step 1: check sequence number */
2121 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
2122 if (!th->rst) {
2123 tcp_send_ack(sk);
2124 goto discard;
2128 /* step 2: check RST bit */
2129 if(th->rst) {
2130 tcp_reset(sk,skb);
2131 goto discard;
2134 /* step 3: check security and precedence [ignored] */
2136 /* step 4:
2138 * Check for a SYN, and ensure it matches the SYN we were
2139 * first sent. We have to handle the rather unusual (but valid)
2140 * sequence that KA9Q derived products may generate of
2142 * SYN
2143 * SYN|ACK Data
2144 * ACK (lost)
2145 * SYN|ACK Data + More Data
2146 * .. we must ACK not RST...
2148 * We keep syn_seq as the sequence space occupied by the
2149 * original syn.
2152 if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2153 tcp_reset(sk, skb);
2154 return 1;
2157 /* step 5: check the ACK field */
2158 if (th->ack) {
2159 int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2160 TCP_SKB_CB(skb)->ack_seq, len);
2162 switch(sk->state) {
2163 case TCP_SYN_RECV:
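/* The ACK of our SYN-ACK completes the three-way handshake:
 * the passive open becomes ESTABLISHED and the send variables
 * are seeded from this segment.
 */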
2164 if (acceptable) {
2165 tcp_set_state(sk, TCP_ESTABLISHED);
2166 sk->dport = th->source;
2167 tp->copied_seq = tp->rcv_nxt;
2169 if(!sk->dead)
2170 sk->state_change(sk);
2172 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
2173 tp->snd_wnd = htons(th->window) << tp->snd_wscale;
2174 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2175 tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2177 } else {
2178 SOCK_DEBUG(sk, "bad ack\n");
2179 return 1;
2181 break;
2183 case TCP_FIN_WAIT1:
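/* Our FIN is acked once snd_una catches up with write_seq; then
 * move to FIN_WAIT2. If the application already closed the
 * socket (sk->dead), arm the timer so a peer that never sends
 * its FIN cannot keep us in FIN_WAIT2 forever.
 */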
2184 if (tp->snd_una == tp->write_seq) {
2185 sk->shutdown |= SEND_SHUTDOWN;
2186 tcp_set_state(sk, TCP_FIN_WAIT2);
2187 if (!sk->dead)
2188 sk->state_change(sk);
2189 else
2190 tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
2192 break;
2194 case TCP_CLOSING:
2195 if (tp->snd_una == tp->write_seq) {
2196 tcp_time_wait(sk);
2197 goto discard;
2199 break;
2201 case TCP_LAST_ACK:
2202 if (tp->snd_una == tp->write_seq) {
2203 sk->shutdown = SHUTDOWN_MASK;
2204 tcp_set_state(sk,TCP_CLOSE);
2205 if (!sk->dead)
2206 sk->state_change(sk);
2207 goto discard;
2209 break;
2211 } else
2212 goto discard;
2214 step6:
2215 /* step 6: check the URG bit */
2216 tcp_urg(sk, th, len);
2218 /* step 7: process the segment text */
2219 switch (sk->state) {
2220 case TCP_CLOSE_WAIT:
2221 case TCP_CLOSING:
2222 if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
2223 break;
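/* Data that starts before the peer's FIN may still be valid;
 * fall through and treat it like normal segment text.
 */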
2225 case TCP_FIN_WAIT1:
2226 case TCP_FIN_WAIT2:
2227 /* RFC 793 says to queue data in these states,
2228 * RFC 1122 says we MUST send a reset.
2229 * BSD 4.4 also does reset.
2231 if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
2232 if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
2233 tcp_reset(sk, skb);
2234 return 1;
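/* Otherwise fall through and queue the segment text as usual. */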
2238 case TCP_ESTABLISHED:
2239 queued = tcp_data(skb, sk, len);
2241 /* This must be after tcp_data() does the skb_pull() to
2242 * remove the header size from skb->len.
2244 tcp_measure_rcv_mss(sk, skb);
2245 break;
2248 tcp_data_snd_check(sk);
2249 tcp_ack_snd_check(sk);
2251 if (!queued) {
2252 discard:
2253 kfree_skb(skb);
2255 return 0;