/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */
#include <linux/module.h>
#include <net/tcp.h>
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries;
static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer(unsigned long data);

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
void tcp_init_xmit_timers(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);

        init_timer(&tp->retransmit_timer);
        tp->retransmit_timer.function = &tcp_write_timer;
        tp->retransmit_timer.data = (unsigned long)sk;
        tp->pending = 0;

        init_timer(&tp->delack_timer);
        tp->delack_timer.function = &tcp_delack_timer;
        tp->delack_timer.data = (unsigned long)sk;
        tp->ack.pending = 0;

        init_timer(&sk->sk_timer);
        sk->sk_timer.function = &tcp_keepalive_timer;
        sk->sk_timer.data = (unsigned long)sk;
}
void tcp_clear_xmit_timers(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);

        tp->pending = 0;
        sk_stop_timer(sk, &tp->retransmit_timer);

        tp->ack.pending = 0;
        tp->ack.blocked = 0;
        sk_stop_timer(sk, &tp->delack_timer);

        sk_stop_timer(sk, &sk->sk_timer);
}
static void tcp_write_err(struct sock *sk)
{
        sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
        sk->sk_error_report(sk);

        tcp_done(sk);
        NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
}
/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero probe timeout occurs on an orphaned socket.
 *
 * The criterion is still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. The number of orphaned sockets exceeds an administratively
 *    configured limit.
 * 2. We are under strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int orphans = atomic_read(&tcp_orphan_count);

        /* If the peer does not open its window for a long time, or did not
         * transmit anything for a long time, penalize it. */
        if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
                orphans <<= 1;

        /* If some dubious ICMP arrived, penalize even more. */
        if (sk->sk_err_soft)
                orphans <<= 1;

        if (orphans >= sysctl_tcp_max_orphans ||
            (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
             atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
                if (net_ratelimit())
                        printk(KERN_INFO "Out of socket memory\n");

                /* Catch exceptional cases, when connection requires reset.
                 *      1. Last segment was sent recently. */
                if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
                    /*  2. Window is closed. */
                    (!tp->snd_wnd && !tcp_get_pcount(&tp->packets_out)))
                        do_reset = 1;
                if (do_reset)
                        tcp_send_active_reset(sk, GFP_ATOMIC);
                tcp_done(sk);
                NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
                return 1;
        }
        return 0;
}
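
/* Editor's note, a worked sketch of the penalty above: each "orphans <<= 1"
 * halves the effective sysctl_tcp_max_orphans limit for this socket. A
 * stale or non-resettable orphan that also has a pending soft error is
 * compared against the limit at four times its real count, i.e. such
 * sockets are culled once the system holds a quarter of the configured
 * maximum.
 */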
/* Calculate the maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
        int retries = sysctl_tcp_orphan_retries; /* May be zero. */

        /* We know from an ICMP that something is wrong. */
        if (sk->sk_err_soft && !alive)
                retries = 0;

        /* However, if the socket sent something recently, select some safe
         * number of retries. 8 corresponds to >100 seconds with a minimal
         * RTO of 200 msec. */
        if (retries == 0 && alive)
                retries = 8;
        return retries;
}
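
/* Editor's note, checking the arithmetic in the comment above (a sketch,
 * assuming the minimal RTO of 200 msec it cites): with exponential backoff
 * the waits before each of the 8 retries, plus the final wait, sum to
 *
 *      0.2 * (1 + 2 + 4 + ... + 2^8) = 0.2 * (2^9 - 1) ~= 102 sec,
 *
 * hence ">100 seconds".
 */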
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int retry_until;

        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                if (tp->retransmits)
                        dst_negative_advice(&sk->sk_dst_cache);
                retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
        } else {
                if (tp->retransmits >= sysctl_tcp_retries1) {
                        /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
                           black hole detection. :-(

                           This is the place to implement it. It is not
                           implemented, and I do not want to implement it. It
                           is disgusting and does not work in any case. Let me
                           cite the same draft, which requires us to implement
                           this:

   "The one security concern raised by this memo is that ICMP black holes
   are often caused by over-zealous security administrators who block
   all ICMP messages.  It is vitally important that those who design and
   deploy security systems understand the impact of strict filtering on
   upper-layer protocols.  The safest web site in the world is worthless
   if most TCP implementations cannot transfer data from it.  It would
   be far nicer to have all of the black holes fixed rather than fixing
   all of the TCP implementations."

                           Golden words :-).
                         */

                        dst_negative_advice(&sk->sk_dst_cache);
                }

                retry_until = sysctl_tcp_retries2;
                if (sock_flag(sk, SOCK_DEAD)) {
                        int alive = (tp->rto < TCP_RTO_MAX);

                        retry_until = tcp_orphan_retries(sk, alive);

                        if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
                                return 1;
                }
        }

        if (tp->retransmits >= retry_until) {
                /* Has it gone just too far? */
                tcp_write_err(sk);
                return 1;
        }
        return 0;
}
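
/* Editor's note, a rough sketch of the resulting timeout, assuming the
 * 2.6.9 default sysctl_tcp_retries2 = TCP_RETR2 = 15: with the RTO
 * doubling in tcp_retransmit_timer() and clamping at TCP_RTO_MAX
 * (120 sec), 15 retransmissions give up after roughly 13 to 30 minutes,
 * depending on the initial RTO.
 */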
static void tcp_delack_timer(unsigned long data)
{
        struct sock *sk = (struct sock *)data;
        struct tcp_opt *tp = tcp_sk(sk);

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                /* Try again later. */
                tp->ack.blocked = 1;
                NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
                sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
                goto out_unlock;
        }

        sk_stream_mem_reclaim(sk);

        if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
                goto out;

        if (time_after(tp->ack.timeout, jiffies)) {
                sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
                goto out;
        }
        tp->ack.pending &= ~TCP_ACK_TIMER;

        if (skb_queue_len(&tp->ucopy.prequeue)) {
                struct sk_buff *skb;

                NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
                                 skb_queue_len(&tp->ucopy.prequeue));

                while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
                        sk->sk_backlog_rcv(sk, skb);

                tp->ucopy.memory = 0;
        }

        if (tcp_ack_scheduled(tp)) {
                if (!tp->ack.pingpong) {
                        /* Delayed ACK missed: inflate ATO. */
                        tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
                } else {
                        /* Delayed ACK missed: leave pingpong mode and
                         * deflate ATO.
                         */
                        tp->ack.pingpong = 0;
                        tp->ack.ato = TCP_ATO_MIN;
                }
                tcp_send_ack(sk);
                NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
        }
        TCP_CHECK_TIMER(sk);

out:
        if (tcp_memory_pressure)
                sk_stream_mem_reclaim(sk);
out_unlock:
        bh_unlock_sock(sk);
        sock_put(sk);
}
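
/* Editor's note on the ATO handling above: outside pingpong (interactive)
 * mode a missed delayed ACK doubles tp->ack.ato, capped at the RTO;
 * in pingpong mode the miss instead leaves interactive mode and resets
 * the ACK timeout to TCP_ATO_MIN.
 */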
static void tcp_probe_timer(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int max_probes;

        if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) {
                tp->probes_out = 0;
                return;
        }

        /* *WARNING* RFC 1122 forbids this
         *
         * It doesn't AFAIK, because we kill the retransmit timer -AK
         *
         * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
         * this behaviour in Solaris down as a bug fix. [AC]
         *
         * Let me explain. probes_out is zeroed by incoming ACKs even if
         * they advertise a zero window. Hence, the connection is killed
         * only if we received no ACKs for the normal connection timeout.
         * It is not killed merely because the window stays zero for some
         * time; the window may be zero until armageddon and even later.
         * We are in full accordance with the RFCs, except that the probe
         * timer combines both the retransmission timeout and the probe
         * timeout in one bottle. --ANK
         */
        max_probes = sysctl_tcp_retries2;

        if (sock_flag(sk, SOCK_DEAD)) {
                int alive = ((tp->rto << tp->backoff) < TCP_RTO_MAX);

                max_probes = tcp_orphan_retries(sk, alive);

                if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
                        return;
        }

        if (tp->probes_out > max_probes) {
                tcp_write_err(sk);
        } else {
                /* Only send another probe if we didn't close things up. */
                tcp_send_probe0(sk);
        }
}
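
/* Editor's note: tcp_send_probe0() backs off via tp->backoff much like a
 * retransmission does, so under the default sysctl_tcp_retries2 an
 * unanswered zero-window probe sequence is aborted by tcp_write_err() on
 * roughly the same timescale as an ordinary retransmission timeout.
 */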
/*
 *      The TCP retransmit timer.
 */

static void tcp_retransmit_timer(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);

        if (!tcp_get_pcount(&tp->packets_out))
                goto out;

        BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));

        if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
            !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
                /* The receiver has dastardly shrunk the window. Our
                 * retransmits become zero-window probes, but we should not
                 * time out this connection. If the socket is an orphan,
                 * time it out; we cannot allow such beasts to hang
                 * infinitely.
                 */
#ifdef TCP_DEBUG
                if (net_ratelimit()) {
                        struct inet_opt *inet = inet_sk(sk);
                        printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
                               NIPQUAD(inet->daddr), htons(inet->dport),
                               inet->num, tp->snd_una, tp->snd_nxt);
                }
#endif
                if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
                        tcp_write_err(sk);
                        goto out;
                }
                tcp_enter_loss(sk, 0);
                tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
                __sk_dst_reset(sk);
                goto out_reset_timer;
        }

        if (tcp_write_timeout(sk))
                goto out;
        if (tp->retransmits == 0) {
                if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
                        if (tp->sack_ok) {
                                if (tp->ca_state == TCP_CA_Recovery)
                                        NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
                                else
                                        NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
                        } else {
                                if (tp->ca_state == TCP_CA_Recovery)
                                        NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
                                else
                                        NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
                        }
                } else if (tp->ca_state == TCP_CA_Loss) {
                        NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
                } else {
                        NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
                }
        }

        if (tcp_use_frto(sk)) {
                tcp_enter_frto(sk);
        } else {
                tcp_enter_loss(sk, 0);
        }

        if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
                /* Retransmission failed because of local congestion,
                 * do not backoff.
                 */
                if (!tp->retransmits)
                        tp->retransmits = 1;
                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
                                     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
                goto out;
        }
        /* Increase the timeout each time we retransmit. Note that
         * we do not increase the rtt estimate. rto is initialized
         * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
         * that doubling rto each time is the least we can get away with.
         * In KA9Q, Karn uses this for the first few times, and then
         * goes to quadratic. netBSD doubles, but only goes up to *64,
         * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
         * defined in the protocol as the maximum possible RTT. I guess
         * we'll have to use something other than TCP to talk to the
         * University of Mars.
         *
         * PAWS allows us longer timeouts and large windows, so once
         * implemented ftp to mars will work nicely. We will have to fix
         * the 120 second clamps though!
         */
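        /* Editor's note, the doubling below worked through (a sketch,
         * assuming an initial rto of TCP_RTO_MIN = HZ/5, i.e. 200 msec):
         *
         *      0.2s, 0.4s, 0.8s, ..., 51.2s, 102.4s,
         *
         * after which min(tp->rto << 1, TCP_RTO_MAX) clamps every further
         * timeout at TCP_RTO_MAX = 120 sec.
         */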
        tp->backoff++;
        tp->retransmits++;

out_reset_timer:
        tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
        if (tp->retransmits > sysctl_tcp_retries1)
                __sk_dst_reset(sk);

out:;
}
static void tcp_write_timer(unsigned long data)
{
        struct sock *sk = (struct sock *)data;
        struct tcp_opt *tp = tcp_sk(sk);
        int event;

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                /* Try again later */
                sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
                goto out_unlock;
        }

        if (sk->sk_state == TCP_CLOSE || !tp->pending)
                goto out;

        if (time_after(tp->timeout, jiffies)) {
                sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
                goto out;
        }

        event = tp->pending;
        tp->pending = 0;

        switch (event) {
        case TCP_TIME_RETRANS:
                tcp_retransmit_timer(sk);
                break;
        case TCP_TIME_PROBE0:
                tcp_probe_timer(sk);
                break;
        }
        TCP_CHECK_TIMER(sk);

out:
        sk_stream_mem_reclaim(sk);
out_unlock:
        bh_unlock_sock(sk);
        sock_put(sk);
}
/*
 *      Timer for listening sockets
 */

static void tcp_synack_timer(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct tcp_listen_opt *lopt = tp->listen_opt;
        int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
        int thresh = max_retries;
        unsigned long now = jiffies;
        struct open_request **reqp, *req;
        int i, budget;

        if (lopt == NULL || lopt->qlen == 0)
                return;
        /* Normally all the openreqs are young and become mature
         * (i.e. converted to an established socket) within the first
         * timeout. If a synack was not acknowledged for 3 seconds, it
         * means one of the following things: the synack was lost, the ack
         * was lost, the rtt is high, or nobody planned to ack (i.e. a
         * synflood). When the server is a bit loaded, the queue is
         * populated with old open requests, reducing the effective size
         * of the queue. When the server is well loaded, the queue size
         * reduces to zero after several minutes of work. This is not a
         * synflood, it is normal operation. The solution is to prune
         * entries that are too old, overriding the normal timeout, when
         * the situation becomes dangerous.
         *
         * Essentially, we reserve half of the room for young embryos,
         * and we abort old ones without pity if they are about to clog
         * our table.
         */
        if (lopt->qlen >> (lopt->max_qlen_log - 1)) {
                int young = (lopt->qlen_young << 1);

                while (thresh > 2) {
                        if (lopt->qlen < young)
                                break;
                        thresh--;
                        young <<= 1;
                }
        }
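        /* Editor's note, a worked example of the loop above, assuming
         * max_qlen_log = 9 (queue limit 512) and the default
         * sysctl_tcp_synack_retries = 5: a queue of qlen = 256 holding
         * qlen_young = 32 fresh requests passes the half-full test and
         * decrements thresh for young = 64, 128 and 256, leaving
         * thresh = 2, so old requests get only two SYNACK
         * retransmissions before being dropped.
         */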
        if (tp->defer_accept)
                max_retries = tp->defer_accept;
        budget = 2 * (TCP_SYNQ_HSIZE / (TCP_TIMEOUT_INIT / TCP_SYNQ_INTERVAL));
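        /* Editor's note on the budget arithmetic, assuming the 2.6.9
         * defaults TCP_SYNQ_HSIZE = 512, TCP_TIMEOUT_INIT = 3*HZ and
         * TCP_SYNQ_INTERVAL = HZ/5: budget = 2 * (512 / 15) = 68 hash
         * buckets per run, so the whole table is swept in about eight
         * 200 msec intervals, comfortably inside the 3 second initial
         * retransmission timeout.
         */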
        i = lopt->clock_hand;

        do {
                reqp = &lopt->syn_table[i];
                while ((req = *reqp) != NULL) {
                        if (time_after_eq(now, req->expires)) {
                                if ((req->retrans < thresh ||
                                     (req->acked && req->retrans < max_retries))
                                    && !req->class->rtx_syn_ack(sk, req, NULL)) {
                                        unsigned long timeo;

                                        if (req->retrans++ == 0)
                                                lopt->qlen_young--;
                                        timeo = min((TCP_TIMEOUT_INIT << req->retrans),
                                                    TCP_RTO_MAX);
                                        req->expires = now + timeo;
                                        reqp = &req->dl_next;
                                        continue;
                                }

                                /* Drop this request */
                                write_lock(&tp->syn_wait_lock);
                                *reqp = req->dl_next;
                                write_unlock(&tp->syn_wait_lock);
                                lopt->qlen--;
                                if (req->retrans == 0)
                                        lopt->qlen_young--;
                                tcp_openreq_free(req);
                                continue;
                        }
                        reqp = &req->dl_next;
                }

                i = (i + 1) & (TCP_SYNQ_HSIZE - 1);

        } while (--budget > 0);

        lopt->clock_hand = i;

        if (lopt->qlen)
                tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}
void tcp_delete_keepalive_timer(struct sock *sk)
{
        sk_stop_timer(sk, &sk->sk_timer);
}

void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
        sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}

void tcp_set_keepalive(struct sock *sk, int val)
{
        if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
                return;

        if (val && !sock_flag(sk, SOCK_KEEPOPEN))
                tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
        else if (!val)
                tcp_delete_keepalive_timer(sk);
}
static void tcp_keepalive_timer(unsigned long data)
{
        struct sock *sk = (struct sock *)data;
        struct tcp_opt *tp = tcp_sk(sk);
        __u32 elapsed;

        /* Only process if socket is not in use. */
        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                /* Try again later. */
                tcp_reset_keepalive_timer(sk, HZ/20);
                goto out;
        }

        if (sk->sk_state == TCP_LISTEN) {
                tcp_synack_timer(sk);
                goto out;
        }

        if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
                if (tp->linger2 >= 0) {
                        int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;

                        if (tmo > 0) {
                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                                goto out;
                        }
                }
                tcp_send_active_reset(sk, GFP_ATOMIC);
                goto death;
        }

        if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
                goto out;

        elapsed = keepalive_time_when(tp);

        /* It is alive without keepalive 8) */
        if (tcp_get_pcount(&tp->packets_out) || sk->sk_send_head)
                goto resched;

        elapsed = tcp_time_stamp - tp->rcv_tstamp;

        if (elapsed >= keepalive_time_when(tp)) {
                if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
                    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
                        tcp_send_active_reset(sk, GFP_ATOMIC);
                        tcp_write_err(sk);
                        goto out;
                }
                if (tcp_write_wakeup(sk) <= 0) {
                        tp->probes_out++;
                        elapsed = keepalive_intvl_when(tp);
                } else {
                        /* If keepalive was lost due to local congestion,
                         * try harder.
                         */
                        elapsed = TCP_RESOURCE_PROBE_INTERVAL;
                }
        } else {
                /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
                elapsed = keepalive_time_when(tp) - elapsed;
        }

        TCP_CHECK_TIMER(sk);
        sk_stream_mem_reclaim(sk);

resched:
        tcp_reset_keepalive_timer(sk, elapsed);
        goto out;

death:
        tcp_done(sk);

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
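
/* Editor's note, the default keepalive timeline implied above (a sketch,
 * assuming TCP_KEEPALIVE_TIME = 7200 sec, TCP_KEEPALIVE_INTVL = 75 sec
 * and TCP_KEEPALIVE_PROBES = 9): an idle connection is probed after two
 * hours, then every 75 seconds; a peer that answers none of the 9 probes
 * is reset roughly 7200 + 9*75 ~= 7875 sec (about 2h11m) after its last
 * segment.
 */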
EXPORT_SYMBOL(tcp_clear_xmit_timers);
EXPORT_SYMBOL(tcp_delete_keepalive_timer);
EXPORT_SYMBOL(tcp_init_xmit_timers);
EXPORT_SYMBOL(tcp_reset_keepalive_timer);