Linux-2.6.12-rc2
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / net / ipv4 / tcp_timer.c
blob85b279f1e935b0e2d5870485d1003287be8848d1
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
23 #include <linux/module.h>
24 #include <net/tcp.h>
26 int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
27 int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
28 int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
29 int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
30 int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
31 int sysctl_tcp_retries1 = TCP_RETR1;
32 int sysctl_tcp_retries2 = TCP_RETR2;
33 int sysctl_tcp_orphan_retries;
35 static void tcp_write_timer(unsigned long);
36 static void tcp_delack_timer(unsigned long);
37 static void tcp_keepalive_timer (unsigned long data);
39 #ifdef TCP_DEBUG
40 const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
41 EXPORT_SYMBOL(tcp_timer_bug_msg);
42 #endif
45 * Using different timers for retransmit, delayed acks and probes
46 * We may wish use just one timer maintaining a list of expire jiffies
47 * to optimize.
50 void tcp_init_xmit_timers(struct sock *sk)
52 struct tcp_sock *tp = tcp_sk(sk);
54 init_timer(&tp->retransmit_timer);
55 tp->retransmit_timer.function=&tcp_write_timer;
56 tp->retransmit_timer.data = (unsigned long) sk;
57 tp->pending = 0;
59 init_timer(&tp->delack_timer);
60 tp->delack_timer.function=&tcp_delack_timer;
61 tp->delack_timer.data = (unsigned long) sk;
62 tp->ack.pending = 0;
64 init_timer(&sk->sk_timer);
65 sk->sk_timer.function = &tcp_keepalive_timer;
66 sk->sk_timer.data = (unsigned long)sk;
69 void tcp_clear_xmit_timers(struct sock *sk)
71 struct tcp_sock *tp = tcp_sk(sk);
73 tp->pending = 0;
74 sk_stop_timer(sk, &tp->retransmit_timer);
76 tp->ack.pending = 0;
77 tp->ack.blocked = 0;
78 sk_stop_timer(sk, &tp->delack_timer);
80 sk_stop_timer(sk, &sk->sk_timer);
83 static void tcp_write_err(struct sock *sk)
85 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
86 sk->sk_error_report(sk);
88 tcp_done(sk);
89 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
92 /* Do not allow orphaned sockets to eat all our resources.
93 * This is direct violation of TCP specs, but it is required
94 * to prevent DoS attacks. It is called when a retransmission timeout
95 * or zero probe timeout occurs on orphaned socket.
97 * Criterium is still not confirmed experimentally and may change.
98 * We kill the socket, if:
99 * 1. If number of orphaned sockets exceeds an administratively configured
100 * limit.
101 * 2. If we have strong memory pressure.
103 static int tcp_out_of_resources(struct sock *sk, int do_reset)
105 struct tcp_sock *tp = tcp_sk(sk);
106 int orphans = atomic_read(&tcp_orphan_count);
108 /* If peer does not open window for long time, or did not transmit
109 * anything for long time, penalize it. */
110 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
111 orphans <<= 1;
113 /* If some dubious ICMP arrived, penalize even more. */
114 if (sk->sk_err_soft)
115 orphans <<= 1;
117 if (orphans >= sysctl_tcp_max_orphans ||
118 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
119 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
120 if (net_ratelimit())
121 printk(KERN_INFO "Out of socket memory\n");
123 /* Catch exceptional cases, when connection requires reset.
124 * 1. Last segment was sent recently. */
125 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
126 /* 2. Window is closed. */
127 (!tp->snd_wnd && !tp->packets_out))
128 do_reset = 1;
129 if (do_reset)
130 tcp_send_active_reset(sk, GFP_ATOMIC);
131 tcp_done(sk);
132 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
133 return 1;
135 return 0;
138 /* Calculate maximal number or retries on an orphaned socket. */
139 static int tcp_orphan_retries(struct sock *sk, int alive)
141 int retries = sysctl_tcp_orphan_retries; /* May be zero. */
143 /* We know from an ICMP that something is wrong. */
144 if (sk->sk_err_soft && !alive)
145 retries = 0;
147 /* However, if socket sent something recently, select some safe
148 * number of retries. 8 corresponds to >100 seconds with minimal
149 * RTO of 200msec. */
150 if (retries == 0 && alive)
151 retries = 8;
152 return retries;
155 /* A write timeout has occurred. Process the after effects. */
156 static int tcp_write_timeout(struct sock *sk)
158 struct tcp_sock *tp = tcp_sk(sk);
159 int retry_until;
161 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
162 if (tp->retransmits)
163 dst_negative_advice(&sk->sk_dst_cache);
164 retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
165 } else {
166 if (tp->retransmits >= sysctl_tcp_retries1) {
167 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
168 hole detection. :-(
170 It is place to make it. It is not made. I do not want
171 to make it. It is disguisting. It does not work in any
172 case. Let me to cite the same draft, which requires for
173 us to implement this:
175 "The one security concern raised by this memo is that ICMP black holes
176 are often caused by over-zealous security administrators who block
177 all ICMP messages. It is vitally important that those who design and
178 deploy security systems understand the impact of strict filtering on
179 upper-layer protocols. The safest web site in the world is worthless
180 if most TCP implementations cannot transfer data from it. It would
181 be far nicer to have all of the black holes fixed rather than fixing
182 all of the TCP implementations."
184 Golden words :-).
187 dst_negative_advice(&sk->sk_dst_cache);
190 retry_until = sysctl_tcp_retries2;
191 if (sock_flag(sk, SOCK_DEAD)) {
192 int alive = (tp->rto < TCP_RTO_MAX);
194 retry_until = tcp_orphan_retries(sk, alive);
196 if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
197 return 1;
201 if (tp->retransmits >= retry_until) {
202 /* Has it gone just too far? */
203 tcp_write_err(sk);
204 return 1;
206 return 0;
209 static void tcp_delack_timer(unsigned long data)
211 struct sock *sk = (struct sock*)data;
212 struct tcp_sock *tp = tcp_sk(sk);
214 bh_lock_sock(sk);
215 if (sock_owned_by_user(sk)) {
216 /* Try again later. */
217 tp->ack.blocked = 1;
218 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
219 sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
220 goto out_unlock;
223 sk_stream_mem_reclaim(sk);
225 if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
226 goto out;
228 if (time_after(tp->ack.timeout, jiffies)) {
229 sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
230 goto out;
232 tp->ack.pending &= ~TCP_ACK_TIMER;
234 if (skb_queue_len(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb;
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
238 skb_queue_len(&tp->ucopy.prequeue));
240 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241 sk->sk_backlog_rcv(sk, skb);
243 tp->ucopy.memory = 0;
246 if (tcp_ack_scheduled(tp)) {
247 if (!tp->ack.pingpong) {
248 /* Delayed ACK missed: inflate ATO. */
249 tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
250 } else {
251 /* Delayed ACK missed: leave pingpong mode and
252 * deflate ATO.
254 tp->ack.pingpong = 0;
255 tp->ack.ato = TCP_ATO_MIN;
257 tcp_send_ack(sk);
258 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
260 TCP_CHECK_TIMER(sk);
262 out:
263 if (tcp_memory_pressure)
264 sk_stream_mem_reclaim(sk);
265 out_unlock:
266 bh_unlock_sock(sk);
267 sock_put(sk);
270 static void tcp_probe_timer(struct sock *sk)
272 struct tcp_sock *tp = tcp_sk(sk);
273 int max_probes;
275 if (tp->packets_out || !sk->sk_send_head) {
276 tp->probes_out = 0;
277 return;
280 /* *WARNING* RFC 1122 forbids this
282 * It doesn't AFAIK, because we kill the retransmit timer -AK
284 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
285 * this behaviour in Solaris down as a bug fix. [AC]
287 * Let me to explain. probes_out is zeroed by incoming ACKs
288 * even if they advertise zero window. Hence, connection is killed only
289 * if we received no ACKs for normal connection timeout. It is not killed
290 * only because window stays zero for some time, window may be zero
291 * until armageddon and even later. We are in full accordance
292 * with RFCs, only probe timer combines both retransmission timeout
293 * and probe timeout in one bottle. --ANK
295 max_probes = sysctl_tcp_retries2;
297 if (sock_flag(sk, SOCK_DEAD)) {
298 int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
300 max_probes = tcp_orphan_retries(sk, alive);
302 if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
303 return;
306 if (tp->probes_out > max_probes) {
307 tcp_write_err(sk);
308 } else {
309 /* Only send another probe if we didn't close things up. */
310 tcp_send_probe0(sk);
315 * The TCP retransmit timer.
318 static void tcp_retransmit_timer(struct sock *sk)
320 struct tcp_sock *tp = tcp_sk(sk);
322 if (!tp->packets_out)
323 goto out;
325 BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
327 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
328 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
329 /* Receiver dastardly shrinks window. Our retransmits
330 * become zero probes, but we should not timeout this
331 * connection. If the socket is an orphan, time it out,
332 * we cannot allow such beasts to hang infinitely.
334 #ifdef TCP_DEBUG
335 if (net_ratelimit()) {
336 struct inet_sock *inet = inet_sk(sk);
337 printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
338 NIPQUAD(inet->daddr), htons(inet->dport),
339 inet->num, tp->snd_una, tp->snd_nxt);
341 #endif
342 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
343 tcp_write_err(sk);
344 goto out;
346 tcp_enter_loss(sk, 0);
347 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
348 __sk_dst_reset(sk);
349 goto out_reset_timer;
352 if (tcp_write_timeout(sk))
353 goto out;
355 if (tp->retransmits == 0) {
356 if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
357 if (tp->rx_opt.sack_ok) {
358 if (tp->ca_state == TCP_CA_Recovery)
359 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
360 else
361 NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
362 } else {
363 if (tp->ca_state == TCP_CA_Recovery)
364 NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
365 else
366 NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
368 } else if (tp->ca_state == TCP_CA_Loss) {
369 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
370 } else {
371 NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
375 if (tcp_use_frto(sk)) {
376 tcp_enter_frto(sk);
377 } else {
378 tcp_enter_loss(sk, 0);
381 if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
382 /* Retransmission failed because of local congestion,
383 * do not backoff.
385 if (!tp->retransmits)
386 tp->retransmits=1;
387 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
388 min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
389 goto out;
392 /* Increase the timeout each time we retransmit. Note that
393 * we do not increase the rtt estimate. rto is initialized
394 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
395 * that doubling rto each time is the least we can get away with.
396 * In KA9Q, Karn uses this for the first few times, and then
397 * goes to quadratic. netBSD doubles, but only goes up to *64,
398 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
399 * defined in the protocol as the maximum possible RTT. I guess
400 * we'll have to use something other than TCP to talk to the
401 * University of Mars.
403 * PAWS allows us longer timeouts and large windows, so once
404 * implemented ftp to mars will work nicely. We will have to fix
405 * the 120 second clamps though!
407 tp->backoff++;
408 tp->retransmits++;
410 out_reset_timer:
411 tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
412 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
413 if (tp->retransmits > sysctl_tcp_retries1)
414 __sk_dst_reset(sk);
416 out:;
419 static void tcp_write_timer(unsigned long data)
421 struct sock *sk = (struct sock*)data;
422 struct tcp_sock *tp = tcp_sk(sk);
423 int event;
425 bh_lock_sock(sk);
426 if (sock_owned_by_user(sk)) {
427 /* Try again later */
428 sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
429 goto out_unlock;
432 if (sk->sk_state == TCP_CLOSE || !tp->pending)
433 goto out;
435 if (time_after(tp->timeout, jiffies)) {
436 sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
437 goto out;
440 event = tp->pending;
441 tp->pending = 0;
443 switch (event) {
444 case TCP_TIME_RETRANS:
445 tcp_retransmit_timer(sk);
446 break;
447 case TCP_TIME_PROBE0:
448 tcp_probe_timer(sk);
449 break;
451 TCP_CHECK_TIMER(sk);
453 out:
454 sk_stream_mem_reclaim(sk);
455 out_unlock:
456 bh_unlock_sock(sk);
457 sock_put(sk);
461 * Timer for listening sockets
464 static void tcp_synack_timer(struct sock *sk)
466 struct tcp_sock *tp = tcp_sk(sk);
467 struct tcp_listen_opt *lopt = tp->listen_opt;
468 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
469 int thresh = max_retries;
470 unsigned long now = jiffies;
471 struct open_request **reqp, *req;
472 int i, budget;
474 if (lopt == NULL || lopt->qlen == 0)
475 return;
477 /* Normally all the openreqs are young and become mature
478 * (i.e. converted to established socket) for first timeout.
479 * If synack was not acknowledged for 3 seconds, it means
480 * one of the following things: synack was lost, ack was lost,
481 * rtt is high or nobody planned to ack (i.e. synflood).
482 * When server is a bit loaded, queue is populated with old
483 * open requests, reducing effective size of queue.
484 * When server is well loaded, queue size reduces to zero
485 * after several minutes of work. It is not synflood,
486 * it is normal operation. The solution is pruning
487 * too old entries overriding normal timeout, when
488 * situation becomes dangerous.
490 * Essentially, we reserve half of room for young
491 * embrions; and abort old ones without pity, if old
492 * ones are about to clog our table.
494 if (lopt->qlen>>(lopt->max_qlen_log-1)) {
495 int young = (lopt->qlen_young<<1);
497 while (thresh > 2) {
498 if (lopt->qlen < young)
499 break;
500 thresh--;
501 young <<= 1;
505 if (tp->defer_accept)
506 max_retries = tp->defer_accept;
508 budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
509 i = lopt->clock_hand;
511 do {
512 reqp=&lopt->syn_table[i];
513 while ((req = *reqp) != NULL) {
514 if (time_after_eq(now, req->expires)) {
515 if ((req->retrans < thresh ||
516 (req->acked && req->retrans < max_retries))
517 && !req->class->rtx_syn_ack(sk, req, NULL)) {
518 unsigned long timeo;
520 if (req->retrans++ == 0)
521 lopt->qlen_young--;
522 timeo = min((TCP_TIMEOUT_INIT << req->retrans),
523 TCP_RTO_MAX);
524 req->expires = now + timeo;
525 reqp = &req->dl_next;
526 continue;
529 /* Drop this request */
530 write_lock(&tp->syn_wait_lock);
531 *reqp = req->dl_next;
532 write_unlock(&tp->syn_wait_lock);
533 lopt->qlen--;
534 if (req->retrans == 0)
535 lopt->qlen_young--;
536 tcp_openreq_free(req);
537 continue;
539 reqp = &req->dl_next;
542 i = (i+1)&(TCP_SYNQ_HSIZE-1);
544 } while (--budget > 0);
546 lopt->clock_hand = i;
548 if (lopt->qlen)
549 tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
552 void tcp_delete_keepalive_timer (struct sock *sk)
554 sk_stop_timer(sk, &sk->sk_timer);
557 void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
559 sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
562 void tcp_set_keepalive(struct sock *sk, int val)
564 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
565 return;
567 if (val && !sock_flag(sk, SOCK_KEEPOPEN))
568 tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
569 else if (!val)
570 tcp_delete_keepalive_timer(sk);
574 static void tcp_keepalive_timer (unsigned long data)
576 struct sock *sk = (struct sock *) data;
577 struct tcp_sock *tp = tcp_sk(sk);
578 __u32 elapsed;
580 /* Only process if socket is not in use. */
581 bh_lock_sock(sk);
582 if (sock_owned_by_user(sk)) {
583 /* Try again later. */
584 tcp_reset_keepalive_timer (sk, HZ/20);
585 goto out;
588 if (sk->sk_state == TCP_LISTEN) {
589 tcp_synack_timer(sk);
590 goto out;
593 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
594 if (tp->linger2 >= 0) {
595 int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
597 if (tmo > 0) {
598 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
599 goto out;
602 tcp_send_active_reset(sk, GFP_ATOMIC);
603 goto death;
606 if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
607 goto out;
609 elapsed = keepalive_time_when(tp);
611 /* It is alive without keepalive 8) */
612 if (tp->packets_out || sk->sk_send_head)
613 goto resched;
615 elapsed = tcp_time_stamp - tp->rcv_tstamp;
617 if (elapsed >= keepalive_time_when(tp)) {
618 if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
619 (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
620 tcp_send_active_reset(sk, GFP_ATOMIC);
621 tcp_write_err(sk);
622 goto out;
624 if (tcp_write_wakeup(sk) <= 0) {
625 tp->probes_out++;
626 elapsed = keepalive_intvl_when(tp);
627 } else {
628 /* If keepalive was lost due to local congestion,
629 * try harder.
631 elapsed = TCP_RESOURCE_PROBE_INTERVAL;
633 } else {
634 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
635 elapsed = keepalive_time_when(tp) - elapsed;
638 TCP_CHECK_TIMER(sk);
639 sk_stream_mem_reclaim(sk);
641 resched:
642 tcp_reset_keepalive_timer (sk, elapsed);
643 goto out;
645 death:
646 tcp_done(sk);
648 out:
649 bh_unlock_sock(sk);
650 sock_put(sk);
/* Timer helpers made available outside this file. */
EXPORT_SYMBOL(tcp_clear_xmit_timers);
EXPORT_SYMBOL(tcp_delete_keepalive_timer);
EXPORT_SYMBOL(tcp_init_xmit_timers);
EXPORT_SYMBOL(tcp_reset_keepalive_timer);