net/ipv4/tcp_timer.c

/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_timer.c,v 1.80 2000/10/03 07:29:01 anton Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>

int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries;
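
/* Each of the defaults above comes from the corresponding TCP_* constant in
 * <net/tcp.h>; all of them are exported as writable sysctls (for example
 * /proc/sys/net/ipv4/tcp_keepalive_time), so retry and keepalive behaviour
 * can be tuned at run time.  sysctl_tcp_orphan_retries deliberately defaults
 * to zero: tcp_orphan_retries() below substitutes a safe value whenever the
 * connection still looks alive.
 */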

static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";

/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
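
/* All three timers follow the same reference-counting convention: arming a
 * timer that was not already pending takes a reference on the socket
 * (mod_timer() returns 0 in that case, hence the sock_hold() calls below),
 * and a successful del_timer() on a pending timer drops it again via
 * __sock_put().  This keeps the socket alive for as long as one of its
 * timers may still fire.
 */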

void tcp_init_xmit_timers(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        init_timer(&tp->retransmit_timer);
        tp->retransmit_timer.function = &tcp_write_timer;
        tp->retransmit_timer.data = (unsigned long) sk;
        tp->pending = 0;

        init_timer(&tp->delack_timer);
        tp->delack_timer.function = &tcp_delack_timer;
        tp->delack_timer.data = (unsigned long) sk;
        tp->ack.pending = 0;

        init_timer(&sk->timer);
        sk->timer.function = &tcp_keepalive_timer;
        sk->timer.data = (unsigned long) sk;
}

void tcp_clear_xmit_timers(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        tp->pending = 0;
        if (timer_pending(&tp->retransmit_timer) &&
            del_timer(&tp->retransmit_timer))
                __sock_put(sk);

        tp->ack.pending = 0;
        tp->ack.blocked = 0;
        if (timer_pending(&tp->delack_timer) &&
            del_timer(&tp->delack_timer))
                __sock_put(sk);

        if (timer_pending(&sk->timer) && del_timer(&sk->timer))
                __sock_put(sk);
}

static void tcp_write_err(struct sock *sk)
{
        sk->err = sk->err_soft ? : ETIMEDOUT;
        sk->error_report(sk);

        tcp_done(sk);
        NET_INC_STATS_BH(TCPAbortOnTimeout);
}

/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero probe timeout occurs on an orphaned socket.
 *
 * The criterion is still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. the number of orphaned sockets exceeds an administratively configured
 *    limit, or
 * 2. we are under strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int orphans = atomic_read(&tcp_orphan_count);

        /* If peer does not open window for long time, or did not transmit
         * anything for long time, penalize it. */
        if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
                orphans <<= 1;

        /* If some dubious ICMP arrived, penalize even more. */
        if (sk->err_soft)
                orphans <<= 1;

        if (orphans >= sysctl_tcp_max_orphans ||
            (sk->wmem_queued > SOCK_MIN_SNDBUF &&
             atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
                if (net_ratelimit())
                        printk(KERN_INFO "Out of socket memory\n");

                /* Catch exceptional cases, when connection requires reset.
                 *      1. Last segment was sent recently. */
                if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
                    /*  2. Window is closed. */
                    (!tp->snd_wnd && !tp->packets_out))
                        do_reset = 1;
                if (do_reset)
                        tcp_send_active_reset(sk, GFP_ATOMIC);
                tcp_done(sk);
                NET_INC_STATS_BH(TCPAbortOnMemory);
                return 1;
        }
        return 0;
}
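
/* Note on the shifts above: doubling the observed orphan count is a cheap way
 * of halving the effective sysctl_tcp_max_orphans limit for sockets that look
 * suspicious (long-idle peer, caller not asking for a reset, or a pending soft
 * ICMP error), so such orphans are reclaimed earlier under pressure.
 */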

/* Calculate maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
        int retries = sysctl_tcp_orphan_retries; /* May be zero. */

        /* We know from an ICMP that something is wrong. */
        if (sk->err_soft && !alive)
                retries = 0;

        /* However, if socket sent something recently, select some safe
         * number of retries. 8 corresponds to >100 seconds with minimal
         * RTO of 200msec. */
        if (retries == 0 && alive)
                retries = 8;
        return retries;
}
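
/* Rough arithmetic behind the "8 retries" above: with the 200 msec minimum
 * RTO assumed in that comment and the doubling applied in
 * tcp_retransmit_timer(), retransmissions go out roughly 0.2, 0.4, 0.8, ...,
 * 51.2 seconds apart, so an orphan is only given up on after about
 * 0.2 * (2^9 - 1) ~= 102 seconds without progress (memory pressure aside).
 */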

/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int retry_until;

        if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
                if (tp->retransmits)
                        dst_negative_advice(&sk->dst_cache);
                retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
        } else {
                if (tp->retransmits >= sysctl_tcp_retries1) {
                        /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires PMTU black
                           hole detection. :-(

                           This is the place to implement it. It is not implemented.
                           I do not want to implement it. It is disgusting. It does
                           not work in any case. Let me cite the very draft that
                           requires us to implement it:

   "The one security concern raised by this memo is that ICMP black holes
   are often caused by over-zealous security administrators who block
   all ICMP messages.  It is vitally important that those who design and
   deploy security systems understand the impact of strict filtering on
   upper-layer protocols.  The safest web site in the world is worthless
   if most TCP implementations cannot transfer data from it.  It would
   be far nicer to have all of the black holes fixed rather than fixing
   all of the TCP implementations."

                           Golden words :-).
                         */

                        dst_negative_advice(&sk->dst_cache);
                }

                retry_until = sysctl_tcp_retries2;
                if (sk->dead) {
                        int alive = (tp->rto < TCP_RTO_MAX);

                        retry_until = tcp_orphan_retries(sk, alive);

                        if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
                                return 1;
                }
        }

        if (tp->retransmits >= retry_until) {
                /* Has it gone just too far? */
                tcp_write_err(sk);
                return 1;
        }
        return 0;
}
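
/* The two retry sysctls above play different roles: crossing
 * sysctl_tcp_retries1 (a soft threshold) only triggers dst_negative_advice()
 * so that a possibly stale route gets re-looked up, while sysctl_tcp_retries2
 * (or tcp_orphan_retries() for dead sockets) is the hard limit past which
 * tcp_write_err() aborts the connection.
 */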

static void tcp_delack_timer(unsigned long data)
{
        struct sock *sk = (struct sock*)data;
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        bh_lock_sock(sk);
        if (sk->lock.users) {
                /* Try again later. */
                tp->ack.blocked = 1;
                NET_INC_STATS_BH(DelayedACKLocked);
                if (!mod_timer(&tp->delack_timer, jiffies + TCP_DELACK_MIN))
                        sock_hold(sk);
                goto out_unlock;
        }

        tcp_mem_reclaim(sk);

        if (sk->state == TCP_CLOSE || !(tp->ack.pending&TCP_ACK_TIMER))
                goto out;

        if ((long)(tp->ack.timeout - jiffies) > 0) {
                if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
                        sock_hold(sk);
                goto out;
        }
        tp->ack.pending &= ~TCP_ACK_TIMER;

        if (skb_queue_len(&tp->ucopy.prequeue)) {
                struct sk_buff *skb;

                net_statistics[smp_processor_id()*2].TCPSchedulerFailed += skb_queue_len(&tp->ucopy.prequeue);

                while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
                        sk->backlog_rcv(sk, skb);

                tp->ucopy.memory = 0;
        }

        if (tcp_ack_scheduled(tp)) {
                if (!tp->ack.pingpong) {
                        /* Delayed ACK missed: inflate ATO. */
                        tp->ack.ato = min(tp->ack.ato<<1, tp->rto);
                } else {
                        /* Delayed ACK missed: leave pingpong mode and
                         * deflate ATO.
                         */
                        tp->ack.pingpong = 0;
                        tp->ack.ato = TCP_ATO_MIN;
                }
                tcp_send_ack(sk);
                NET_INC_STATS_BH(DelayedACKs);
        }
        TCP_CHECK_TIMER(sk);

out:
        if (tcp_memory_pressure)
                tcp_mem_reclaim(sk);
out_unlock:
        bh_unlock_sock(sk);
        sock_put(sk);
}

static void tcp_probe_timer(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int max_probes;

        if (tp->packets_out || !tp->send_head) {
                tp->probes_out = 0;
                return;
        }

        /* *WARNING* RFC 1122 forbids this
         *
         * It doesn't AFAIK, because we kill the retransmit timer -AK
         *
         * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
         * this behaviour in Solaris down as a bug fix. [AC]
         *
         * Let me explain. probes_out is zeroed by incoming ACKs, even if
         * they advertise a zero window. Hence, the connection is killed only
         * if we received no ACKs for the normal connection timeout; it is
         * not killed merely because the window stays zero for some time,
         * as the window may stay zero until armageddon and even later.
         * We are in full accordance with the RFCs; it is just that the probe
         * timer combines the retransmission timeout and the probe timeout
         * in one bottle.                               --ANK
         */
        max_probes = sysctl_tcp_retries2;

        if (sk->dead) {
                int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);

                max_probes = tcp_orphan_retries(sk, alive);

                if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
                        return;
        }

        if (tp->probes_out > max_probes) {
                tcp_write_err(sk);
        } else {
                /* Only send another probe if we didn't close things up. */
                tcp_send_probe0(sk);
        }
}

/*
 *      The TCP retransmit timer.
 */

static void tcp_retransmit_timer(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        if (tp->packets_out == 0)
                goto out;

        BUG_TRAP(!skb_queue_empty(&sk->write_queue));

        if (tcp_write_timeout(sk))
                goto out;

        if (tp->retransmits == 0) {
                if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
                        if (tp->sack_ok) {
                                if (tp->ca_state == TCP_CA_Recovery)
                                        NET_INC_STATS_BH(TCPSackRecoveryFail);
                                else
                                        NET_INC_STATS_BH(TCPSackFailures);
                        } else {
                                if (tp->ca_state == TCP_CA_Recovery)
                                        NET_INC_STATS_BH(TCPRenoRecoveryFail);
                                else
                                        NET_INC_STATS_BH(TCPRenoFailures);
                        }
                } else if (tp->ca_state == TCP_CA_Loss) {
                        NET_INC_STATS_BH(TCPLossFailures);
                } else {
                        NET_INC_STATS_BH(TCPTimeouts);
                }
        }

        tcp_enter_loss(sk, 0);

        if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
                /* Retransmission failed because of local congestion,
                 * do not backoff.
                 */
                if (!tp->retransmits)
                        tp->retransmits = 1;
                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
                                     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
                goto out;
        }

        /* Increase the timeout each time we retransmit.  Note that
         * we do not increase the rtt estimate.  rto is initialized
         * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
         * that doubling rto each time is the least we can get away with.
         * In KA9Q, Karn uses this for the first few times, and then
         * goes to quadratic.  netBSD doubles, but only goes up to *64,
         * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
         * defined in the protocol as the maximum possible RTT.  I guess
         * we'll have to use something other than TCP to talk to the
         * University of Mars.
         *
         * PAWS allows us longer timeouts and large windows, so once
         * implemented ftp to mars will work nicely. We will have to fix
         * the 120 second clamps though!
         */
        tp->backoff++;
        tp->retransmits++;
        tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
        if (tp->retransmits > sysctl_tcp_retries1)
                __sk_dst_reset(sk);

out:;
}
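
/* Illustration of the backoff above, assuming the usual TCP_TIMEOUT_INIT of
 * 3 seconds and TCP_RTO_MAX of 120 seconds: successive timeouts run
 * 3, 6, 12, 24, 48, 96, 120, 120, ... seconds.  tp->backoff, incremented
 * alongside the rto doubling, is what tcp_probe_timer() above consults
 * (tp->rto << tp->backoff) when deciding whether an orphan still counts
 * as alive.
 */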

static void tcp_write_timer(unsigned long data)
{
        struct sock *sk = (struct sock*)data;
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int event;

        bh_lock_sock(sk);
        if (sk->lock.users) {
                /* Try again later */
                if (!mod_timer(&tp->retransmit_timer, jiffies + (HZ/20)))
                        sock_hold(sk);
                goto out_unlock;
        }

        if (sk->state == TCP_CLOSE || !tp->pending)
                goto out;

        if ((long)(tp->timeout - jiffies) > 0) {
                if (!mod_timer(&tp->retransmit_timer, tp->timeout))
                        sock_hold(sk);
                goto out;
        }

        event = tp->pending;
        tp->pending = 0;

        switch (event) {
        case TCP_TIME_RETRANS:
                tcp_retransmit_timer(sk);
                break;
        case TCP_TIME_PROBE0:
                tcp_probe_timer(sk);
                break;
        }
        TCP_CHECK_TIMER(sk);

out:
        tcp_mem_reclaim(sk);
out_unlock:
        bh_unlock_sock(sk);
        sock_put(sk);
}

/*
 *      Timer for listening sockets
 */

static void tcp_synack_timer(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct tcp_listen_opt *lopt = tp->listen_opt;
        int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
        int thresh = max_retries;
        unsigned long now = jiffies;
        struct open_request **reqp, *req;
        int i, budget;

        if (lopt == NULL || lopt->qlen == 0)
                return;

        /* Normally all the openreqs are young and become mature
         * (i.e. converted to an established socket) within the first timeout.
         * If a synack is not acknowledged for 3 seconds, it means
         * one of the following: the synack was lost, the ack was lost,
         * the rtt is high, or nobody ever planned to ack (i.e. a synflood).
         * When the server is a bit loaded, the queue is populated with old
         * open requests, reducing the effective size of the queue.
         * When the server is well loaded, the queue size reduces to zero
         * after several minutes of work. That is not a synflood,
         * it is normal operation. The solution is to prune entries that are
         * too old, overriding the normal timeout, when the situation
         * becomes dangerous.
         *
         * Essentially, we reserve half of the room for young
         * embryos and abort old ones without pity, if the old
         * ones are about to clog our table.
         */
        if (lopt->qlen>>(lopt->max_qlen_log-1)) {
                int young = (lopt->qlen_young<<1);

                while (thresh > 2) {
                        if (lopt->qlen < young)
                                break;
                        thresh--;
                        young <<= 1;
                }
        }

        if (tp->defer_accept)
                max_retries = tp->defer_accept;

        budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
        i = lopt->clock_hand;

        do {
                reqp = &lopt->syn_table[i];
                while ((req = *reqp) != NULL) {
                        if ((long)(now - req->expires) >= 0) {
                                if ((req->retrans < thresh ||
                                     (req->acked && req->retrans < max_retries))
                                    && !req->class->rtx_syn_ack(sk, req, NULL)) {
                                        unsigned long timeo;

                                        if (req->retrans++ == 0)
                                                lopt->qlen_young--;
                                        timeo = min((TCP_TIMEOUT_INIT << req->retrans),
                                                    TCP_RTO_MAX);
                                        req->expires = now + timeo;
                                        reqp = &req->dl_next;
                                        continue;
                                }

                                /* Drop this request */
                                write_lock(&tp->syn_wait_lock);
                                *reqp = req->dl_next;
                                write_unlock(&tp->syn_wait_lock);
                                lopt->qlen--;
                                if (req->retrans == 0)
                                        lopt->qlen_young--;
                                tcp_openreq_free(req);
                                continue;
                        }
                        reqp = &req->dl_next;
                }

                i = (i+1)&(TCP_SYNQ_HSIZE-1);

        } while (--budget > 0);

        lopt->clock_hand = i;

        if (lopt->qlen)
                tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}
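
/* The scan budget above is chosen so that the whole TCP_SYNQ_HSIZE hash
 * table is swept about twice per TCP_TIMEOUT_INIT: the timer re-arms every
 * TCP_SYNQ_INTERVAL, and each run visits
 * 2*TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL) buckets, so a given
 * open_request is examined roughly twice per initial retransmission period.
 */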

void tcp_delete_keepalive_timer (struct sock *sk)
{
        if (timer_pending(&sk->timer) && del_timer (&sk->timer))
                __sock_put(sk);
}

void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
{
        if (!mod_timer(&sk->timer, jiffies+len))
                sock_hold(sk);
}

void tcp_set_keepalive(struct sock *sk, int val)
{
        if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))
                return;

        if (val && !sk->keepopen)
                tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
        else if (!val)
                tcp_delete_keepalive_timer(sk);
}
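
/* With the compiled-in defaults (TCP_KEEPALIVE_TIME of two hours,
 * TCP_KEEPALIVE_INTVL of 75 seconds and TCP_KEEPALIVE_PROBES of 9, all
 * overridable through the sysctls at the top of this file or per socket via
 * the TCP_KEEPIDLE/TCP_KEEPINTVL/TCP_KEEPCNT options), an idle connection
 * with SO_KEEPALIVE set is first probed after two hours and reset roughly
 * eleven minutes later if none of the probes is answered.
 */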

static void tcp_keepalive_timer (unsigned long data)
{
        struct sock *sk = (struct sock *) data;
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        __u32 elapsed;

        /* Only process if socket is not in use. */
        bh_lock_sock(sk);
        if (sk->lock.users) {
                /* Try again later. */
                tcp_reset_keepalive_timer (sk, HZ/20);
                goto out;
        }

        if (sk->state == TCP_LISTEN) {
                tcp_synack_timer(sk);
                goto out;
        }

        if (sk->state == TCP_FIN_WAIT2 && sk->dead) {
                if (tp->linger2 >= 0) {
                        int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;

                        if (tmo > 0) {
                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                                goto out;
                        }
                }
                tcp_send_active_reset(sk, GFP_ATOMIC);
                goto death;
        }

        if (!sk->keepopen || sk->state == TCP_CLOSE)
                goto out;

        elapsed = keepalive_time_when(tp);

        /* It is alive without keepalive 8) */
        if (tp->packets_out || tp->send_head)
                goto resched;

        elapsed = tcp_time_stamp - tp->rcv_tstamp;

        if (elapsed >= keepalive_time_when(tp)) {
                if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
                    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
                        tcp_send_active_reset(sk, GFP_ATOMIC);
                        tcp_write_err(sk);
                        goto out;
                }
                if (tcp_write_wakeup(sk) <= 0) {
                        tp->probes_out++;
                        elapsed = keepalive_intvl_when(tp);
                } else {
                        /* If keepalive was lost due to local congestion,
                         * try harder.
                         */
                        elapsed = TCP_RESOURCE_PROBE_INTERVAL;
                }
        } else {
                /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
                elapsed = keepalive_time_when(tp) - elapsed;
        }

        TCP_CHECK_TIMER(sk);
        tcp_mem_reclaim(sk);

resched:
        tcp_reset_keepalive_timer (sk, elapsed);
        goto out;

death:
        tcp_done(sk);

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}