2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_timer.c,v 1.80 2000/10/03 07:29:01 anton Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
25 int sysctl_tcp_syn_retries
= TCP_SYN_RETRIES
;
26 int sysctl_tcp_synack_retries
= TCP_SYNACK_RETRIES
;
27 int sysctl_tcp_keepalive_time
= TCP_KEEPALIVE_TIME
;
28 int sysctl_tcp_keepalive_probes
= TCP_KEEPALIVE_PROBES
;
29 int sysctl_tcp_keepalive_intvl
= TCP_KEEPALIVE_INTVL
;
30 int sysctl_tcp_retries1
= TCP_RETR1
;
31 int sysctl_tcp_retries2
= TCP_RETR2
;
32 int sysctl_tcp_orphan_retries
;
34 static void tcp_write_timer(unsigned long);
35 static void tcp_delack_timer(unsigned long);
36 static void tcp_keepalive_timer (unsigned long data
);
38 const char timer_bug_msg
[] = KERN_DEBUG
"tcpbug: unknown timer value\n";
/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish use just one timer maintaining a list of expire jiffies
 * to use.
 */
46 void tcp_init_xmit_timers(struct sock
*sk
)
48 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
50 init_timer(&tp
->retransmit_timer
);
51 tp
->retransmit_timer
.function
=&tcp_write_timer
;
52 tp
->retransmit_timer
.data
= (unsigned long) sk
;
55 init_timer(&tp
->delack_timer
);
56 tp
->delack_timer
.function
=&tcp_delack_timer
;
57 tp
->delack_timer
.data
= (unsigned long) sk
;
60 init_timer(&sk
->timer
);
61 sk
->timer
.function
=&tcp_keepalive_timer
;
62 sk
->timer
.data
= (unsigned long) sk
;
65 void tcp_clear_xmit_timers(struct sock
*sk
)
67 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
70 if (timer_pending(&tp
->retransmit_timer
) &&
71 del_timer(&tp
->retransmit_timer
))
76 if (timer_pending(&tp
->delack_timer
) &&
77 del_timer(&tp
->delack_timer
))
80 if(timer_pending(&sk
->timer
) && del_timer(&sk
->timer
))
84 static void tcp_write_err(struct sock
*sk
)
86 sk
->err
= sk
->err_soft
? : ETIMEDOUT
;
90 NET_INC_STATS_BH(TCPAbortOnTimeout
);
/* Do not allow orphaned sockets to eat all our resources.
 * This is direct violation of TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero probe timeout occurs on orphaned socket.
 *
 * Criterium is still not confirmed experimentally and may change.
 * We kill the socket, if:
 * 1. If number of orphaned sockets exceeds an administratively configured
 *    limit.
 * 2. If we have strong memory pressure.
 */
104 static int tcp_out_of_resources(struct sock
*sk
, int do_reset
)
106 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
107 int orphans
= atomic_read(&tcp_orphan_count
);
109 /* If peer does not open window for long time, or did not transmit
110 * anything for long time, penalize it. */
111 if ((s32
)(tcp_time_stamp
- tp
->lsndtime
) > 2*TCP_RTO_MAX
|| !do_reset
)
114 /* If some dubious ICMP arrived, penalize even more. */
118 if (orphans
>= sysctl_tcp_max_orphans
||
119 (sk
->wmem_queued
> SOCK_MIN_SNDBUF
&&
120 atomic_read(&tcp_memory_allocated
) > sysctl_tcp_mem
[2])) {
122 printk(KERN_INFO
"Out of socket memory\n");
124 /* Catch exceptional cases, when connection requires reset.
125 * 1. Last segment was sent recently. */
126 if ((s32
)(tcp_time_stamp
- tp
->lsndtime
) <= TCP_TIMEWAIT_LEN
||
127 /* 2. Window is closed. */
128 (!tp
->snd_wnd
&& !tp
->packets_out
))
131 tcp_send_active_reset(sk
, GFP_ATOMIC
);
133 NET_INC_STATS_BH(TCPAbortOnMemory
);
139 /* Calculate maximal number or retries on an orphaned socket. */
140 static int tcp_orphan_retries(struct sock
*sk
, int alive
)
142 int retries
= sysctl_tcp_orphan_retries
; /* May be zero. */
144 /* We know from an ICMP that something is wrong. */
145 if (sk
->err_soft
&& !alive
)
148 /* However, if socket sent something recently, select some safe
149 * number of retries. 8 corresponds to >100 seconds with minimal
151 if (retries
== 0 && alive
)
156 /* A write timeout has occurred. Process the after effects. */
157 static int tcp_write_timeout(struct sock
*sk
)
159 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
162 if ((1<<sk
->state
)&(TCPF_SYN_SENT
|TCPF_SYN_RECV
)) {
164 dst_negative_advice(&sk
->dst_cache
);
165 retry_until
= tp
->syn_retries
? : sysctl_tcp_syn_retries
;
167 if (tp
->retransmits
>= sysctl_tcp_retries1
) {
168 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
171 It is place to make it. It is not made. I do not want
172 to make it. It is disguisting. It does not work in any
173 case. Let me to cite the same draft, which requires for
174 us to implement this:
176 "The one security concern raised by this memo is that ICMP black holes
177 are often caused by over-zealous security administrators who block
178 all ICMP messages. It is vitally important that those who design and
179 deploy security systems understand the impact of strict filtering on
180 upper-layer protocols. The safest web site in the world is worthless
181 if most TCP implementations cannot transfer data from it. It would
182 be far nicer to have all of the black holes fixed rather than fixing
183 all of the TCP implementations."
188 dst_negative_advice(&sk
->dst_cache
);
191 retry_until
= sysctl_tcp_retries2
;
193 int alive
= (tp
->rto
< TCP_RTO_MAX
);
195 retry_until
= tcp_orphan_retries(sk
, alive
);
197 if (tcp_out_of_resources(sk
, alive
|| tp
->retransmits
< retry_until
))
202 if (tp
->retransmits
>= retry_until
) {
203 /* Has it gone just too far? */
210 static void tcp_delack_timer(unsigned long data
)
212 struct sock
*sk
= (struct sock
*)data
;
213 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
216 if (sk
->lock
.users
) {
217 /* Try again later. */
219 NET_INC_STATS_BH(DelayedACKLocked
);
220 if (!mod_timer(&tp
->delack_timer
, jiffies
+ TCP_DELACK_MIN
))
227 if (sk
->state
== TCP_CLOSE
|| !(tp
->ack
.pending
&TCP_ACK_TIMER
))
230 if ((long)(tp
->ack
.timeout
- jiffies
) > 0) {
231 if (!mod_timer(&tp
->delack_timer
, tp
->ack
.timeout
))
235 tp
->ack
.pending
&= ~TCP_ACK_TIMER
;
237 if (skb_queue_len(&tp
->ucopy
.prequeue
)) {
240 net_statistics
[smp_processor_id()*2].TCPSchedulerFailed
+= skb_queue_len(&tp
->ucopy
.prequeue
);
242 while ((skb
= __skb_dequeue(&tp
->ucopy
.prequeue
)) != NULL
)
243 sk
->backlog_rcv(sk
, skb
);
245 tp
->ucopy
.memory
= 0;
248 if (tcp_ack_scheduled(tp
)) {
249 if (!tp
->ack
.pingpong
) {
250 /* Delayed ACK missed: inflate ATO. */
251 tp
->ack
.ato
= min(tp
->ack
.ato
<<1, tp
->rto
);
253 /* Delayed ACK missed: leave pingpong mode and
256 tp
->ack
.pingpong
= 0;
257 tp
->ack
.ato
= TCP_ATO_MIN
;
260 NET_INC_STATS_BH(DelayedACKs
);
265 if (tcp_memory_pressure
)
272 static void tcp_probe_timer(struct sock
*sk
)
274 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
277 if (tp
->packets_out
|| !tp
->send_head
) {
282 /* *WARNING* RFC 1122 forbids this
284 * It doesn't AFAIK, because we kill the retransmit timer -AK
286 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
287 * this behaviour in Solaris down as a bug fix. [AC]
289 * Let me to explain. probes_out is zeroed by incoming ACKs
290 * even if they advertise zero window. Hence, connection is killed only
291 * if we received no ACKs for normal connection timeout. It is not killed
292 * only because window stays zero for some time, window may be zero
293 * until armageddon and even later. We are in full accordance
294 * with RFCs, only probe timer combines both retransmission timeout
295 * and probe timeout in one bottle. --ANK
297 max_probes
= sysctl_tcp_retries2
;
300 int alive
= ((tp
->rto
<<tp
->backoff
) < TCP_RTO_MAX
);
302 max_probes
= tcp_orphan_retries(sk
, alive
);
304 if (tcp_out_of_resources(sk
, alive
|| tp
->probes_out
<= max_probes
))
308 if (tp
->probes_out
> max_probes
) {
311 /* Only send another probe if we didn't close things up. */
317 * The TCP retransmit timer.
320 static void tcp_retransmit_timer(struct sock
*sk
)
322 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
324 if (tp
->packets_out
== 0)
327 BUG_TRAP(!skb_queue_empty(&sk
->write_queue
));
329 if (tcp_write_timeout(sk
))
332 if (tp
->retransmits
== 0) {
333 if (tp
->ca_state
== TCP_CA_Disorder
|| tp
->ca_state
== TCP_CA_Recovery
) {
335 if (tp
->ca_state
== TCP_CA_Recovery
)
336 NET_INC_STATS_BH(TCPSackRecoveryFail
);
338 NET_INC_STATS_BH(TCPSackFailures
);
340 if (tp
->ca_state
== TCP_CA_Recovery
)
341 NET_INC_STATS_BH(TCPRenoRecoveryFail
);
343 NET_INC_STATS_BH(TCPRenoFailures
);
345 } else if (tp
->ca_state
== TCP_CA_Loss
) {
346 NET_INC_STATS_BH(TCPLossFailures
);
348 NET_INC_STATS_BH(TCPTimeouts
);
352 tcp_enter_loss(sk
, 0);
354 if (tcp_retransmit_skb(sk
, skb_peek(&sk
->write_queue
)) > 0) {
355 /* Retransmission failed because of local congestion,
358 if (!tp
->retransmits
)
360 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
,
361 min(tp
->rto
, TCP_RESOURCE_PROBE_INTERVAL
));
365 /* Increase the timeout each time we retransmit. Note that
366 * we do not increase the rtt estimate. rto is initialized
367 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
368 * that doubling rto each time is the least we can get away with.
369 * In KA9Q, Karn uses this for the first few times, and then
370 * goes to quadratic. netBSD doubles, but only goes up to *64,
371 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
372 * defined in the protocol as the maximum possible RTT. I guess
373 * we'll have to use something other than TCP to talk to the
374 * University of Mars.
376 * PAWS allows us longer timeouts and large windows, so once
377 * implemented ftp to mars will work nicely. We will have to fix
378 * the 120 second clamps though!
382 tp
->rto
= min(tp
->rto
<< 1, TCP_RTO_MAX
);
383 tcp_reset_xmit_timer(sk
, TCP_TIME_RETRANS
, tp
->rto
);
384 if (tp
->retransmits
> sysctl_tcp_retries1
)
390 static void tcp_write_timer(unsigned long data
)
392 struct sock
*sk
= (struct sock
*)data
;
393 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
397 if (sk
->lock
.users
) {
398 /* Try again later */
399 if (!mod_timer(&tp
->retransmit_timer
, jiffies
+ (HZ
/20)))
404 if (sk
->state
== TCP_CLOSE
|| !tp
->pending
)
407 if ((long)(tp
->timeout
- jiffies
) > 0) {
408 if (!mod_timer(&tp
->retransmit_timer
, tp
->timeout
))
417 case TCP_TIME_RETRANS
:
418 tcp_retransmit_timer(sk
);
420 case TCP_TIME_PROBE0
:
434 * Timer for listening sockets
437 static void tcp_synack_timer(struct sock
*sk
)
439 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
440 struct tcp_listen_opt
*lopt
= tp
->listen_opt
;
441 int max_retries
= tp
->syn_retries
? : sysctl_tcp_synack_retries
;
442 int thresh
= max_retries
;
443 unsigned long now
= jiffies
;
444 struct open_request
**reqp
, *req
;
447 if (lopt
== NULL
|| lopt
->qlen
== 0)
450 /* Normally all the openreqs are young and become mature
451 * (i.e. converted to established socket) for first timeout.
452 * If synack was not acknowledged for 3 seconds, it means
453 * one of the following things: synack was lost, ack was lost,
454 * rtt is high or nobody planned to ack (i.e. synflood).
455 * When server is a bit loaded, queue is populated with old
456 * open requests, reducing effective size of queue.
457 * When server is well loaded, queue size reduces to zero
458 * after several minutes of work. It is not synflood,
459 * it is normal operation. The solution is pruning
460 * too old entries overriding normal timeout, when
461 * situation becomes dangerous.
463 * Essentially, we reserve half of room for young
464 * embrions; and abort old ones without pity, if old
465 * ones are about to clog our table.
467 if (lopt
->qlen
>>(lopt
->max_qlen_log
-1)) {
468 int young
= (lopt
->qlen_young
<<1);
471 if (lopt
->qlen
< young
)
478 if (tp
->defer_accept
)
479 max_retries
= tp
->defer_accept
;
481 budget
= 2*(TCP_SYNQ_HSIZE
/(TCP_TIMEOUT_INIT
/TCP_SYNQ_INTERVAL
));
482 i
= lopt
->clock_hand
;
485 reqp
=&lopt
->syn_table
[i
];
486 while ((req
= *reqp
) != NULL
) {
487 if ((long)(now
- req
->expires
) >= 0) {
488 if ((req
->retrans
< thresh
||
489 (req
->acked
&& req
->retrans
< max_retries
))
490 && !req
->class->rtx_syn_ack(sk
, req
, NULL
)) {
493 if (req
->retrans
++ == 0)
495 timeo
= min((TCP_TIMEOUT_INIT
<< req
->retrans
),
497 req
->expires
= now
+ timeo
;
498 reqp
= &req
->dl_next
;
502 /* Drop this request */
503 write_lock(&tp
->syn_wait_lock
);
504 *reqp
= req
->dl_next
;
505 write_unlock(&tp
->syn_wait_lock
);
507 if (req
->retrans
== 0)
509 tcp_openreq_free(req
);
512 reqp
= &req
->dl_next
;
515 i
= (i
+1)&(TCP_SYNQ_HSIZE
-1);
517 } while (--budget
> 0);
519 lopt
->clock_hand
= i
;
522 tcp_reset_keepalive_timer(sk
, TCP_SYNQ_INTERVAL
);
525 void tcp_delete_keepalive_timer (struct sock
*sk
)
527 if (timer_pending(&sk
->timer
) && del_timer (&sk
->timer
))
531 void tcp_reset_keepalive_timer (struct sock
*sk
, unsigned long len
)
533 if (!mod_timer(&sk
->timer
, jiffies
+len
))
537 void tcp_set_keepalive(struct sock
*sk
, int val
)
539 if ((1<<sk
->state
)&(TCPF_CLOSE
|TCPF_LISTEN
))
542 if (val
&& !sk
->keepopen
)
543 tcp_reset_keepalive_timer(sk
, keepalive_time_when(&sk
->tp_pinfo
.af_tcp
));
545 tcp_delete_keepalive_timer(sk
);
549 static void tcp_keepalive_timer (unsigned long data
)
551 struct sock
*sk
= (struct sock
*) data
;
552 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
555 /* Only process if socket is not in use. */
557 if (sk
->lock
.users
) {
558 /* Try again later. */
559 tcp_reset_keepalive_timer (sk
, HZ
/20);
563 if (sk
->state
== TCP_LISTEN
) {
564 tcp_synack_timer(sk
);
568 if (sk
->state
== TCP_FIN_WAIT2
&& sk
->dead
) {
569 if (tp
->linger2
>= 0) {
570 int tmo
= tcp_fin_time(tp
) - TCP_TIMEWAIT_LEN
;
573 tcp_time_wait(sk
, TCP_FIN_WAIT2
, tmo
);
577 tcp_send_active_reset(sk
, GFP_ATOMIC
);
581 if (!sk
->keepopen
|| sk
->state
== TCP_CLOSE
)
584 elapsed
= keepalive_time_when(tp
);
586 /* It is alive without keepalive 8) */
587 if (tp
->packets_out
|| tp
->send_head
)
590 elapsed
= tcp_time_stamp
- tp
->rcv_tstamp
;
592 if (elapsed
>= keepalive_time_when(tp
)) {
593 if ((!tp
->keepalive_probes
&& tp
->probes_out
>= sysctl_tcp_keepalive_probes
) ||
594 (tp
->keepalive_probes
&& tp
->probes_out
>= tp
->keepalive_probes
)) {
595 tcp_send_active_reset(sk
, GFP_ATOMIC
);
599 if (tcp_write_wakeup(sk
) <= 0) {
601 elapsed
= keepalive_intvl_when(tp
);
603 /* If keepalive was lost due to local congestion,
606 elapsed
= TCP_RESOURCE_PROBE_INTERVAL
;
609 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
610 elapsed
= keepalive_time_when(tp
) - elapsed
;
617 tcp_reset_keepalive_timer (sk
, elapsed
);