/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.71 2000/01/18 08:24:19 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>

int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries = TCP_ORPHAN_RETRIES;

static void tcp_retransmit_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_probe_timer(unsigned long);
static void tcp_keepalive_timer(unsigned long data);
static void tcp_twkill(unsigned long);

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";

/*
 *	Using different timers for retransmit, delayed acks and probes
 *	We may wish to use just one timer maintaining a list of expire jiffies
 *	to optimize.
 */

void tcp_init_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	spin_lock_init(&sk->timer_lock);

	init_timer(&tp->retransmit_timer);
	tp->retransmit_timer.function = &tcp_retransmit_timer;
	tp->retransmit_timer.data = (unsigned long) sk;

	init_timer(&tp->delack_timer);
	tp->delack_timer.function = &tcp_delack_timer;
	tp->delack_timer.data = (unsigned long) sk;

	init_timer(&tp->probe_timer);
	tp->probe_timer.function = &tcp_probe_timer;
	tp->probe_timer.data = (unsigned long) sk;

	init_timer(&sk->timer);
	sk->timer.function = &tcp_keepalive_timer;
	sk->timer.data = (unsigned long) sk;
}

/*
 *	Reset the retransmission timer
 */

void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	spin_lock_bh(&sk->timer_lock);
	switch (what) {
	case TCP_TIME_RETRANS:
		/* When setting the transmit timer the probe timer
		 * should not be set.
		 * The delayed ack timer can be set if we are changing the
		 * retransmit timer when removing acked frames.
		 */
		if (tp->probe_timer.prev && del_timer(&tp->probe_timer))
			__sock_put(sk);
		if (!tp->retransmit_timer.prev || !del_timer(&tp->retransmit_timer))
			sock_hold(sk);
		if (when > TCP_RTO_MAX) {
			printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk));
			when = TCP_RTO_MAX;
		}
		mod_timer(&tp->retransmit_timer, jiffies+when);
		break;

	case TCP_TIME_DACK:
		if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer))
			sock_hold(sk);
		mod_timer(&tp->delack_timer, jiffies+when);
		break;

	case TCP_TIME_PROBE0:
		if (!tp->probe_timer.prev || !del_timer(&tp->probe_timer))
			sock_hold(sk);
		mod_timer(&tp->probe_timer, jiffies+when);
		break;

	default:
		printk(KERN_DEBUG "bug: unknown timer value\n");
	}
	spin_unlock_bh(&sk->timer_lock);
}

void tcp_clear_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	spin_lock_bh(&sk->timer_lock);
	if (tp->retransmit_timer.prev && del_timer(&tp->retransmit_timer))
		__sock_put(sk);
	if (tp->delack_timer.prev && del_timer(&tp->delack_timer))
		__sock_put(sk);
	tp->ack.blocked = 0;
	if (tp->probe_timer.prev && del_timer(&tp->probe_timer))
		__sock_put(sk);
	if (sk->timer.prev && del_timer(&sk->timer))
		__sock_put(sk);
	spin_unlock_bh(&sk->timer_lock);
}

static void tcp_write_err(struct sock *sk)
{
	sk->err = sk->err_soft ? : ETIMEDOUT;
	sk->error_report(sk);

	tcp_done(sk);
}

/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int retry_until;

	if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
		if (tp->retransmits)
			dst_negative_advice(&sk->dst_cache);
		retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
	} else {
		if (tp->retransmits >= sysctl_tcp_retries1) {
			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
			   hole detection. :-(

			   This is the place to implement it, but it is not
			   implemented. I do not want to implement it. It is
			   disgusting. It does not work in any case. Let me cite
			   the same draft, which requires us to implement this:

   "The one security concern raised by this memo is that ICMP black holes
   are often caused by over-zealous security administrators who block
   all ICMP messages.  It is vitally important that those who design and
   deploy security systems understand the impact of strict filtering on
   upper-layer protocols.  The safest web site in the world is worthless
   if most TCP implementations cannot transfer data from it.  It would
   be far nicer to have all of the black holes fixed rather than fixing
   all of the TCP implementations."

			   Golden words :-).
			 */

			dst_negative_advice(&sk->dst_cache);
		}
		retry_until = sysctl_tcp_retries2;
		if (sk->dead)
			retry_until = sysctl_tcp_orphan_retries;
	}

	if (tp->retransmits >= retry_until) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}

static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tp->ack.blocked = 1;
		NET_INC_STATS_BH(DelayedACKLocked);
		tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN);
		goto out_unlock;
	}

	if (tp->ack.pending) {
		/* Delayed ACK missed: inflate ATO, leave pingpong mode */
		tp->ack.ato = min(tp->ack.ato<<1, TCP_ATO_MAX);
		tp->ack.pingpong = 0;
		tcp_send_ack(sk);
		NET_INC_STATS_BH(DelayedACKs);
	}
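
	/* For illustration (the figures are only an example): each missed
	 * delayed ACK doubles the ack timeout above, so an ato of 40 ms
	 * grows to 80 ms, then 160 ms, and so on, until the min() clamps
	 * it at TCP_ATO_MAX.
	 */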
	TCP_CHECK_TIMER(sk);

out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

static void tcp_probe_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int max_probes;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, HZ/5);
		goto out_unlock;
	}

	if (sk->state == TCP_CLOSE)
		goto out_unlock;

	if (tp->packets_out || !tp->send_head) {
		tp->probes_out = 0;
		goto out_unlock;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 * this behaviour in Solaris down as a bug fix. [AC]
	 *
	 * Let me explain. probes_out is zeroed by incoming ACKs
	 * even if they advertise a zero window. Hence, the connection is
	 * killed only if we received no ACKs for the normal connection
	 * timeout. It is not killed merely because the window stays zero
	 * for some time; the window may be zero until armageddon and even
	 * later. We are in full accordance with the RFCs, only our probe
	 * timer combines both the retransmission timeout and the probe
	 * timeout in one bottle.				--ANK
	 */
	max_probes = sk->dead ? sysctl_tcp_orphan_retries : sysctl_tcp_retries2;
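
	/* Put differently: an orphaned (sk->dead) socket gets the normally
	 * smaller orphan retry budget, an attached one the usual retries2
	 * budget, and since probes_out is reset by every incoming ACK, a
	 * peer that keeps ACKing our zero window probes keeps the connection
	 * alive indefinitely.
	 */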

	if (tp->probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
		TCP_CHECK_TIMER(sk);
	}
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Kill off TIME_WAIT sockets once their lifetime has expired. */
static int tcp_tw_death_row_slot = 0;
int tcp_tw_count = 0;

static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS];
static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
static struct timer_list tcp_tw_timer = { function: tcp_twkill };

static void tcp_twkill(unsigned long data)
{
	struct tcp_tw_bucket *tw;
	int killed = 0;

	/* NOTE: compare this to the previous version where the lock
	 * was released after detaching the chain. It was racy,
	 * because tw buckets are scheduled in a non-serialized context
	 * in 2.3 (with netfilter), and with softnet this is common, because
	 * soft irqs are not sequenced.
	 */
	spin_lock(&tw_death_lock);

	if (tcp_tw_count == 0)
		goto out;

	while ((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) {
		tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death;
		tw->pprev_death = NULL;
		spin_unlock(&tw_death_lock);

		tcp_timewait_kill(tw);
		tcp_tw_put(tw);

		killed++;

		spin_lock(&tw_death_lock);
	}
	tcp_tw_death_row_slot =
		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));

	if ((tcp_tw_count -= killed) != 0)
		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
	net_statistics[smp_processor_id()*2].TimeWaited += killed;
out:
	spin_unlock(&tw_death_lock);
}

/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */

/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
	spin_lock(&tw_death_lock);
	if (tw->pprev_death) {
		if (tw->next_death)
			tw->next_death->pprev_death = tw->pprev_death;
		*tw->pprev_death = tw->next_death;
		tw->pprev_death = NULL;
		tcp_tw_put(tw);
		if (--tcp_tw_count == 0)
			del_timer(&tcp_tw_timer);
	}
	spin_unlock(&tw_death_lock);
}

/* Short-time timewait calendar */

static int tcp_twcal_hand = -1;
static int tcp_twcal_jiffie;
static void tcp_twcal_tick(unsigned long);
static struct timer_list tcp_twcal_timer = {NULL, NULL, 0, 0, tcp_twcal_tick,};
static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];

void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
{
	struct tcp_tw_bucket **tpp;
	int slot;

	/* timeout := RTO * 3.5
	 *
	 * 3.5 = 1+2+0.5 to wait for two retransmits.
	 *
	 * RATIONALE: if the FIN arrived and we entered TIME-WAIT state,
	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
	 * FINs (or previous segments) are lost, the probability of such an
	 * event is p^(N+1), where p is the probability of losing a single
	 * packet, and the time to detect the loss is about RTO*(2^N - 1)
	 * with exponential backoff. The normal timewait length is calculated
	 * so that we wait at least for one retransmitted FIN (the maximal
	 * RTO is 120 sec).
	 * [ BTW Linux, following BSD, violates this requirement by waiting
	 *   only for 60 sec; we should wait at least for 240 secs.
	 *   Well, 240 consumes too much of resources 8)
	 * ]
	 * This interval is not reduced to catch old duplicates and
	 * responses to our wandering segments living for two MSLs.
	 * However, if we use PAWS to detect
	 * old duplicates, we can reduce the interval to bounds required
	 * by RTO, rather than MSL. So, if the peer understands PAWS, we
	 * kill the tw bucket after 3.5*RTO (it is important that this number
	 * is greater than the TS tick!) and detect old duplicates with the
	 * help of PAWS.
	 */
	slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
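
	/* Worked example of the rounding above: the shift divides timeo by
	 * 2^TCP_TW_RECYCLE_TICK and rounds up, so a timeout of 9 jiffies
	 * with a recycle tick of 4 jiffies (TCP_TW_RECYCLE_TICK == 2, used
	 * here purely as an example value) lands in slot 3.
	 */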

	spin_lock(&tw_death_lock);

	/* Unlink it, if it was scheduled */
	if (tw->pprev_death) {
		if (tw->next_death)
			tw->next_death->pprev_death = tw->pprev_death;
		*tw->pprev_death = tw->next_death;
		tw->pprev_death = NULL;
		tcp_tw_count--;
	} else
		atomic_inc(&tw->refcnt);

	if (slot >= TCP_TW_RECYCLE_SLOTS) {
		/* Schedule to slow timer */
		if (timeo >= TCP_TIMEWAIT_LEN) {
			slot = TCP_TWKILL_SLOTS-1;
		} else {
			slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
			if (slot >= TCP_TWKILL_SLOTS)
				slot = TCP_TWKILL_SLOTS-1;
		}
		tw->ttd = jiffies + timeo;
		slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
		tpp = &tcp_tw_death_row[slot];
	} else {
		tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK);

		if (tcp_twcal_hand < 0) {
			tcp_twcal_hand = 0;
			tcp_twcal_jiffie = jiffies;
			tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
			add_timer(&tcp_twcal_timer);
		} else {
			if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK))
				mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
			slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
		}
		tpp = &tcp_twcal_row[slot];
	}

	if ((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	if (tcp_tw_count++ == 0)
		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
	spin_unlock(&tw_death_lock);
}

void tcp_twcal_tick(unsigned long dummy)
{
	int n, slot;
	unsigned long j;
	unsigned long now = jiffies;
	int killed = 0;
	int adv = 0;

	spin_lock(&tw_death_lock);
	if (tcp_twcal_hand < 0)
		goto out;

	slot = tcp_twcal_hand;
	j = tcp_twcal_jiffie;

	for (n = 0; n < TCP_TW_RECYCLE_SLOTS; n++) {
		if ((long)(j - now) <= 0) {
			struct tcp_tw_bucket *tw;

			while ((tw = tcp_twcal_row[slot]) != NULL) {
				tcp_twcal_row[slot] = tw->next_death;
				tw->pprev_death = NULL;

				tcp_timewait_kill(tw);
				tcp_tw_put(tw);
				killed++;
			}
		} else {
			if (!adv) {
				adv = 1;
				tcp_twcal_jiffie = j;
				tcp_twcal_hand = slot;
			}

			if (tcp_twcal_row[slot] != NULL) {
				mod_timer(&tcp_twcal_timer, j);
				goto out;
			}
		}
		j += (1<<TCP_TW_RECYCLE_TICK);
		slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
	}
	tcp_twcal_hand = -1;

out:
	if ((tcp_tw_count -= killed) == 0)
		del_timer(&tcp_tw_timer);
	net_statistics[smp_processor_id()*2].TimeWaitKilled += killed;
	spin_unlock(&tw_death_lock);
}

/*
 *	The TCP retransmit timer.
 */

static void tcp_retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later */
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, HZ/20);
		goto out_unlock;
	}

	if (sk->state == TCP_CLOSE || tp->packets_out == 0)
		goto out_unlock;

	BUG_TRAP(!skb_queue_empty(&sk->write_queue));

	if (tcp_write_timeout(sk))
		goto out_unlock;

	/* RFC 2018: clear all 'sacked' flags in the retransmission queue;
	 * the receiver may have dropped out-of-order frames and we must
	 * send them out should this timer fire on us.
	 */
	if (tp->sack_ok) {
		struct sk_buff *skb = skb_peek(&sk->write_queue);

		while ((skb != NULL) &&
		       (skb != tp->send_head) &&
		       (skb != (struct sk_buff *)&sk->write_queue)) {
			TCP_SKB_CB(skb)->sacked &=
				~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS);
			skb = skb->next;
		}
	}

	/* Retransmission. */
	tp->retrans_head = NULL;
	tp->rexmt_done = 0;
	tp->fackets_out = 0;
	tp->retrans_out = 0;
	if (tp->retransmits == 0) {
		/* Remember window where we lost:
		 * "one half of the current window but at least 2 segments"
		 *
		 * Here "current window" means the effective one, which
		 * means it must be an accurate representation of our current
		 * sending rate _and_ the snd_wnd.
		 */
		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
		tp->snd_cwnd_cnt = 0;
		tp->snd_cwnd = 1;
	}
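
	/* For instance (figures purely illustrative): if snd_cwnd was 10
	 * segments when the timer fired, tcp_recalc_ssthresh() records a
	 * threshold of about 5 segments ("one half of the current window
	 * but at least 2 segments"), and the congestion window above
	 * collapses to a single segment for the new slow start.
	 */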

	tp->dup_acks = 0;
	tp->high_seq = tp->snd_nxt;
	if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
		/* Retransmission failed because of local congestion,
		 * do not backoff.
		 */
		if (!tp->retransmits)
			tp->retransmits = 1;
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
		TCP_CHECK_TIMER(sk);
		goto out_unlock;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	tp->backoff++;
	tp->retransmits++;
	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
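	/* E.g. an rto of 3 seconds backs off to 6, 12, 24, 48 and 96 seconds
	 * on successive expiries, and then stays pinned at TCP_RTO_MAX (the
	 * 120 second clamp discussed above).
	 */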
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	TCP_CHECK_TIMER(sk);

out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 *	Timer for listening sockets
 */

static void tcp_synack_timer(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct open_request **reqp, *req;
	int i, budget;

	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to an established socket) within the first timeout.
	 * If the synack was not acknowledged for 3 seconds, it means
	 * one of the following things: the synack was lost, the ack was lost,
	 * rtt is high or nobody planned to ack (i.e. synflood).
	 * When the server is a bit loaded, the queue is populated with old
	 * open requests, reducing the effective size of the queue.
	 * When the server is well loaded, the queue size reduces to zero
	 * after several minutes of work. This is not a synflood,
	 * it is normal operation. The solution is pruning
	 * too old entries, overriding the normal timeout, when
	 * the situation becomes dangerous.
	 *
	 * Essentially, we reserve half of the room for young
	 * embryos; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
		int young = (lopt->qlen_young<<1);

		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}
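
	/* Rough illustration: suppose max_retries is 5 and the table is more
	 * than half full with only an eighth of the entries still young.
	 * young starts at a quarter of qlen and doubles each pass, so the
	 * loop above runs three times and thresh drops from 5 to 2: old,
	 * repeatedly retransmitted requests are then expired much sooner.
	 */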

	if (tp->defer_accept)
		max_retries = tp->defer_accept;

	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
	i = lopt->clock_hand;

	do {
		reqp = &lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if ((long)(now - req->expires) >= 0) {
				if ((req->retrans < thresh ||
				     (req->acked && req->retrans < max_retries))
				    && !req->class->rtx_syn_ack(sk, req, NULL)) {
					unsigned long timeo;

					if (req->retrans++ == 0)
						lopt->qlen_young--;
					timeo = min((TCP_TIMEOUT_INIT << req->retrans),
						    TCP_RTO_MAX);
					req->expires = now + timeo;
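					/* With the usual 3 second initial
					 * timeout (see the comment above),
					 * the wait before the next retry
					 * grows to roughly 6, 12, 24, ...
					 * seconds, never exceeding
					 * TCP_RTO_MAX.
					 */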
					reqp = &req->dl_next;
					continue;
				}

				/* Drop this request */
				write_lock(&tp->syn_wait_lock);
				*reqp = req->dl_next;
				write_unlock(&tp->syn_wait_lock);
				lopt->qlen--;
				if (req->retrans == 0)
					lopt->qlen_young--;
				tcp_openreq_free(req);
				/* *reqp already points at the next request;
				 * req itself is gone, so do not touch it.
				 */
				continue;
			}
			reqp = &req->dl_next;
		}

		i = (i+1)&(TCP_SYNQ_HSIZE-1);

	} while (--budget > 0);

	lopt->clock_hand = i;

	if (lopt->qlen)
		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}

void tcp_delete_keepalive_timer(struct sock *sk)
{
	spin_lock_bh(&sk->timer_lock);
	if (sk->timer.prev && del_timer(&sk->timer))
		__sock_put(sk);
	spin_unlock_bh(&sk->timer_lock);
}

void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	spin_lock_bh(&sk->timer_lock);
	if (!sk->timer.prev || !del_timer(&sk->timer))
		sock_hold(sk);
	mod_timer(&sk->timer, jiffies+len);
	spin_unlock_bh(&sk->timer_lock);
}

void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))
		return;

	if (val && !sk->keepopen)
		tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
	else if (!val)
		tcp_delete_keepalive_timer(sk);
}

static void tcp_keepalive_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_keepalive_timer(sk, HZ/20);
		goto out;
	}

	if (sk->state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}

	if (sk->state == TCP_FIN_WAIT2 && sk->dead) {
		if (tp->linger2 >= 0) {
			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}

	if (!sk->keepopen || sk->state == TCP_CLOSE)
		goto out;

	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || tp->send_head)
		goto resched;

	elapsed = tcp_time_stamp - tp->rcv_tstamp;

	if (elapsed >= keepalive_time_when(tp)) {
		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
		    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		if (tcp_write_wakeup(sk) <= 0) {
			tp->probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
		elapsed = keepalive_time_when(tp) - elapsed;
	}
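
	/* So with the usual two hour keepalive time: if only half of it has
	 * passed since the last ACK, the timer is simply re-armed for the
	 * remaining hour; once the full period has elapsed, probes go out
	 * every keepalive interval until probes_out exceeds the limit
	 * checked above.
	 */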

	TCP_CHECK_TIMER(sk);

resched:
	tcp_reset_keepalive_timer(sk, elapsed);
	goto out;

death:
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}