 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.63 1999/05/15 23:02:21 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
static void tcp_sltimer_handler(unsigned long);
static void tcp_syn_recv_timer(unsigned long);
static void tcp_keepalive(unsigned long data);
static void tcp_bucketgc(unsigned long);
static void tcp_twkill(unsigned long);
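/* All of the low-frequency timer work below (SYN-ACK retransmission,
 * keepalive probing, TIME_WAIT reaping and bind-bucket garbage collection)
 * is multiplexed onto a single slow timer; tcp_sltimer_handler() walks the
 * per-class entries in tcp_slt_array and fires whichever ones are due.
 */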
struct timer_list tcp_slow_timer = {
struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
	{ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},	/* SYNACK	*/
	{ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive},	/* KEEPALIVE	*/
	{ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill},		/* TWKILL	*/
	{ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD, 0, tcp_bucketgc}		/* BUCKETGC	*/
};
const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies.
 */
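/* Wire up the three per-socket transmit-side timers (retransmit, delayed
 * ACK and zero-window probe) and hand each of them the socket as its
 * callback argument.
 */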
void tcp_init_xmit_timers(struct sock *sk)
{
	init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
	sk->tp_pinfo.af_tcp.retransmit_timer.function = &tcp_retransmit_timer;
	sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;

	init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
	sk->tp_pinfo.af_tcp.delack_timer.function = &tcp_delack_timer;
	sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;

	init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
	sk->tp_pinfo.af_tcp.probe_timer.function = &tcp_probe_timer;
	sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
}
/* Reset the retransmission timer. */
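/* 'what' selects which timer to re-arm; TIME_RETRANS, TIME_DACK and
 * TIME_PROBE0 are the values used by callers elsewhere in this file, and
 * anything unrecognised is reported as a bug.
 */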
void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	switch (what) {
	case TIME_RETRANS:
		/* When setting the transmit timer the probe timer
		 * should not be left running.
		 * The delayed ack timer can be set if we are changing the
		 * retransmit timer when removing acked frames.
		 */
		if (tp->probe_timer.prev)
			del_timer(&tp->probe_timer);
		mod_timer(&tp->retransmit_timer, jiffies + when);
		break;

	case TIME_DACK:
		mod_timer(&tp->delack_timer, jiffies + when);
		break;

	case TIME_PROBE0:
		mod_timer(&tp->probe_timer, jiffies + when);
		break;

	case TIME_WRITE:
		printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
		break;

	default:
		printk(KERN_DEBUG "bug: unknown timer value\n");
	}
}
void tcp_clear_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if (tp->retransmit_timer.prev)
		del_timer(&tp->retransmit_timer);
	if (tp->delack_timer.prev)
		del_timer(&tp->delack_timer);
	if (tp->probe_timer.prev)
		del_timer(&tp->probe_timer);
}
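/* Give up on a connection: report the pending soft error (or ETIMEDOUT if
 * there is none), stop the transmit timers, and then either time-wait or
 * fully close the socket depending on its state and the 'force' argument.
 */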
static int tcp_write_err(struct sock *sk, int force)
{
	sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
	sk->error_report(sk);

	tcp_clear_xmit_timers(sk);

	/* Time wait the socket. */
	if (!force && ((1 << sk->state) &
	    (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING))) {

	} else {
		tcp_set_state(sk, TCP_CLOSE);
	}
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Look for a 'soft' timeout. */
	if ((sk->state == TCP_ESTABLISHED &&
	     tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
	    (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
		dst_negative_advice(&sk->dst_cache);
	}

	/* Have we tried to SYN too many times (repent repent 8)) */
	if (tp->retransmits > sysctl_tcp_syn_retries && sk->state == TCP_SYN_SENT) {
		tcp_write_err(sk, 1);
		/* Don't FIN, we got nothing back */
	}

	/* Has it gone just too far? */
	if (tp->retransmits > sysctl_tcp_retries2)
		return tcp_write_err(sk, 0);
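/* Delayed ACK timer: if ACKs are still owed and the connection is not
 * closed, (re)schedule the delayed acknowledgement.
 */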
void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;

	if (sk->tp_pinfo.af_tcp.delayed_acks &&
	    sk->state != TCP_CLOSE) {

		tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10);
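/* Zero-window probe timer: defer if the socket is locked by the user, give
 * up (reporting any pending soft error) once too many probes have gone
 * unanswered, and otherwise send another window probe.
 */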
void tcp_probe_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);

	/* *WARNING* RFC 1122 forbids this
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 *	  this behaviour in Solaris down as a bug fix. [AC]
	 */
	if (tp->probes_out > sysctl_tcp_retries2) {

		sk->err = sk->err_soft;

		sk->error_report(sk);

		if ((1 << sk->state) &
		    (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING)) {
			/* Time wait the socket. */

		} else {
			tcp_set_state(sk, TCP_CLOSE);
		}

	/* Only send another probe if we didn't close things up. */
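/* Run one keepalive check on a single socket: if the connection has been
 * idle for longer than the keepalive interval, either declare it dead after
 * too many unanswered probes or send another keepalive probe.
 */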
static __inline__ int tcp_keepopen_proc(struct sock *sk)
{
	if ((1 << sk->state) &
	    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT2)) {
		struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
		__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;

		if (elapsed >= sysctl_tcp_keepalive_time) {
			if (tp->probes_out > sysctl_tcp_keepalive_probes) {

				sk->err = sk->err_soft;

				tcp_set_state(sk, TCP_CLOSE);
				sk->shutdown = SHUTDOWN_MASK;

				sk->state_change(sk);
			} else {
				tp->pending = TIME_KEEPOPEN;
				tcp_write_wakeup(sk);
			}
		}
	}
/* Garbage collect TCP bind buckets. */
static void tcp_bucketgc(unsigned long data)
{
	SOCKHASH_LOCK_WRITE_BH();
	for (i = 0; i < tcp_bhash_size; i++) {
		struct tcp_bind_bucket *tb = tcp_bhash[i];

			struct tcp_bind_bucket *next = tb->next;

			if ((tb->owners == NULL) &&
			    !(tb->flags & TCPB_FLAG_LOCKED)) {

				tb->next->pprev = tb->pprev;
				*tb->pprev = tb->next;

				/* Finally, free it up. */
				kmem_cache_free(tcp_bucket_cachep, tb);

	SOCKHASH_UNLOCK_WRITE_BH();

		struct tcp_sl_timer *slt = (struct tcp_sl_timer *) data;

		/* Eat timer references. */
		atomic_sub(reaped, &slt->count);
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
int tcp_tw_death_row_slot = 0;
static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] =
	{ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
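/* tcp_twkill() runs off the slow timer: it detaches everything queued on the
 * current death-row slot, kills those TIME_WAIT buckets, and then advances
 * the slot index, so the array behaves as a small timer wheel.
 */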
static void tcp_twkill(unsigned long data)
{
	struct tcp_tw_bucket *tw;

	/* The death-row tw chains are only ever touched
	 * in BH context so no locking is needed.
	 */
	tw = tcp_tw_death_row[tcp_tw_death_row_slot];
	tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
	tcp_tw_death_row_slot =
		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));

		struct tcp_tw_bucket *next = tw->next_death;

		tcp_timewait_kill(tw);

		struct tcp_sl_timer *slt = (struct tcp_sl_timer *) data;

		atomic_sub(killed, &slt->count);
/* These are always called from BH context. See callers in
 * tcp_input.c to verify this.
 */
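/* New TIME_WAIT buckets are queued on the slot just behind the current kill
 * cursor, so they are only reaped after the wheel has made almost a full
 * revolution.
 */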
void tcp_tw_schedule(struct tcp_tw_bucket *tw)
{
	int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
	struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot];

	SOCKHASH_LOCK_WRITE_BH();
	if ((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	tw->death_slot = slot;
	SOCKHASH_UNLOCK_WRITE_BH();

	tcp_inc_slow_timer(TCP_SLT_TWKILL);
}
/* Happens rarely if at all, no care about scalability here. */
void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
{
	struct tcp_tw_bucket **tpp;
	int slot;

	SOCKHASH_LOCK_WRITE_BH();

	if (tw->next_death)
		tw->next_death->pprev_death = tw->pprev_death;
	*tw->pprev_death = tw->next_death;
	tw->pprev_death = NULL;

	slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
	tpp = &tcp_tw_death_row[slot];
	if ((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	tw->death_slot = slot;
	SOCKHASH_UNLOCK_WRITE_BH();

	/* Timer was incremented when we first entered the table. */
}
/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
	SOCKHASH_LOCK_WRITE_BH();
	if (tw->next_death)
		tw->next_death->pprev_death = tw->pprev_death;
	*tw->pprev_death = tw->next_death;
	tw->pprev_death = NULL;
	SOCKHASH_UNLOCK_WRITE_BH();

	tcp_dec_slow_timer(TCP_SLT_TWKILL);
}
/*
 *	Check all sockets for keepalive timer
 *	Called every 75 seconds
 *	This timer is started by af_inet init routine and is constantly
 *	running.
 *
 *	It might be better to maintain a count of sockets that need it using
 *	setsockopt/tcp_destroy_sk and only set the timer when needed.
 */

/*
 *	Don't send over 5 keepopens at a time to avoid burstiness
 *	on big servers [AC]
 */
#define MAX_KA_PROBES	5

int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;
/* Keepopens are only valid for "established" TCP's, nicely our listener
 * hash gets rid of most of the useless testing, so we run through a couple
 * of the established hash chains each clock tick. -DaveM
 *
 * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes
 * going off for them, so we only need to check the first half of the
 * established hash table, even less testing under heavy load.
 *
 * I _really_ would rather do this by adding a new timer_struct to struct sock,
 * and this way only those who set the keepalive option will get the overhead.
 * The idea is you set it for 2 hours when the sock is first connected, when it
 * does fire off (if at all, most sockets die earlier) you check for the keepalive
 * option and also if the sock has been idle long enough to start probing.
 */
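/* tcp_keepalive() below scans a quarter of the first half of the established
 * hash, i.e. (tcp_ehash_size >> 1) >> 2 chains, per invocation; chain_start
 * is advanced each time so the whole keepalive-relevant half of the table is
 * covered over successive ticks.
 */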
static void tcp_keepalive(unsigned long data)
{
	static int chain_start = 0;

	SOCKHASH_LOCK_READ_BH();
	for (i = chain_start; i < (chain_start + ((tcp_ehash_size >> 1) >> 2)); i++) {

			struct sock *next = sk->next;

			if (sk->keepopen && !sk->lock.users) {
				SOCKHASH_UNLOCK_READ_BH();
				count += tcp_keepopen_proc(sk);
				SOCKHASH_LOCK_READ_BH();
			}

			if (count == sysctl_tcp_max_ka_probes)

	SOCKHASH_UNLOCK_READ_BH();
	chain_start = ((chain_start + ((tcp_ehash_size >> 1) >> 2)) &
		       ((tcp_ehash_size >> 1) - 1));
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1. An initial rtt timeout on the probe0 should cause what we can
 *	   of the first write queue buffer to be split and sent.
 *	2. On a 'major timeout' as defined by RFC1122 we shouldn't report
 *	   ETIMEDOUT if we know an additional 'soft' error caused this.
 *	   tcp_err should save a 'soft error' for us.
 *	   [Unless someone has broken it then it does, except for one 2.0
 *	    broken case of a send when the route/device is directly unreachable,
 *	    and we error but should retry! - FIXME] [AC]
 */
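/* On expiry we either stop retransmitting (socket was reset) or retry later
 * (socket locked by the user); otherwise we clear any recorded SACK state,
 * retransmit the head of the write queue, back the timeout off and re-arm
 * the timer, and finally let tcp_write_timeout() judge whether the
 * connection has failed.
 */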
void tcp_retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	/* We are reset. We will send no more retransmits. */
		tcp_clear_xmit_timer(sk, TIME_RETRANS);

	if (sk->lock.users) {
		/* Try again later */
		tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);

	/* Clear delay ack timer. */
	tcp_clear_xmit_timer(sk, TIME_DACK);
	/* RFC 2018, clear all 'sacked' flags in retransmission queue,
	 * the sender may have dropped out of order frames and we must
	 * send them out should this timer fire on us.
	 */
	struct sk_buff *skb = skb_peek(&sk->write_queue);

	while ((skb != NULL) &&
	       (skb != tp->send_head) &&
	       (skb != (struct sk_buff *) &sk->write_queue)) {
		TCP_SKB_CB(skb)->sacked &=
			~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS);
		skb = skb->next;
	}
	/* Retransmission. */
	tp->retrans_head = NULL;

	if (tp->retransmits == 0) {
		/* Remember window where we lost:
		 * "one half of the current window but at least 2 segments"
		 *
		 * Here "current window" means the effective one, which
		 * means it must be an accurate representation of our current
		 * sending rate _and_ the snd_wnd.
		 */
		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
		tp->snd_cwnd_cnt = 0;
	}

	tp->high_seq = tp->snd_nxt;
	tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
	/* Increase the timeout each time we retransmit. Note that
	 * we do not increase the rtt estimate. rto is initialized
	 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic. netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT. I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
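	/* For illustration (assuming an initial rto of 3*HZ, i.e. 3 seconds):
	 * successive expirations back the timeout off to 6, 12, 24, ... seconds
	 * until the min() below clamps it at 120 seconds.
	 */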
	tp->rto = min(tp->rto << 1, 120*HZ);
	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);

	tcp_write_timeout(sk);
/*
 *	Slow timer for SYN-RECV sockets
 */
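/* Walk a listening socket's SYN queue: requests that have already been
 * retransmitted too often (sysctl_tcp_retries1) are destroyed, the rest get
 * their SYN-ACK retransmitted and their expiry pushed further out.
 */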
static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now)
{
	struct open_request *prev, *req;

	prev = (struct open_request *) &tp->syn_wait_queue;
	for (req = tp->syn_wait_queue; req; ) {
		struct open_request *next = req->dl_next;

			tcp_synq_unlink(tp, req, prev);
			if (req->retrans >= sysctl_tcp_retries1) {
				(*req->class->destructor)(req);
				tcp_dec_slow_timer(TCP_SLT_SYNACK);

				tcp_openreq_free(req);
				if (!tp->syn_wait_queue)

				struct open_request *rp;

				(*req->class->rtx_syn_ack)(sk, req);

				timeo = min((TCP_TIMEOUT_INIT << req->retrans),

				req->expires = now + timeo;

				tcp_synq_queue(tp, req);
				if (rp != prev->dl_next)
					prev = prev->dl_next;
/* This now scales very nicely. -DaveM */
static void tcp_syn_recv_timer(unsigned long data)
{
	unsigned long now = jiffies;

	SOCKHASH_LOCK_READ_BH();
	for (i = 0; i < TCP_LHTABLE_SIZE; i++) {
		sk = tcp_listening_hash[i];

			struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

			/* TCP_LISTEN is implied. */
			if (!sk->lock.users && tp->syn_wait_queue)
				tcp_do_syn_queue(sk, tp, now);

	SOCKHASH_UNLOCK_READ_BH();
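/* Shared slow-timer dispatch: walk tcp_slt_array, run every entry whose
 * period has elapsed and which still has outstanding references, and re-arm
 * tcp_slow_timer for the nearest pending trigger.
 */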
void tcp_sltimer_handler(unsigned long data)
{
	struct tcp_sl_timer *slt = tcp_slt_array;
	unsigned long next = ~0UL;
	unsigned long now = jiffies;

	for (i = 0; i < TCP_SLT_MAX; i++, slt++) {
		if (atomic_read(&slt->count)) {

			trigger = slt->period - ((long)(now - slt->last));

				(*slt->handler)((unsigned long) slt);

				trigger = slt->period;

			/* Only reschedule if some events remain. */
			if (atomic_read(&slt->count))
				next = min(next, trigger);
		}
	}

		mod_timer(&tcp_slow_timer, (now + next));
void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
{
	unsigned long now = jiffies;
	unsigned long when;

	when = now + slt->period;

	if (tcp_slow_timer.prev) {
		if ((long)(tcp_slow_timer.expires - when) >= 0)
			mod_timer(&tcp_slow_timer, when);
	} else {
		tcp_slow_timer.expires = when;
		add_timer(&tcp_slow_timer);
	}