/* [davej-history.git] net/ipv4/tcp_timer.c (pre-2.3.4) */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.63 1999/05/15 23:02:21 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
#include <net/tcp.h>

int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;

static void tcp_sltimer_handler(unsigned long);
static void tcp_syn_recv_timer(unsigned long);
static void tcp_keepalive(unsigned long data);
static void tcp_bucketgc(unsigned long);
static void tcp_twkill(unsigned long);

struct timer_list tcp_slow_timer = {
	NULL, NULL,
	0, 0,
	tcp_sltimer_handler,
};

struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
	{ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},	/* SYNACK	*/
	{ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive},	/* KEEPALIVE	*/
	{ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill},		/* TWKILL	*/
	{ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD, 0, tcp_bucketgc}		/* BUCKETGC	*/
};

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */

void tcp_init_xmit_timers(struct sock *sk)
{
	init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
	sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer;
	sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;

	init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
	sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer;
	sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;

	init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
	sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer;
	sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
}
/*
 *	Reset the retransmission timer
 */

void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	switch (what) {
	case TIME_RETRANS:
		/* When setting the transmit timer the probe timer
		 * should not be set.
		 * The delayed ack timer can be set if we are changing the
		 * retransmit timer when removing acked frames.
		 */
		if(tp->probe_timer.prev)
			del_timer(&tp->probe_timer);
		mod_timer(&tp->retransmit_timer, jiffies+when);
		break;

	case TIME_DACK:
		mod_timer(&tp->delack_timer, jiffies+when);
		break;

	case TIME_PROBE0:
		mod_timer(&tp->probe_timer, jiffies+when);
		break;

	case TIME_WRITE:
		printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
		break;

	default:
		printk(KERN_DEBUG "bug: unknown timer value\n");
	}
}
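/* Typical usage, as seen elsewhere in this file: the retransmit timer is
 * (re)armed with the current backed-off RTO,
 *
 *	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 *
 * while a handler that finds the socket locked by the user simply retries
 * a little later, e.g.
 *
 *	tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
 */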
void tcp_clear_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if(tp->retransmit_timer.prev)
		del_timer(&tp->retransmit_timer);
	if(tp->delack_timer.prev)
		del_timer(&tp->delack_timer);
	if(tp->probe_timer.prev)
		del_timer(&tp->probe_timer);
}
static int tcp_write_err(struct sock *sk, int force)
{
	sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
	sk->error_report(sk);

	tcp_clear_xmit_timers(sk);

	/* Time wait the socket. */
	if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
		tcp_time_wait(sk);
	} else {
		/* Clean up time. */
		tcp_set_state(sk, TCP_CLOSE);
		return 0;
	}
	return 1;
}
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Look for a 'soft' timeout. */
	if ((sk->state == TCP_ESTABLISHED &&
	     tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
	    (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
		dst_negative_advice(&sk->dst_cache);
	}

	/* Have we tried to SYN too many times (repent repent 8)) */
	if(tp->retransmits > sysctl_tcp_syn_retries && sk->state==TCP_SYN_SENT) {
		tcp_write_err(sk, 1);
		/* Don't FIN, we got nothing back */
		return 0;
	}

	/* Has it gone just too far? */
	if (tp->retransmits > sysctl_tcp_retries2)
		return tcp_write_err(sk, 0);

	return 1;
}
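/* In short, the thresholds used above: sysctl_tcp_retries1 (and, on an
 * established socket, every TCP_QUICK_TRIES retransmits) only triggers
 * dst_negative_advice() to look for a better route; sysctl_tcp_syn_retries
 * bounds retransmits while in SYN_SENT; and sysctl_tcp_retries2 is the hard
 * limit after which tcp_write_err() times the connection out.
 */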
void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;

	if(!sk->zapped &&
	   sk->tp_pinfo.af_tcp.delayed_acks &&
	   sk->state != TCP_CLOSE) {
		bh_lock_sock(sk);
		if (!sk->lock.users)
			tcp_send_ack(sk);
		else
			tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10);
		bh_unlock_sock(sk);
	}
}
void tcp_probe_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if(sk->zapped)
		return;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
		bh_unlock_sock(sk);
		return;
	}

	/* *WARNING* RFC 1122 forbids this
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 * this behaviour in Solaris down as a bug fix. [AC]
	 */
	if (tp->probes_out > sysctl_tcp_retries2) {
		if(sk->err_soft)
			sk->err = sk->err_soft;
		else
			sk->err = ETIMEDOUT;
		sk->error_report(sk);

		if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
			/* Time wait the socket. */
			tcp_time_wait(sk);
		} else {
			/* Clean up time. */
			tcp_set_state(sk, TCP_CLOSE);
		}
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
	bh_unlock_sock(sk);
}
static __inline__ int tcp_keepopen_proc(struct sock *sk)
{
	int res = 0;

	if ((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) {
		struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
		__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;

		if (elapsed >= sysctl_tcp_keepalive_time) {
			if (tp->probes_out > sysctl_tcp_keepalive_probes) {
				if(sk->err_soft)
					sk->err = sk->err_soft;
				else
					sk->err = ETIMEDOUT;

				tcp_set_state(sk, TCP_CLOSE);
				sk->shutdown = SHUTDOWN_MASK;
				if (!sk->dead)
					sk->state_change(sk);
			} else {
				tp->probes_out++;
				tp->pending = TIME_KEEPOPEN;
				tcp_write_wakeup(sk);
				res = 1;
			}
		}
	}
	return res;
}
/* Garbage collect TCP bind buckets. */
static void tcp_bucketgc(unsigned long data)
{
	int i, reaped = 0;

	SOCKHASH_LOCK_WRITE_BH();
	for(i = 0; i < tcp_bhash_size; i++) {
		struct tcp_bind_bucket *tb = tcp_bhash[i];

		while(tb) {
			struct tcp_bind_bucket *next = tb->next;

			if((tb->owners == NULL) &&
			   !(tb->flags & TCPB_FLAG_LOCKED)) {
				reaped++;

				/* Unlink bucket. */
				if(tb->next)
					tb->next->pprev = tb->pprev;
				*tb->pprev = tb->next;

				/* Finally, free it up. */
				kmem_cache_free(tcp_bucket_cachep, tb);
			}
			tb = next;
		}
	}
	SOCKHASH_UNLOCK_WRITE_BH();

	if(reaped != 0) {
		struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data;

		/* Eat timer references. */
		atomic_sub(reaped, &slt->count);
	}
}
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
int tcp_tw_death_row_slot = 0;
static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] =
	{ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };

extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
static void tcp_twkill(unsigned long data)
{
	struct tcp_tw_bucket *tw;
	int killed = 0;

	/* The death-row tw chains are only ever touched
	 * in BH context so no locking is needed.
	 */
	tw = tcp_tw_death_row[tcp_tw_death_row_slot];
	tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
	tcp_tw_death_row_slot =
		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));

	while(tw != NULL) {
		struct tcp_tw_bucket *next = tw->next_death;

		tcp_timewait_kill(tw);
		killed++;
		tw = next;
	}
	if(killed != 0) {
		struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data;
		atomic_sub(killed, &slt->count);
	}
}
/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */
void tcp_tw_schedule(struct tcp_tw_bucket *tw)
{
	int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
	struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot];

	SOCKHASH_LOCK_WRITE_BH();
	if((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	tw->death_slot = slot;
	SOCKHASH_UNLOCK_WRITE_BH();

	tcp_inc_slow_timer(TCP_SLT_TWKILL);
}
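/* A note on the wheel arithmetic above: the death row has TCP_TWKILL_SLOTS
 * (here 8) slots, and tcp_twkill() reaps exactly one slot per TCP_TWKILL_PERIOD
 * before advancing tcp_tw_death_row_slot.  Scheduling a new bucket into
 * (current slot - 1) therefore parks it in the slot that will be reaped last,
 * giving it roughly TCP_TWKILL_SLOTS full periods of TIME_WAIT lifetime.
 */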
/* Happens rarely if at all, no care about scalability here. */
void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
{
	struct tcp_tw_bucket **tpp;
	int slot;

	SOCKHASH_LOCK_WRITE_BH();
	if(tw->next_death)
		tw->next_death->pprev_death = tw->pprev_death;
	*tw->pprev_death = tw->next_death;
	tw->pprev_death = NULL;

	slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
	tpp = &tcp_tw_death_row[slot];
	if((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	tw->death_slot = slot;
	SOCKHASH_UNLOCK_WRITE_BH();

	/* Timer was incremented when we first entered the table. */
}
/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
	SOCKHASH_LOCK_WRITE_BH();
	if(tw->next_death)
		tw->next_death->pprev_death = tw->pprev_death;
	*tw->pprev_death = tw->next_death;
	tw->pprev_death = NULL;
	SOCKHASH_UNLOCK_WRITE_BH();

	tcp_dec_slow_timer(TCP_SLT_TWKILL);
}
/*
 *	Check all sockets for keepalive timer
 *	Called every 75 seconds
 *	This timer is started by af_inet init routine and is constantly
 *	running.
 *
 *	It might be better to maintain a count of sockets that need it using
 *	setsockopt/tcp_destroy_sk and only set the timer when needed.
 */

/*
 *	don't send over 5 keepopens at a time to avoid burstiness
 *	on big servers [AC]
 */
#define MAX_KA_PROBES	5

int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;

/* Keepopen's are only valid for "established" TCP's, nicely our listener
 * hash gets rid of most of the useless testing, so we run through a couple
 * of the established hash chains each clock tick.  -DaveM
 *
 * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes
 * going off for them, so we only need check the first half of the established
 * hash table, even less testing under heavy load.
 *
 * I _really_ would rather do this by adding a new timer_struct to struct sock,
 * and this way only those who set the keepalive option will get the overhead.
 * The idea is you set it for 2 hours when the sock is first connected, when it
 * does fire off (if at all, most sockets die earlier) you check for the keepalive
 * option and also if the sock has been idle long enough to start probing.
 */
static void tcp_keepalive(unsigned long data)
{
	static int chain_start = 0;
	int count = 0;
	int i;

	SOCKHASH_LOCK_READ_BH();
	for(i = chain_start; i < (chain_start + ((tcp_ehash_size >> 1) >> 2)); i++) {
		struct sock *sk;

		sk = tcp_ehash[i];
		while(sk) {
			struct sock *next = sk->next;

			bh_lock_sock(sk);
			if (sk->keepopen && !sk->lock.users) {
				SOCKHASH_UNLOCK_READ_BH();
				count += tcp_keepopen_proc(sk);
				SOCKHASH_LOCK_READ_BH();
			}
			bh_unlock_sock(sk);
			if(count == sysctl_tcp_max_ka_probes)
				goto out;
			sk = next;
		}
	}
out:
	SOCKHASH_UNLOCK_READ_BH();
	chain_start = ((chain_start + ((tcp_ehash_size >> 1)>>2)) &
		       ((tcp_ehash_size >> 1) - 1));
}
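/* Worked example of the chunking above, with illustrative numbers only: if
 * tcp_ehash_size were 256, each run would scan (256 >> 1) >> 2 == 32 chains
 * and chain_start would advance by 32 modulo 128, so the keepalive-relevant
 * first half of the established hash gets covered once every four runs.
 */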
/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *	[Unless someone has broken it then it does, except for one 2.0
 *	broken case of a send when the route/device is directly unreachable,
 *	and we error but should retry! - FIXME] [AC]
 */
void tcp_retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	/* We are reset. We will send no more retransmits. */
	if(sk->zapped) {
		tcp_clear_xmit_timer(sk, TIME_RETRANS);
		return;
	}

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later */
		tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);
		bh_unlock_sock(sk);
		return;
	}

	/* Clear delay ack timer. */
	tcp_clear_xmit_timer(sk, TIME_DACK);

	/* RFC 2018, clear all 'sacked' flags in retransmission queue,
	 * the sender may have dropped out of order frames and we must
	 * send them out should this timer fire on us.
	 */
	if(tp->sack_ok) {
		struct sk_buff *skb = skb_peek(&sk->write_queue);

		while((skb != NULL) &&
		      (skb != tp->send_head) &&
		      (skb != (struct sk_buff *)&sk->write_queue)) {
			TCP_SKB_CB(skb)->sacked &=
				~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS);
			skb = skb->next;
		}
	}
	/* Retransmission. */
	tp->retrans_head = NULL;
	tp->rexmt_done = 0;
	tp->fackets_out = 0;
	tp->retrans_out = 0;
	if (tp->retransmits == 0) {
		/* Remember window where we lost:
		 * "one half of the current window but at least 2 segments"
		 *
		 * Here "current window" means the effective one, which
		 * means it must be an accurate representation of our current
		 * sending rate _and_ the snd_wnd.
		 */
		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
		tp->snd_cwnd_cnt = 0;
		tp->snd_cwnd = 1;
	}
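	/* Example of the rule quoted above: if the effective window at the
	 * first timeout was 20 segments, ssthresh drops to 10 segments (but
	 * never below 2), while cwnd restarts from a single segment, i.e. a
	 * fresh slow start up to the new, halved threshold.
	 */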
	tp->retransmits++;

	tp->dup_acks = 0;
	tp->high_seq = tp->snd_nxt;
	tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	tp->backoff++;
	tp->rto = min(tp->rto << 1, 120*HZ);
	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
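	/* Concrete effect of the doubling above: an rto of, say, 3 seconds
	 * becomes 6, 12, 24, 48, 96 and then 120 seconds on successive
	 * timeouts, after which the 120 second clamp holds it steady.
	 */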
	tcp_write_timeout(sk);

	bh_unlock_sock(sk);
}
/*
 *	Slow timer for SYN-RECV sockets
 */

static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now)
{
	struct open_request *prev, *req;

	prev = (struct open_request *) &tp->syn_wait_queue;
	for(req = tp->syn_wait_queue; req; ) {
		struct open_request *next = req->dl_next;

		if (! req->sk) {
			tcp_synq_unlink(tp, req, prev);
			if(req->retrans >= sysctl_tcp_retries1) {
				(*req->class->destructor)(req);
				tcp_dec_slow_timer(TCP_SLT_SYNACK);
				tp->syn_backlog--;
				tcp_openreq_free(req);
				if (! tp->syn_wait_queue)
					break;
			} else {
				unsigned long timeo;
				struct open_request *rp;

				(*req->class->rtx_syn_ack)(sk, req);
				req->retrans++;
				timeo = min((TCP_TIMEOUT_INIT << req->retrans),
					    (120 * HZ));
				req->expires = now + timeo;
				rp = prev->dl_next;
				tcp_synq_queue(tp, req);
				if(rp != prev->dl_next)
					prev = prev->dl_next;
			}
		} else
			prev = req;
		req = next;
	}
}
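/* The retransmission interval above is TCP_TIMEOUT_INIT << req->retrans,
 * clamped to 120 seconds: each unanswered SYN-ACK doubles the wait before
 * the next one, and once req->retrans reaches sysctl_tcp_retries1 the
 * open_request is destroyed rather than requeued.
 */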
/* This now scales very nicely. -DaveM */
static void tcp_syn_recv_timer(unsigned long data)
{
	struct sock *sk;
	unsigned long now = jiffies;
	int i;

	SOCKHASH_LOCK_READ_BH();
	for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
		sk = tcp_listening_hash[i];
		while(sk) {
			struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

			/* TCP_LISTEN is implied. */
			bh_lock_sock(sk);
			if (!sk->lock.users && tp->syn_wait_queue)
				tcp_do_syn_queue(sk, tp, now);
			bh_unlock_sock(sk);
			sk = sk->next;
		}
	}
	SOCKHASH_UNLOCK_READ_BH();
}
void tcp_sltimer_handler(unsigned long data)
{
	struct tcp_sl_timer *slt = tcp_slt_array;
	unsigned long next = ~0UL;
	unsigned long now = jiffies;
	int i;

	for (i=0; i < TCP_SLT_MAX; i++, slt++) {
		if (atomic_read(&slt->count)) {
			long trigger;

			trigger = slt->period - ((long)(now - slt->last));

			if (trigger <= 0) {
				(*slt->handler)((unsigned long) slt);
				slt->last = now;
				trigger = slt->period;
			}

			/* Only reschedule if some events remain. */
			if (atomic_read(&slt->count))
				next = min(next, trigger);
		}
	}
	if (next != ~0UL)
		mod_timer(&tcp_slow_timer, (now + next));
}
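/* Arm the global slow timer on behalf of a slow-timer class: record the
 * class's start time and make sure tcp_slow_timer fires no later than one
 * period from now, pulling an already-pending expiry forward if necessary.
 */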
void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
{
	unsigned long now = jiffies;
	unsigned long when;

	slt->last = now;

	when = now + slt->period;

	if (tcp_slow_timer.prev) {
		if ((long)(tcp_slow_timer.expires - when) >= 0)
			mod_timer(&tcp_slow_timer, when);
	} else {
		tcp_slow_timer.expires = when;
		add_timer(&tcp_slow_timer);
	}
}