/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.71 2000/01/18 08:24:19 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>

int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries = TCP_ORPHAN_RETRIES;

static void tcp_retransmit_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_probe_timer(unsigned long);
static void tcp_keepalive_timer(unsigned long data);
static void tcp_twkill(unsigned long);

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";

/*
 *	Using different timers for retransmit, delayed acks and probes
 *	We may wish to use just one timer maintaining a list of expire jiffies
 *	to optimize.
 */

void tcp_init_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	spin_lock_init(&sk->timer_lock);

	init_timer(&tp->retransmit_timer);
	tp->retransmit_timer.function = &tcp_retransmit_timer;
	tp->retransmit_timer.data = (unsigned long) sk;

	init_timer(&tp->delack_timer);
	tp->delack_timer.function = &tcp_delack_timer;
	tp->delack_timer.data = (unsigned long) sk;

	init_timer(&tp->probe_timer);
	tp->probe_timer.function = &tcp_probe_timer;
	tp->probe_timer.data = (unsigned long) sk;

	init_timer(&sk->timer);
	sk->timer.function = &tcp_keepalive_timer;
	sk->timer.data = (unsigned long) sk;
}

/*
 *	Reset the retransmission timer
 */

void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	spin_lock_bh(&sk->timer_lock);
	switch (what) {
	case TCP_TIME_RETRANS:
		/* When setting the transmit timer the probe timer
		 * should not be set.
		 * The delayed ack timer can be set if we are changing the
		 * retransmit timer when removing acked frames.
		 */
		if (tp->probe_timer.prev && del_timer(&tp->probe_timer))
			__sock_put(sk);
		if (!tp->retransmit_timer.prev || !del_timer(&tp->retransmit_timer))
			sock_hold(sk);
		if (when > TCP_RTO_MAX) {
			printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk));
			when = TCP_RTO_MAX;
		}
		mod_timer(&tp->retransmit_timer, jiffies+when);
		break;

	case TCP_TIME_DACK:
		if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer))
			sock_hold(sk);
		mod_timer(&tp->delack_timer, jiffies+when);
		break;

	case TCP_TIME_PROBE0:
		if (!tp->probe_timer.prev || !del_timer(&tp->probe_timer))
			sock_hold(sk);
		mod_timer(&tp->probe_timer, jiffies+when);
		break;

	default:
		printk(KERN_DEBUG "bug: unknown timer value\n");
	}
	spin_unlock_bh(&sk->timer_lock);
}

void tcp_clear_xmit_timers(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	spin_lock_bh(&sk->timer_lock);
	if (tp->retransmit_timer.prev && del_timer(&tp->retransmit_timer))
		__sock_put(sk);
	if (tp->delack_timer.prev && del_timer(&tp->delack_timer))
		__sock_put(sk);
	tp->ack.blocked = 0;
	if (tp->probe_timer.prev && del_timer(&tp->probe_timer))
		__sock_put(sk);
	if (sk->timer.prev && del_timer(&sk->timer))
		__sock_put(sk);
	spin_unlock_bh(&sk->timer_lock);
}

static void tcp_write_err(struct sock *sk)
{
	sk->err = sk->err_soft ? : ETIMEDOUT;
	sk->error_report(sk);

	tcp_done(sk);
}

/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int retry_until;

	if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
		if (tp->retransmits)
			dst_negative_advice(&sk->dst_cache);
		retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
	} else {
		if (tp->retransmits >= sysctl_tcp_retries1) {
			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
			   hole detection. :-(

			   This is the place to implement it, but it is not
			   implemented. I do not want to implement it. It is
			   disgusting. It does not work in any case. Let me cite
			   the same draft, which requires us to implement this:

   "The one security concern raised by this memo is that ICMP black holes
   are often caused by over-zealous security administrators who block
   all ICMP messages.  It is vitally important that those who design and
   deploy security systems understand the impact of strict filtering on
   upper-layer protocols.  The safest web site in the world is worthless
   if most TCP implementations cannot transfer data from it.  It would
   be far nicer to have all of the black holes fixed rather than fixing
   all of the TCP implementations."

			   Golden words :-).
			 */

			dst_negative_advice(&sk->dst_cache);
		}
		retry_until = sysctl_tcp_retries2;
		if (sk->dead)
			retry_until = sysctl_tcp_orphan_retries;
	}

	if (tp->retransmits >= retry_until) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}

static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tp->ack.blocked = 1;
		NET_INC_STATS_BH(DelayedACKLocked);
		tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN);
		goto out_unlock;
	}

	if (tp->ack.pending) {
		/* Delayed ACK missed: inflate ATO, leave pingpong mode */
		tp->ack.ato = min(tp->ack.ato<<1, TCP_ATO_MAX);
		tp->ack.pingpong = 0;
		tcp_send_ack(sk);
		NET_INC_STATS_BH(DelayedACKs);
	}
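
	/* For illustration (the figures are only an example): each missed
	 * delayed ACK doubles the ack timeout above, so an ato of 40 ms
	 * grows to 80 ms, then 160 ms, and so on, until the min() clamps
	 * it at TCP_ATO_MAX.
	 */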
	TCP_CHECK_TIMER(sk);

out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

static void tcp_probe_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int max_probes;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, HZ/5);
		goto out_unlock;
	}

	if (sk->state == TCP_CLOSE)
		goto out_unlock;

	if (tp->packets_out || !tp->send_head) {
		tp->probes_out = 0;
		goto out_unlock;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 * this behaviour in Solaris down as a bug fix. [AC]
	 *
	 * Let me explain. probes_out is zeroed by incoming ACKs
	 * even if they advertise a zero window. Hence, the connection is
	 * killed only if we received no ACKs for the normal connection
	 * timeout. It is not killed merely because the window stays zero
	 * for some time; the window may be zero until armageddon and even
	 * later. We are in full accordance with the RFCs, only our probe
	 * timer combines both the retransmission timeout and the probe
	 * timeout in one bottle.				--ANK
	 */
	max_probes = sk->dead ? sysctl_tcp_orphan_retries : sysctl_tcp_retries2;
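
	/* Put differently: an orphaned (sk->dead) socket gets the normally
	 * smaller orphan retry budget, an attached one the usual retries2
	 * budget, and since probes_out is reset by every incoming ACK, a
	 * peer that keeps ACKing our zero window probes keeps the connection
	 * alive indefinitely.
	 */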

	if (tp->probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
		TCP_CHECK_TIMER(sk);
	}
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Kill off TIME_WAIT sockets once their lifetime has expired. */
static int tcp_tw_death_row_slot = 0;
int tcp_tw_count = 0;

static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS];
static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
static struct timer_list tcp_tw_timer = { function: tcp_twkill };

static void tcp_twkill(unsigned long data)
{
	struct tcp_tw_bucket *tw;
	int killed = 0;

	/* NOTE: compare this to the previous version where the lock
	 * was released after detaching the chain. It was racy,
	 * because tw buckets are scheduled in a non-serialized context
	 * in 2.3 (with netfilter), and with softnet this is common, because
	 * soft irqs are not sequenced.
	 */
	spin_lock(&tw_death_lock);

	if (tcp_tw_count == 0)
		goto out;

	while ((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) {
		tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death;
		tw->pprev_death = NULL;
		spin_unlock(&tw_death_lock);

		tcp_timewait_kill(tw);
		tcp_tw_put(tw);

		killed++;

		spin_lock(&tw_death_lock);
	}
	tcp_tw_death_row_slot =
		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));

	if ((tcp_tw_count -= killed) != 0)
		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
	net_statistics[smp_processor_id()*2].TimeWaited += killed;
out:
	spin_unlock(&tw_death_lock);
}

/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */

/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
	spin_lock(&tw_death_lock);
	if (tw->pprev_death) {
		if (tw->next_death)
			tw->next_death->pprev_death = tw->pprev_death;
		*tw->pprev_death = tw->next_death;
		tw->pprev_death = NULL;
		tcp_tw_put(tw);
		if (--tcp_tw_count == 0)
			del_timer(&tcp_tw_timer);
	}
	spin_unlock(&tw_death_lock);
}

/* Short-time timewait calendar */

static int tcp_twcal_hand = -1;
static int tcp_twcal_jiffie;
static void tcp_twcal_tick(unsigned long);
static struct timer_list tcp_twcal_timer = {NULL, NULL, 0, 0, tcp_twcal_tick,};
static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];

void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
{
	struct tcp_tw_bucket **tpp;
	int slot;

	/* timeout := RTO * 3.5
	 *
	 * 3.5 = 1+2+0.5 to wait for two retransmits.
	 *
	 * RATIONALE: if the FIN arrived and we entered TIME-WAIT state,
	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
	 * FINs (or previous segments) are lost, the probability of such an
	 * event is p^(N+1), where p is the probability of losing a single
	 * packet, and the time to detect the loss is about RTO*(2^N - 1)
	 * with exponential backoff. The normal timewait length is calculated
	 * so that we wait at least for one retransmitted FIN (the maximal
	 * RTO is 120 sec).
	 * [ BTW Linux, following BSD, violates this requirement by waiting
	 *   only for 60 sec; we should wait at least for 240 secs.
	 *   Well, 240 consumes too much of resources 8)
	 * ]
	 * This interval is not reduced to catch old duplicates and
	 * responses to our wandering segments living for two MSLs.
	 * However, if we use PAWS to detect
	 * old duplicates, we can reduce the interval to bounds required
	 * by RTO, rather than MSL. So, if the peer understands PAWS, we
	 * kill the tw bucket after 3.5*RTO (it is important that this number
	 * is greater than the TS tick!) and detect old duplicates with the
	 * help of PAWS.
	 */
	slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
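
	/* Worked example of the rounding above: the shift divides timeo by
	 * 2^TCP_TW_RECYCLE_TICK and rounds up, so a timeout of 9 jiffies
	 * with a recycle tick of 4 jiffies (TCP_TW_RECYCLE_TICK == 2, used
	 * here purely as an example value) lands in slot 3.
	 */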

	spin_lock(&tw_death_lock);

	/* Unlink it, if it was scheduled */
	if (tw->pprev_death) {
		if (tw->next_death)
			tw->next_death->pprev_death = tw->pprev_death;
		*tw->pprev_death = tw->next_death;
		tw->pprev_death = NULL;
		tcp_tw_count--;
	} else
		atomic_inc(&tw->refcnt);

	if (slot >= TCP_TW_RECYCLE_SLOTS) {
		/* Schedule to slow timer */
		if (timeo >= TCP_TIMEWAIT_LEN) {
			slot = TCP_TWKILL_SLOTS-1;
		} else {
			slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
			if (slot >= TCP_TWKILL_SLOTS)
				slot = TCP_TWKILL_SLOTS-1;
		}
		tw->ttd = jiffies + timeo;
		slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
		tpp = &tcp_tw_death_row[slot];
	} else {
		tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK);

		if (tcp_twcal_hand < 0) {
			tcp_twcal_hand = 0;
			tcp_twcal_jiffie = jiffies;
			tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
			add_timer(&tcp_twcal_timer);
		} else {
			if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK))
				mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
			slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
		}
		tpp = &tcp_twcal_row[slot];
	}

	if ((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	if (tcp_tw_count++ == 0)
		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
	spin_unlock(&tw_death_lock);
}

void tcp_twcal_tick(unsigned long dummy)
{
	int n, slot;
	unsigned long j;
	unsigned long now = jiffies;
	int killed = 0;
	int adv = 0;

	spin_lock(&tw_death_lock);
	if (tcp_twcal_hand < 0)
		goto out;

	slot = tcp_twcal_hand;
	j = tcp_twcal_jiffie;

	for (n = 0; n < TCP_TW_RECYCLE_SLOTS; n++) {
		if ((long)(j - now) <= 0) {
			struct tcp_tw_bucket *tw;

			while ((tw = tcp_twcal_row[slot]) != NULL) {
				tcp_twcal_row[slot] = tw->next_death;
				tw->pprev_death = NULL;

				tcp_timewait_kill(tw);
				tcp_tw_put(tw);
				killed++;
			}
		} else {
			if (!adv) {
				adv = 1;
				tcp_twcal_jiffie = j;
				tcp_twcal_hand = slot;
			}

			if (tcp_twcal_row[slot] != NULL) {
				mod_timer(&tcp_twcal_timer, j);
				goto out;
			}
		}
		j += (1<<TCP_TW_RECYCLE_TICK);
		slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
	}
	tcp_twcal_hand = -1;

out:
	if ((tcp_tw_count -= killed) == 0)
		del_timer(&tcp_tw_timer);
	net_statistics[smp_processor_id()*2].TimeWaitKilled += killed;
	spin_unlock(&tw_death_lock);
}

/*
 *	The TCP retransmit timer.
 */

static void tcp_retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later */
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, HZ/20);
		goto out_unlock;
	}

	if (sk->state == TCP_CLOSE || tp->packets_out == 0)
		goto out_unlock;

	BUG_TRAP(!skb_queue_empty(&sk->write_queue));

	if (tcp_write_timeout(sk))
		goto out_unlock;

	/* RFC 2018: clear all 'sacked' flags in the retransmission queue;
	 * the receiver may have dropped out-of-order frames and we must
	 * send them out should this timer fire on us.
	 */
	if (tp->sack_ok) {
		struct sk_buff *skb = skb_peek(&sk->write_queue);

		while ((skb != NULL) &&
		       (skb != tp->send_head) &&
		       (skb != (struct sk_buff *)&sk->write_queue)) {
			TCP_SKB_CB(skb)->sacked &=
				~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS);
			skb = skb->next;
		}
	}

	/* Retransmission. */
	tp->retrans_head = NULL;
	tp->rexmt_done = 0;
	tp->fackets_out = 0;
	tp->retrans_out = 0;
	if (tp->retransmits == 0) {
		/* Remember window where we lost:
		 * "one half of the current window but at least 2 segments"
		 *
		 * Here "current window" means the effective one, which
		 * means it must be an accurate representation of our current
		 * sending rate _and_ the snd_wnd.
		 */
		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
		tp->snd_cwnd_cnt = 0;
		tp->snd_cwnd = 1;
	}
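
	/* For instance (figures purely illustrative): if snd_cwnd was 10
	 * segments when the timer fired, tcp_recalc_ssthresh() records a
	 * threshold of about 5 segments ("one half of the current window
	 * but at least 2 segments"), and the congestion window above
	 * collapses to a single segment for the new slow start.
	 */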

	tp->dup_acks = 0;
	tp->high_seq = tp->snd_nxt;
	if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
		/* Retransmission failed because of local congestion,
		 * do not backoff.
		 */
		if (!tp->retransmits)
			tp->retransmits = 1;
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
		TCP_CHECK_TIMER(sk);
		goto out_unlock;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	tp->backoff++;
	tp->retransmits++;
	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
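	/* E.g. an rto of 3 seconds backs off to 6, 12, 24, 48 and 96 seconds
	 * on successive expiries, and then stays pinned at TCP_RTO_MAX (the
	 * 120 second clamp discussed above).
	 */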
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	TCP_CHECK_TIMER(sk);

out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 *	Timer for listening sockets
 */

static void tcp_synack_timer(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct open_request **reqp, *req;
	int i, budget;

	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to an established socket) within the first timeout.
	 * If the synack was not acknowledged for 3 seconds, it means
	 * one of the following things: the synack was lost, the ack was lost,
	 * rtt is high or nobody planned to ack (i.e. synflood).
	 * When the server is a bit loaded, the queue is populated with old
	 * open requests, reducing the effective size of the queue.
	 * When the server is well loaded, the queue size reduces to zero
	 * after several minutes of work. This is not a synflood,
	 * it is normal operation. The solution is pruning
	 * too old entries, overriding the normal timeout, when
	 * the situation becomes dangerous.
	 *
	 * Essentially, we reserve half of the room for young
	 * embryos; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
		int young = (lopt->qlen_young<<1);

		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}
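
	/* Rough illustration: suppose max_retries is 5 and the table is more
	 * than half full with only an eighth of the entries still young.
	 * young starts at a quarter of qlen and doubles each pass, so the
	 * loop above runs three times and thresh drops from 5 to 2: old,
	 * repeatedly retransmitted requests are then expired much sooner.
	 */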

	if (tp->defer_accept)
		max_retries = tp->defer_accept;

	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
	i = lopt->clock_hand;

	do {
		reqp = &lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if ((long)(now - req->expires) >= 0) {
				if ((req->retrans < thresh ||
				     (req->acked && req->retrans < max_retries))
				    && !req->class->rtx_syn_ack(sk, req, NULL)) {
					unsigned long timeo;

					if (req->retrans++ == 0)
						lopt->qlen_young--;
					timeo = min((TCP_TIMEOUT_INIT << req->retrans),
						    TCP_RTO_MAX);
					req->expires = now + timeo;
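					/* With the usual 3 second initial
					 * timeout (see the comment above),
					 * the wait before the next retry
					 * grows to roughly 6, 12, 24, ...
					 * seconds, never exceeding
					 * TCP_RTO_MAX.
					 */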
					reqp = &req->dl_next;
					continue;
				}

				/* Drop this request */
				write_lock(&tp->syn_wait_lock);
				*reqp = req->dl_next;
				write_unlock(&tp->syn_wait_lock);
				lopt->qlen--;
				if (req->retrans == 0)
					lopt->qlen_young--;
				tcp_openreq_free(req);
				/* *reqp already points at the next request;
				 * req itself is gone, so do not touch it.
				 */
				continue;
			}
			reqp = &req->dl_next;
		}

		i = (i+1)&(TCP_SYNQ_HSIZE-1);

	} while (--budget > 0);

	lopt->clock_hand = i;

	if (lopt->qlen)
		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}

void tcp_delete_keepalive_timer(struct sock *sk)
{
	spin_lock_bh(&sk->timer_lock);
	if (sk->timer.prev && del_timer(&sk->timer))
		__sock_put(sk);
	spin_unlock_bh(&sk->timer_lock);
}

void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	spin_lock_bh(&sk->timer_lock);
	if (!sk->timer.prev || !del_timer(&sk->timer))
		sock_hold(sk);
	mod_timer(&sk->timer, jiffies+len);
	spin_unlock_bh(&sk->timer_lock);
}

void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))
		return;

	if (val && !sk->keepopen)
		tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
	else if (!val)
		tcp_delete_keepalive_timer(sk);
}

static void tcp_keepalive_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* Try again later. */
		tcp_reset_keepalive_timer(sk, HZ/20);
		goto out;
	}

	if (sk->state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}

	if (sk->state == TCP_FIN_WAIT2 && sk->dead) {
		if (tp->linger2 >= 0) {
			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}

	if (!sk->keepopen || sk->state == TCP_CLOSE)
		goto out;

	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || tp->send_head)
		goto resched;

	elapsed = tcp_time_stamp - tp->rcv_tstamp;

	if (elapsed >= keepalive_time_when(tp)) {
		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
		    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		if (tcp_write_wakeup(sk) <= 0) {
			tp->probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
		elapsed = keepalive_time_when(tp) - elapsed;
	}
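
	/* So with the usual two hour keepalive time: if only half of it has
	 * passed since the last ACK, the timer is simply re-armed for the
	 * remaining hour; once the full period has elapsed, probes go out
	 * every keepalive interval until probes_out exceeds the limit
	 * checked above.
	 */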

	TCP_CHECK_TIMER(sk);

resched:
	tcp_reset_keepalive_timer(sk, elapsed);
	goto out;

death:
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}