net/ipv4/tcp_timer.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  */
  22
  23 #include <linux/module.h>
  24 #include <net/tcp.h>
  25
  26 int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
  27 int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
  28 int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME;
  29 int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES;
  30 int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
  31 int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
  32 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
  33 int sysctl_tcp_orphan_retries __read_mostly;
  34
  35 static void tcp_write_timer(unsigned long);
  36 static void tcp_delack_timer(unsigned long);
  37 static void tcp_keepalive_timer (unsigned long data);
  38
  39 void tcp_init_xmit_timers(struct sock *sk)
  40 {
  41         inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
  42                                   &tcp_keepalive_timer);
  43 }
  44
  45 EXPORT_SYMBOL(tcp_init_xmit_timers);
  46
  47 static void tcp_write_err(struct sock *sk)
  48 {
  49         sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
  50         sk->sk_error_report(sk);
  51
  52         tcp_done(sk);
  53         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
  54 }
  55
  56 /* Do not allow orphaned sockets to eat all our resources.
  57  * This is direct violation of TCP specs, but it is required
  58  * to prevent DoS attacks. It is called when a retransmission timeout
  59  * or zero probe timeout occurs on orphaned socket.
  60  *
  61  * Criteria is still not confirmed experimentally and may change.
  62  * We kill the socket, if:
  63  * 1. If number of orphaned sockets exceeds an administratively configured
  64  *    limit.
  65  * 2. If we have strong memory pressure.
  66  */
  67 static int tcp_out_of_resources(struct sock *sk, int do_reset)
  68 {
  69         struct tcp_sock *tp = tcp_sk(sk);
  70         int orphans = atomic_read(&tcp_orphan_count);
  71
  72         /* If peer does not open window for long time, or did not transmit
  73          * anything for long time, penalize it. */
  74         if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
  75                 orphans <<= 1;
  76
  77         /* If some dubious ICMP arrived, penalize even more. */
  78         if (sk->sk_err_soft)
  79                 orphans <<= 1;
  80
  81         if (orphans >= sysctl_tcp_max_orphans ||
  82             (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
  83              atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
  84                 if (net_ratelimit())
  85                         printk(KERN_INFO "Out of socket memory\n");
  86
  87                 /* Catch exceptional cases, when connection requires reset.
  88                  *      1. Last segment was sent recently. */
  89                 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
  90                     /*  2. Window is closed. */
  91                     (!tp->snd_wnd && !tp->packets_out))
  92                         do_reset = 1;
  93                 if (do_reset)
  94                         tcp_send_active_reset(sk, GFP_ATOMIC);
  95                 tcp_done(sk);
  96                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
  97                 return 1;
  98         }
  99         return 0;
 100 }
 101
 102 /* Calculate maximal number or retries on an orphaned socket. */
 103 static int tcp_orphan_retries(struct sock *sk, int alive)
 104 {
 105         int retries = sysctl_tcp_orphan_retries; /* May be zero. */
 106
 107         /* We know from an ICMP that something is wrong. */
 108         if (sk->sk_err_soft && !alive)
 109                 retries = 0;
 110
 111         /* However, if socket sent something recently, select some safe
 112          * number of retries. 8 corresponds to >100 seconds with minimal
 113          * RTO of 200msec. */
 114         if (retries == 0 && alive)
 115                 retries = 8;
 116         return retries;
 117 }
 118
 119 /* A write timeout has occurred. Process the after effects. */
 120 static int tcp_write_timeout(struct sock *sk)
 121 {
 122         struct inet_connection_sock *icsk = inet_csk(sk);
 123         struct tcp_sock *tp = tcp_sk(sk);
 124         int retry_until;
 125         int mss;
 126
 127         if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 128                 if (icsk->icsk_retransmits)
 129                         dst_negative_advice(&sk->sk_dst_cache);
 130                 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 131         } else {
 132                 if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
 133                         /* Black hole detection */
 134                         if (sysctl_tcp_mtu_probing) {
 135                                 if (!icsk->icsk_mtup.enabled) {
 136                                         icsk->icsk_mtup.enabled = 1;
 137                                         tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 138                                 } else {
 139                                         mss = min(sysctl_tcp_base_mss,
 140                                                   tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
 141                                         mss = max(mss, 68 - tp->tcp_header_len);
 142                                         icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
 143                                         tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 144                                 }
 145                         }
 146
 147                         dst_negative_advice(&sk->sk_dst_cache);
 148                 }
 149
 150                 retry_until = sysctl_tcp_retries2;
 151                 if (sock_flag(sk, SOCK_DEAD)) {
 152                         const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
 153
 154                         retry_until = tcp_orphan_retries(sk, alive);
 155
 156                         if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
 157                                 return 1;
 158                 }
 159         }
 160
 161         if (icsk->icsk_retransmits >= retry_until) {
 162                 /* Has it gone just too far? */
 163                 tcp_write_err(sk);
 164                 return 1;
 165         }
 166         return 0;
 167 }
 168
 169 static void tcp_delack_timer(unsigned long data)
 170 {
 171         struct sock *sk = (struct sock*)data;
 172         struct tcp_sock *tp = tcp_sk(sk);
 173         struct inet_connection_sock *icsk = inet_csk(sk);
 174
 175         bh_lock_sock(sk);
 176         if (sock_owned_by_user(sk)) {
 177                 /* Try again later. */
 178                 icsk->icsk_ack.blocked = 1;
 179                 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
 180                 sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
 181                 goto out_unlock;
 182         }
 183
 184         sk_stream_mem_reclaim(sk);
 185
 186         if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
 187                 goto out;
 188
 189         if (time_after(icsk->icsk_ack.timeout, jiffies)) {
 190                 sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
 191                 goto out;
 192         }
 193         icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 194
 195         if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 196                 struct sk_buff *skb;
 197
 198                 NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
 199
 200                 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
 201                         sk->sk_backlog_rcv(sk, skb);
 202
 203                 tp->ucopy.memory = 0;
 204         }
 205
 206         if (inet_csk_ack_scheduled(sk)) {
 207                 if (!icsk->icsk_ack.pingpong) {
 208                         /* Delayed ACK missed: inflate ATO. */
 209                         icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
 210                 } else {
 211                         /* Delayed ACK missed: leave pingpong mode and
 212                          * deflate ATO.
 213                          */
 214                         icsk->icsk_ack.pingpong = 0;
 215                         icsk->icsk_ack.ato      = TCP_ATO_MIN;
 216                 }
 217                 tcp_send_ack(sk);
 218                 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
 219         }
 220         TCP_CHECK_TIMER(sk);
 221
 222 out:
 223         if (tcp_memory_pressure)
 224                 sk_stream_mem_reclaim(sk);
 225 out_unlock:
 226         bh_unlock_sock(sk);
 227         sock_put(sk);
 228 }
 229
 230 static void tcp_probe_timer(struct sock *sk)
 231 {
 232         struct inet_connection_sock *icsk = inet_csk(sk);
 233         struct tcp_sock *tp = tcp_sk(sk);
 234         int max_probes;
 235
 236         if (tp->packets_out || !tcp_send_head(sk)) {
 237                 icsk->icsk_probes_out = 0;
 238                 return;
 239         }
 240
 241         /* *WARNING* RFC 1122 forbids this
 242          *
 243          * It doesn't AFAIK, because we kill the retransmit timer -AK
 244          *
 245          * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
 246          * this behaviour in Solaris down as a bug fix. [AC]
 247          *
 248          * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
 249          * even if they advertise zero window. Hence, connection is killed only
 250          * if we received no ACKs for normal connection timeout. It is not killed
 251          * only because window stays zero for some time, window may be zero
 252          * until armageddon and even later. We are in full accordance
 253          * with RFCs, only probe timer combines both retransmission timeout
 254          * and probe timeout in one bottle.                             --ANK
 255          */
 256         max_probes = sysctl_tcp_retries2;
 257
 258         if (sock_flag(sk, SOCK_DEAD)) {
 259                 const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
 260
 261                 max_probes = tcp_orphan_retries(sk, alive);
 262
 263                 if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
 264                         return;
 265         }
 266
 267         if (icsk->icsk_probes_out > max_probes) {
 268                 tcp_write_err(sk);
 269         } else {
 270                 /* Only send another probe if we didn't close things up. */
 271                 tcp_send_probe0(sk);
 272         }
 273 }
 274
 275 /*
 276  *      The TCP retransmit timer.
 277  */
 278
 279 static void tcp_retransmit_timer(struct sock *sk)
 280 {
 281         struct tcp_sock *tp = tcp_sk(sk);
 282         struct inet_connection_sock *icsk = inet_csk(sk);
 283
 284         if (!tp->packets_out)
 285                 goto out;
 286
 287         BUG_TRAP(!tcp_write_queue_empty(sk));
 288
 289         if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
 290             !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
 291                 /* Receiver dastardly shrinks window. Our retransmits
 292                  * become zero probes, but we should not timeout this
 293                  * connection. If the socket is an orphan, time it out,
 294                  * we cannot allow such beasts to hang infinitely.
 295                  */
 296 #ifdef TCP_DEBUG
 297                 if (net_ratelimit()) {
 298                         struct inet_sock *inet = inet_sk(sk);
 299                         printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
 300                                NIPQUAD(inet->daddr), ntohs(inet->dport),
 301                                inet->num, tp->snd_una, tp->snd_nxt);
 302                 }
 303 #endif
 304                 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
 305                         tcp_write_err(sk);
 306                         goto out;
 307                 }
 308                 tcp_enter_loss(sk, 0);
 309                 tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
 310                 __sk_dst_reset(sk);
 311                 goto out_reset_timer;
 312         }
 313
 314         if (tcp_write_timeout(sk))
 315                 goto out;
 316
 317         if (icsk->icsk_retransmits == 0) {
 318                 if (icsk->icsk_ca_state == TCP_CA_Disorder ||
 319                     icsk->icsk_ca_state == TCP_CA_Recovery) {
 320                         if (tp->rx_opt.sack_ok) {
 321                                 if (icsk->icsk_ca_state == TCP_CA_Recovery)
 322                                         NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
 323                                 else
 324                                         NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
 325                         } else {
 326                                 if (icsk->icsk_ca_state == TCP_CA_Recovery)
 327                                         NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
 328                                 else
 329                                         NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
 330                         }
 331                 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
 332                         NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
 333                 } else {
 334                         NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
 335                 }
 336         }
 337
 338         if (tcp_use_frto(sk)) {
 339                 tcp_enter_frto(sk);
 340         } else {
 341                 tcp_enter_loss(sk, 0);
 342         }
 343
 344         if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
 345                 /* Retransmission failed because of local congestion,
 346                  * do not backoff.
 347                  */
 348                 if (!icsk->icsk_retransmits)
 349                         icsk->icsk_retransmits = 1;
 350                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 351                                           min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
 352                                           TCP_RTO_MAX);
 353                 goto out;
 354         }
 355
 356         /* Increase the timeout each time we retransmit.  Note that
 357          * we do not increase the rtt estimate.  rto is initialized
 358          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 359          * that doubling rto each time is the least we can get away with.
 360          * In KA9Q, Karn uses this for the first few times, and then
 361          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 362          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 363          * defined in the protocol as the maximum possible RTT.  I guess
 364          * we'll have to use something other than TCP to talk to the
 365          * University of Mars.
 366          *
 367          * PAWS allows us longer timeouts and large windows, so once
 368          * implemented ftp to mars will work nicely. We will have to fix
 369          * the 120 second clamps though!
 370          */
 371         icsk->icsk_backoff++;
 372         icsk->icsk_retransmits++;
 373
 374 out_reset_timer:
 375         icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 376         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
 377         if (icsk->icsk_retransmits > sysctl_tcp_retries1)
 378                 __sk_dst_reset(sk);
 379
 380 out:;
 381 }
 382
 383 static void tcp_write_timer(unsigned long data)
 384 {
 385         struct sock *sk = (struct sock*)data;
 386         struct inet_connection_sock *icsk = inet_csk(sk);
 387         int event;
 388
 389         bh_lock_sock(sk);
 390         if (sock_owned_by_user(sk)) {
 391                 /* Try again later */
 392                 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
 393                 goto out_unlock;
 394         }
 395
 396         if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
 397                 goto out;
 398
 399         if (time_after(icsk->icsk_timeout, jiffies)) {
 400                 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
 401                 goto out;
 402         }
 403
 404         event = icsk->icsk_pending;
 405         icsk->icsk_pending = 0;
 406
 407         switch (event) {
 408         case ICSK_TIME_RETRANS:
 409                 tcp_retransmit_timer(sk);
 410                 break;
 411         case ICSK_TIME_PROBE0:
 412                 tcp_probe_timer(sk);
 413                 break;
 414         }
 415         TCP_CHECK_TIMER(sk);
 416
 417 out:
 418         sk_stream_mem_reclaim(sk);
 419 out_unlock:
 420         bh_unlock_sock(sk);
 421         sock_put(sk);
 422 }
 423
 424 /*
 425  *      Timer for listening sockets
 426  */
 427
 428 static void tcp_synack_timer(struct sock *sk)
 429 {
 430         inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
 431                                    TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 432 }
 433
 434 void tcp_set_keepalive(struct sock *sk, int val)
 435 {
 436         if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
 437                 return;
 438
 439         if (val && !sock_flag(sk, SOCK_KEEPOPEN))
 440                 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
 441         else if (!val)
 442                 inet_csk_delete_keepalive_timer(sk);
 443 }
 444
 445
 446 static void tcp_keepalive_timer (unsigned long data)
 447 {
 448         struct sock *sk = (struct sock *) data;
 449         struct inet_connection_sock *icsk = inet_csk(sk);
 450         struct tcp_sock *tp = tcp_sk(sk);
 451         __u32 elapsed;
 452
 453         /* Only process if socket is not in use. */
 454         bh_lock_sock(sk);
 455         if (sock_owned_by_user(sk)) {
 456                 /* Try again later. */
 457                 inet_csk_reset_keepalive_timer (sk, HZ/20);
 458                 goto out;
 459         }
 460
 461         if (sk->sk_state == TCP_LISTEN) {
 462                 tcp_synack_timer(sk);
 463                 goto out;
 464         }
 465
 466         if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 467                 if (tp->linger2 >= 0) {
 468                         const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
 469
 470                         if (tmo > 0) {
 471                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
 472                                 goto out;
 473                         }
 474                 }
 475                 tcp_send_active_reset(sk, GFP_ATOMIC);
 476                 goto death;
 477         }
 478
 479         if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
 480                 goto out;
 481
 482         elapsed = keepalive_time_when(tp);
 483
 484         /* It is alive without keepalive 8) */
 485         if (tp->packets_out || tcp_send_head(sk))
 486                 goto resched;
 487
 488         elapsed = tcp_time_stamp - tp->rcv_tstamp;
 489
 490         if (elapsed >= keepalive_time_when(tp)) {
 491                 if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
 492                      (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
 493                         tcp_send_active_reset(sk, GFP_ATOMIC);
 494                         tcp_write_err(sk);
 495                         goto out;
 496                 }
 497                 if (tcp_write_wakeup(sk) <= 0) {
 498                         icsk->icsk_probes_out++;
 499                         elapsed = keepalive_intvl_when(tp);
 500                 } else {
 501                         /* If keepalive was lost due to local congestion,
 502                          * try harder.
 503                          */
 504                         elapsed = TCP_RESOURCE_PROBE_INTERVAL;
 505                 }
 506         } else {
 507                 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
 508                 elapsed = keepalive_time_when(tp) - elapsed;
 509         }
 510
 511         TCP_CHECK_TIMER(sk);
 512         sk_stream_mem_reclaim(sk);
 513
 514 resched:
 515         inet_csk_reset_keepalive_timer (sk, elapsed);
 516         goto out;
 517
 518 death:
 519         tcp_done(sk);
 520
 521 out:
 522         bh_unlock_sock(sk);
 523         sock_put(sk);
 524 }