[ICSK]: Move generalised functions from tcp to inet_connection_sock
net/ipv4/tcp_timer.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
#include <linux/module.h>
#include <net/tcp.h>
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries;
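/* Editorial note: these variables back the tcp_syn_retries, tcp_synack_retries,
 * tcp_keepalive_time, tcp_keepalive_probes, tcp_keepalive_intvl, tcp_retries1,
 * tcp_retries2 and tcp_orphan_retries knobs exposed under /proc/sys/net/ipv4/
 * (registered in net/ipv4/sysctl_net_ipv4.c).
 */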
static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer(unsigned long data);
void tcp_init_xmit_timers(struct sock *sk)
{
	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
				  &tcp_keepalive_timer);
}

EXPORT_SYMBOL(tcp_init_xmit_timers);
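/* Editorial note: inet_csk_init_xmit_timers() attaches these three callbacks
 * to icsk->icsk_retransmit_timer, icsk->icsk_delack_timer and sk->sk_timer
 * respectively, so the retransmit/zero-window-probe, delayed-ACK and
 * keepalive machinery below is driven by three per-socket timers.
 */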
static void tcp_write_err(struct sock *sk)
{
	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
	sk->sk_error_report(sk);

	tcp_done(sk);
	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
}
/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero probe timeout occurs on an orphaned socket.
 *
 * The criterion is still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. The number of orphaned sockets exceeds an administratively configured
 *    limit.
 * 2. We are under strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int orphans = atomic_read(&tcp_orphan_count);

	/* If the peer does not open its window for a long time, or did not
	 * transmit anything for a long time, penalize it. */
	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
		orphans <<= 1;

	/* If some dubious ICMP arrived, penalize even more. */
	if (sk->sk_err_soft)
		orphans <<= 1;

	if (orphans >= sysctl_tcp_max_orphans ||
	    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
	     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
		if (net_ratelimit())
			printk(KERN_INFO "Out of socket memory\n");

		/* Catch exceptional cases, when the connection requires reset.
		 *      1. Last segment was sent recently. */
		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
		    /*  2. Window is closed. */
		    (!tp->snd_wnd && !tp->packets_out))
			do_reset = 1;
		if (do_reset)
			tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
		return 1;
	}
	return 0;
}
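/* Worked example (editorial, assuming sysctl_tcp_max_orphans = 64): with 20
 * orphans system-wide, a socket idle for more than 2*TCP_RTO_MAX counts as
 * 40 (orphans <<= 1), and a pending soft error doubles that again to 80, so
 * the "too many orphans" branch fires well below the configured limit for
 * sockets that look abandoned.
 */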
/* Calculate the maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
	int retries = sysctl_tcp_orphan_retries; /* May be zero. */

	/* We know from an ICMP that something is wrong. */
	if (sk->sk_err_soft && !alive)
		retries = 0;

	/* However, if the socket sent something recently, select some safe
	 * number of retries. 8 corresponds to >100 seconds with a minimal
	 * RTO of 200 msec. */
	if (retries == 0 && alive)
		retries = 8;
	return retries;
}
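/* Worked equation (editorial): with exponential backoff from a 200 msec RTO,
 * the timer that declares the connection dead after 8 retransmissions expires
 * at roughly
 *
 *	sum_{k=0}^{8} 0.2 * 2^k = 0.2 * (2^9 - 1) ~= 102 seconds,
 *
 * which is where the ">100 seconds" figure in the comment above comes from.
 */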
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int retry_until;

	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		if (icsk->icsk_retransmits)
			dst_negative_advice(&sk->sk_dst_cache);
		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
	} else {
		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires PMTU black
			   hole detection. :-(

			   This is the place to implement it. It is not implemented.
			   I do not want to implement it. It is disgusting. It does
			   not work in any case. Let me cite the same draft, which
			   requires us to implement this:

			   "The one security concern raised by this memo is that ICMP black holes
			   are often caused by over-zealous security administrators who block
			   all ICMP messages.  It is vitally important that those who design and
			   deploy security systems understand the impact of strict filtering on
			   upper-layer protocols.  The safest web site in the world is worthless
			   if most TCP implementations cannot transfer data from it.  It would
			   be far nicer to have all of the black holes fixed rather than fixing
			   all of the TCP implementations."

			   Golden words :-).
			 */

			dst_negative_advice(&sk->sk_dst_cache);
		}

		retry_until = sysctl_tcp_retries2;
		if (sock_flag(sk, SOCK_DEAD)) {
			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);

			retry_until = tcp_orphan_retries(sk, alive);

			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
				return 1;
		}
	}

	if (icsk->icsk_retransmits >= retry_until) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}
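/* Editorial note: with the default sysctl_tcp_retries2 = 15 and the RTO
 * exponential backoff clamped at TCP_RTO_MAX (120 seconds), an established
 * connection gives up after roughly 13 to 30 minutes depending on the
 * initial RTO; SYN_SENT/SYN_RECV sockets use the much shorter
 * tcp_syn_retries/tcp_synack_retries budgets instead.
 */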
static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		icsk->icsk_ack.blocked = 1;
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
		goto out_unlock;
	}

	sk_stream_mem_reclaim(sk);

	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
		goto out;

	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
		goto out;
	}
	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;

	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
		struct sk_buff *skb;

		NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);

		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk->sk_backlog_rcv(sk, skb);

		tp->ucopy.memory = 0;
	}

	if (inet_csk_ack_scheduled(sk)) {
		if (!icsk->icsk_ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			icsk->icsk_ack.pingpong = 0;
			icsk->icsk_ack.ato = TCP_ATO_MIN;
		}
		tcp_send_ack(sk);
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
	}
	TCP_CHECK_TIMER(sk);

out:
	if (tcp_memory_pressure)
		sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}
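/* Editorial note on the ATO adjustment above: in one-sided traffic a missed
 * delayed ACK doubles the ACK timeout, TCP_ATO_MIN -> 2x -> 4x ..., capped at
 * icsk_rto; a miss while in pingpong (interactive) mode instead drops the
 * socket out of pingpong and resets ato to TCP_ATO_MIN.
 */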
static void tcp_probe_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int max_probes;

	if (tp->packets_out || !sk->sk_send_head) {
		tp->probes_out = 0;
		return;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 * this behaviour in Solaris down as a bug fix. [AC]
	 *
	 * Let me explain. probes_out is zeroed by incoming ACKs
	 * even if they advertise a zero window. Hence, the connection is killed
	 * only if we received no ACKs for the normal connection timeout. It is
	 * not killed merely because the window stays zero for some time; the
	 * window may be zero until armageddon and even later. We are in full
	 * accordance with the RFCs, except that the probe timer combines both
	 * retransmission timeout and probe timeout in one bottle.	--ANK
	 */
	max_probes = sysctl_tcp_retries2;

	if (sock_flag(sk, SOCK_DEAD)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);

		max_probes = tcp_orphan_retries(sk, alive);

		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
			return;
	}

	if (tp->probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}
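/* Editorial note: tcp_send_probe0() transmits a zero-window probe segment;
 * any ACK coming back, even one still advertising a zero window, clears
 * tp->probes_out, so only a peer that stops ACKing altogether can exhaust
 * max_probes and trigger tcp_write_err().
 */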
/*
 *	The TCP retransmit timer.
 */

static void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (!tp->packets_out)
		goto out;

	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));

	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
		/* The receiver dastardly shrinks the window. Our retransmits
		 * become zero probes, but we should not time out this
		 * connection. If the socket is an orphan, time it out;
		 * we cannot allow such beasts to hang infinitely.
		 */
#ifdef TCP_DEBUG
		if (net_ratelimit()) {
			struct inet_sock *inet = inet_sk(sk);
			printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
			       NIPQUAD(inet->daddr), ntohs(inet->dport),
			       inet->num, tp->snd_una, tp->snd_nxt);
		}
#endif
		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
			tcp_write_err(sk);
			goto out;
		}
		tcp_enter_loss(sk, 0);
		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
		__sk_dst_reset(sk);
		goto out_reset_timer;
	}

	if (tcp_write_timeout(sk))
		goto out;

	if (icsk->icsk_retransmits == 0) {
		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
			if (tp->rx_opt.sack_ok) {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
			} else {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
			}
		} else if (tp->ca_state == TCP_CA_Loss) {
			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
		} else {
			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
		}
	}

	if (tcp_use_frto(sk)) {
		tcp_enter_frto(sk);
	} else {
		tcp_enter_loss(sk, 0);
	}

	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
		/* Retransmission failed because of local congestion,
		 * do not back off.
		 */
		if (!icsk->icsk_retransmits)
			icsk->icsk_retransmits = 1;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
					  TCP_RTO_MAX);
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  NetBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	icsk->icsk_backoff++;
	icsk->icsk_retransmits++;

out_reset_timer:
	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
		__sk_dst_reset(sk);

out:;
}
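/* Worked example (editorial): starting from an RTO of 0.5 s, the doubling at
 * out_reset_timer yields 1, 2, 4, 8, ... seconds and pins at TCP_RTO_MAX
 * (120 s). The local-congestion branch instead re-arms at
 * min(icsk_rto, TCP_RESOURCE_PROBE_INTERVAL) without bumping icsk_backoff,
 * so transient local allocation failures do not inflate the backoff state.
 */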
static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct inet_connection_sock *icsk = inet_csk(sk);
	int event;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later */
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
		goto out_unlock;
	}

	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
		goto out;

	if (time_after(icsk->icsk_timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
		goto out;
	}

	event = icsk->icsk_pending;
	icsk->icsk_pending = 0;

	switch (event) {
	case ICSK_TIME_RETRANS:
		tcp_retransmit_timer(sk);
		break;
	case ICSK_TIME_PROBE0:
		tcp_probe_timer(sk);
		break;
	}
	TCP_CHECK_TIMER(sk);

out:
	sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}
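/* Editorial note: retransmission and zero-window probing share a single
 * timer; icsk->icsk_pending records which of ICSK_TIME_RETRANS or
 * ICSK_TIME_PROBE0 is armed, and the switch above demultiplexes the expiry
 * accordingly.
 */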
/*
 *	Timer for listening sockets
 */

static void tcp_synack_timer(struct sock *sk)
{
	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}
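/* Editorial note: inet_csk_reqsk_queue_prune() walks the SYN queue every
 * TCP_SYNQ_INTERVAL, retransmitting SYN-ACKs with timeouts growing from
 * TCP_TIMEOUT_INIT up to TCP_RTO_MAX and dropping request sockets that
 * exhaust their retries; it is one of the generalised helpers this patch
 * moved from TCP into inet_connection_sock.
 */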
void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
		return;

	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
	else if (!val)
		inet_csk_delete_keepalive_timer(sk);
}
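/* Usage sketch (editorial, userspace side): this path is reached via
 * sock_setsockopt() when an application toggles keepalive on a TCP socket:
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *
 * The timer is armed only for states other than CLOSE/LISTEN, and deleted
 * again when the option is cleared.
 */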
static void tcp_keepalive_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		inet_csk_reset_keepalive_timer(sk, HZ/20);
		goto out;
	}

	if (sk->sk_state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}

	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
		if (tp->linger2 >= 0) {
			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}

	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
		goto out;

	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || sk->sk_send_head)
		goto resched;

	elapsed = tcp_time_stamp - tp->rcv_tstamp;

	if (elapsed >= keepalive_time_when(tp)) {
		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
		    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		if (tcp_write_wakeup(sk) <= 0) {
			tp->probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
		elapsed = keepalive_time_when(tp) - elapsed;
	}

	TCP_CHECK_TIMER(sk);
	sk_stream_mem_reclaim(sk);

resched:
	inet_csk_reset_keepalive_timer(sk, elapsed);
	goto out;

death:
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
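/* Worked example (editorial, assuming the default sysctls: 7200 s idle time,
 * 75 s probe interval, 9 probes): an idle connection whose peer has vanished
 * sends its first keepalive probe two hours after the last received segment
 * and is reset roughly 9 * 75 s ~= 11 minutes later, once probes_out exceeds
 * the probe budget.
 */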