/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

int sysctl_tcp_tw_recycle;
int sysctl_tcp_max_tw_buckets = NR_FILE*2;

int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_abort_on_overflow;
static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
        if (seq == s_win)
                return 1;
        if (after(end_seq, s_win) && before(seq, e_win))
                return 1;
        return (seq == e_win && seq == end_seq);
}
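/* Illustrative sketch, not part of the original file: tcp_in_window()
 * above asks whether any part of [seq, end_seq] overlaps the receive
 * window [s_win, e_win), using the wrap-safe before()/after() helpers
 * instead of plain comparisons.  The hypothetical wrapper below (the
 * name and parameters are the editor's, not a kernel API) shows the
 * call shape used later in this file, where the window is
 * [rcv_nxt, rcv_nxt + rcv_wnd).
 */
static inline int example_segment_acceptable(u32 seq, u32 end_seq,
                                             u32 rcv_nxt, u32 rcv_wnd)
{
        /* All arithmetic is modulo 2^32, which is why '<' would break
         * near sequence-number wrap.
         */
        return tcp_in_window(seq, end_seq, rcv_nxt, rcv_nxt + rcv_wnd);
}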
/* New-style handling of TIME_WAIT sockets. */

int tcp_tw_count;
/* Must be called with locally disabled BHs. */
static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
{
        struct tcp_ehash_bucket *ehead;
        struct tcp_bind_hashbucket *bhead;
        struct tcp_bind_bucket *tb;

        /* Unlink from established hashes. */
        ehead = &tcp_ehash[tw->tw_hashent];
        write_lock(&ehead->lock);
        if (hlist_unhashed(&tw->tw_node)) {
                write_unlock(&ehead->lock);
                return;
        }
        __hlist_del(&tw->tw_node);
        sk_node_init(&tw->tw_node);
        write_unlock(&ehead->lock);

        /* Disassociate with bind bucket. */
        bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
        spin_lock(&bhead->lock);
        tb = tw->tw_tb;
        __hlist_del(&tw->tw_bind_node);
        tw->tw_tb = NULL;
        tcp_bucket_destroy(tb);
        spin_unlock(&bhead->lock);

#ifdef INET_REFCNT_DEBUG
        if (atomic_read(&tw->tw_refcnt) != 1) {
                printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
                       atomic_read(&tw->tw_refcnt));
        }
#endif
        tcp_tw_put(tw);
}
/*
 * * Main purpose of TIME-WAIT state is to close connection gracefully,
 *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
 *   (and, probably, tail of data) and one or more our ACKs are lost.
 * * What is TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which results in wrong conclusion, that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   It is not quite correct. This timeout is calculated so that it exceeds
 *   maximal retransmission timeout enough to allow to lose one (or more)
 *   segments sent by peer and our ACKs. This time may be calculated from RTO.
 * * When TIME-WAIT socket receives RST, it means that another end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * Second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) TW bucket
 * is _not_ stateless. It means, that strictly speaking we must
 * spinlock it. I do not want! Well, probability of misbehaviour
 * is ridiculously low and, seems, we could use some mb() tricks
 * to avoid misread sequence numbers, states etc.  --ANK
 */
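/* Illustrative sketch, not part of the original file: the PAWS idea
 * referred to above boils down to a wrap-safe comparison of the peer's
 * TSval against the last timestamp we accepted; a segment whose TSval
 * is "older" than ts_recent is treated as an old duplicate.  The
 * hypothetical helper below (editor's name) only models that core
 * comparison; the real tcp_paws_check() also takes the age of
 * ts_recent_stamp and RST segments into account.
 */
static inline int example_paws_is_old(u32 rcv_tsval, u32 ts_recent)
{
        /* Signed 32-bit difference handles timestamp wrap-around. */
        return (s32)(rcv_tsval - ts_recent) < 0;
}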
enum tcp_tw_status
tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
                           struct tcphdr *th, unsigned len)
{
        struct tcp_opt tp;
        int paws_reject = 0;

        tp.saw_tstamp = 0;
        if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
                tcp_parse_options(skb, &tp, 0);

                if (tp.saw_tstamp) {
                        tp.ts_recent = tw->tw_ts_recent;
                        tp.ts_recent_stamp = tw->tw_ts_recent_stamp;
                        paws_reject = tcp_paws_check(&tp, th->rst);
                }
        }

        if (tw->tw_substate == TCP_FIN_WAIT2) {
                /* Just repeat all the checks of tcp_rcv_state_process() */

                /* Out of window, send ACK */
                if (paws_reject ||
                    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                                   tw->tw_rcv_nxt,
                                   tw->tw_rcv_nxt + tw->tw_rcv_wnd))
                        return TCP_TW_ACK;

                if (th->rst)
                        goto kill;

                if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
                        goto kill_with_rst;

                /* Dup ACK? */
                if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
                    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
                        tcp_tw_put(tw);
                        return TCP_TW_SUCCESS;
                }

                /* New data or FIN. If new data arrive after half-duplex close,
                 * reset.
                 */
                if (!th->fin ||
                    TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
kill_with_rst:
                        tcp_tw_deschedule(tw);
                        tcp_tw_put(tw);
                        return TCP_TW_RST;
                }

                /* FIN arrived, enter true time-wait state. */
                tw->tw_substate = TCP_TIME_WAIT;
                tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                if (tp.saw_tstamp) {
                        tw->tw_ts_recent_stamp = xtime.tv_sec;
                        tw->tw_ts_recent = tp.rcv_tsval;
                }

                /* I am shamed, but failed to make it more elegant.
                 * Yes, it is direct reference to IP, which is impossible
                 * to generalize to IPv6. Taking into account that IPv6
                 * does not understand recycling in any case, it is not
                 * a big problem in practice. --ANK */
                if (tw->tw_family == AF_INET &&
                    sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
                    tcp_v4_tw_remember_stamp(tw))
                        tcp_tw_schedule(tw, tw->tw_timeout);
                else
                        tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
                return TCP_TW_ACK;
        }

        /*
         *      Now real TIME-WAIT state.
         *
         *      RFC 1122:
         *      "When a connection is [...] on TIME-WAIT state [...]
         *      [a TCP] MAY accept a new SYN from the remote TCP to
         *      reopen the connection directly, if it:
         *
         *      (1)  assigns its initial sequence number for the new
         *      connection to be larger than the largest sequence
         *      number it used on the previous connection incarnation,
         *      and
         *
         *      (2)  returns to TIME-WAIT state if the SYN turns out
         *      to be an old duplicate".
         */

        if (!paws_reject &&
            (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
             (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
                /* In window segment, it may be only reset or bare ack. */

                if (th->rst) {
                        /* This is TIME_WAIT assassination, in two flavors.
                         * Oh well... nobody has a sufficient solution to this
                         * protocol bug yet.
                         */
                        if (sysctl_tcp_rfc1337 == 0) {
kill:
                                tcp_tw_deschedule(tw);
                                tcp_tw_put(tw);
                                return TCP_TW_SUCCESS;
                        }
                }
                tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

                if (tp.saw_tstamp) {
                        tw->tw_ts_recent = tp.rcv_tsval;
                        tw->tw_ts_recent_stamp = xtime.tv_sec;
                }

                tcp_tw_put(tw);
                return TCP_TW_SUCCESS;
        }

        /* Out of window segment.

           All the segments are ACKed immediately.

           The only exception is new SYN. We accept it, if it is
           not old duplicate and we are not in danger to be killed
           by delayed old duplicates. The RFC check, that it has a
           newer sequence number, works at rates <40Mbit/sec.
           However, if paws works, it is reliable AND even more,
           we even may relax silly seq space cutoff.

           RED-PEN: we violate main RFC requirement, if this SYN will appear
           old duplicate (i.e. we receive RST in reply to SYN-ACK),
           we must return socket to time-wait state. It is not good,
           but not fatal yet.
         */

        if (th->syn && !th->rst && !th->ack && !paws_reject &&
            (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
             (tp.saw_tstamp && (s32)(tw->tw_ts_recent - tp.rcv_tsval) < 0))) {
                u32 isn = tw->tw_snd_nxt + 65535 + 2;
                if (isn == 0)
                        isn++;
                TCP_SKB_CB(skb)->when = isn;
                return TCP_TW_SYN;
        }

        if (paws_reject)
                NET_INC_STATS_BH(PAWSEstabRejected);

        if (!th->rst) {
                /* In this case we must reset the TIMEWAIT timer.
                 *
                 * If it is ACKless SYN it may be both old duplicate
                 * and new good SYN with random sequence number <rcv_nxt.
                 * Do not reschedule in the last case.
                 */
                if (paws_reject || th->ack)
                        tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

                /* Send ACK. Note, we do not put the bucket,
                 * it will be released by caller.
                 */
                return TCP_TW_ACK;
        }
        tcp_tw_put(tw);
        return TCP_TW_SUCCESS;
}
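/* Illustrative sketch, not part of the original file: the RFC 1122
 * reopen rule quoted in tcp_timewait_state_process() above requires
 * the new incarnation's ISN to be larger than anything used by the
 * old one.  The code achieves that with tw_snd_nxt + 65535 + 2,
 * i.e. well beyond anything the previous connection could still have
 * in flight, and never hands back 0.  The hypothetical helper below
 * (editor's name, not a kernel API) restates only that arithmetic.
 */
static inline u32 example_reopen_isn(u32 tw_snd_nxt)
{
        u32 isn = tw_snd_nxt + 65535 + 2;

        if (isn == 0)
                isn++;
        return isn;
}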
/* Enter the time wait state. This is called with locally disabled BH.
 * Essentially we whip up a timewait bucket, copy the
 * relevant info into it from the SK, and mess with hash chains
 * and list linkage.
 */
static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
{
        struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
        struct tcp_bind_hashbucket *bhead;

        /* Step 1: Put TW into bind hash. Original socket stays there too.
           Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
           binding cache, even if it is closed.
         */
        bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
        spin_lock(&bhead->lock);
        tw->tw_tb = tcp_sk(sk)->bind_hash;
        BUG_TRAP(tcp_sk(sk)->bind_hash);
        tw_add_bind_node(tw, &tw->tw_tb->owners);
        spin_unlock(&bhead->lock);

        write_lock(&ehead->lock);

        /* Step 2: Remove SK from established hash. */
        if (__sk_del_node_init(sk))
                sock_prot_dec_use(sk->sk_prot);

        /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
        tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
        atomic_inc(&tw->tw_refcnt);

        write_unlock(&ehead->lock);
}
/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
        struct tcp_tw_bucket *tw = NULL;
        struct tcp_opt *tp = tcp_sk(sk);
        int recycle_ok = 0;

        if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp)
                recycle_ok = tp->af_specific->remember_stamp(sk);

        if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
                tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);

        if (tw != NULL) {
                struct inet_opt *inet = inet_sk(sk);
                int rto = (tp->rto << 2) - (tp->rto >> 1);

                /* Give us an identity. */
                tw->tw_daddr = inet->daddr;
                tw->tw_rcv_saddr = inet->rcv_saddr;
                tw->tw_bound_dev_if = sk->sk_bound_dev_if;
                tw->tw_num = inet->num;
                tw->tw_state = TCP_TIME_WAIT;
                tw->tw_substate = state;
                tw->tw_sport = inet->sport;
                tw->tw_dport = inet->dport;
                tw->tw_family = sk->sk_family;
                tw->tw_reuse = sk->sk_reuse;
                tw->tw_rcv_wscale = tp->rcv_wscale;
                atomic_set(&tw->tw_refcnt, 1);

                tw->tw_hashent = sk->sk_hashent;
                tw->tw_rcv_nxt = tp->rcv_nxt;
                tw->tw_snd_nxt = tp->snd_nxt;
                tw->tw_rcv_wnd = tcp_receive_window(tp);
                tw->tw_ts_recent = tp->ts_recent;
                tw->tw_ts_recent_stamp = tp->ts_recent_stamp;
                tw_dead_node_init(tw);

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                if (tw->tw_family == PF_INET6) {
                        struct ipv6_pinfo *np = inet6_sk(sk);

                        ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
                        ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
                }
#endif
                /* Linkage updates. */
                __tcp_tw_hashdance(sk, tw);

                /* Get the TIME_WAIT timeout firing. */
                if (timeo < rto)
                        timeo = rto;

                if (recycle_ok) {
                        tw->tw_timeout = rto;
                } else {
                        tw->tw_timeout = TCP_TIMEWAIT_LEN;
                        if (state == TCP_TIME_WAIT)
                                timeo = TCP_TIMEWAIT_LEN;
                }

                tcp_tw_schedule(tw, timeo);
                tcp_tw_put(tw);
        } else {
                /* Sorry, if we're out of memory, just CLOSE this
                 * socket up.  We've got bigger problems than
                 * non-graceful socket closings.
                 */
                if (net_ratelimit())
                        printk(KERN_INFO "TCP: time wait bucket table overflow\n");
        }

        tcp_update_metrics(sk);
        tcp_done(sk);
}
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
static int tcp_tw_death_row_slot;

static void tcp_twkill(unsigned long);

/* TIME_WAIT reaping mechanism. */
#define TCP_TWKILL_SLOTS        8       /* Please keep this a power of 2. */
#define TCP_TWKILL_PERIOD       (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)

#define TCP_TWKILL_QUOTA        100

static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
static void twkill_work(void *);
static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
static u32 twkill_thread_slots;
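/* Illustrative sketch, not part of the original file: the reaper timer
 * fires every TCP_TWKILL_PERIOD and empties one of the TCP_TWKILL_SLOTS
 * buckets, so a given bucket is revisited after a full TCP_TIMEWAIT_LEN.
 * With the usual TCP_TIMEWAIT_LEN of 60*HZ and 8 slots that is 7.5
 * seconds worth of jiffies per slot.  The hypothetical helper below
 * (editor's name) just restates the define above.
 */
static inline unsigned long example_twkill_period(unsigned long timewait_len,
                                                  unsigned int slots)
{
        /* e.g. (60 * HZ) / 8 == 7.5 seconds worth of jiffies */
        return timewait_len / slots;
}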
/* Returns non-zero if quota exceeded.  */
static int tcp_do_twkill_work(int slot, unsigned int quota)
{
        struct tcp_tw_bucket *tw;
        struct hlist_node *node, *safe;
        unsigned int killed;
        int ret;

        /* NOTE: compare this to previous version where lock
         * was released after detaching chain. It was racy,
         * because tw buckets are scheduled in not serialized context
         * in 2.3 (with netfilter), and with softnet it is common, because
         * soft irqs are not sequenced.
         */
        killed = 0;
        ret = 0;
        tw_for_each_inmate(tw, node, safe,
                           &tcp_tw_death_row[slot]) {
                __tw_del_dead_node(tw);
                spin_unlock(&tw_death_lock);
                tcp_timewait_kill(tw);
                tcp_tw_put(tw);
                killed++;
                spin_lock(&tw_death_lock);
                if (killed > quota) {
                        ret = 1;
                        break;
                }
        }

        tcp_tw_count -= killed;
        NET_ADD_STATS_BH(TimeWaited, killed);

        return ret;
}
static void tcp_twkill(unsigned long dummy)
{
        int need_timer, ret;

        spin_lock(&tw_death_lock);

        if (tcp_tw_count == 0)
                goto out;

        need_timer = 0;
        ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
        if (ret) {
                twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
                mb();
                schedule_work(&tcp_twkill_work);
                need_timer = 1;
        } else {
                /* We purged the entire slot, anything left?  */
                if (tcp_tw_count)
                        need_timer = 1;
        }
        tcp_tw_death_row_slot =
                ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
        if (need_timer)
                mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
out:
        spin_unlock(&tw_death_lock);
}
extern void twkill_slots_invalid(void);

static void twkill_work(void *dummy)
{
        int i;

        if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
                twkill_slots_invalid();

        while (twkill_thread_slots) {
                spin_lock_bh(&tw_death_lock);
                for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
                        if (!(twkill_thread_slots & (1 << i)))
                                continue;

                        while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
                                if (need_resched()) {
                                        spin_unlock_bh(&tw_death_lock);
                                        schedule();
                                        spin_lock_bh(&tw_death_lock);
                                }
                        }

                        twkill_thread_slots &= ~(1 << i);
                }
                spin_unlock_bh(&tw_death_lock);
        }
}
/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */

/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
        spin_lock(&tw_death_lock);
        if (tw_del_dead_node(tw)) {
                tcp_tw_put(tw);
                if (--tcp_tw_count == 0)
                        del_timer(&tcp_tw_timer);
        }
        spin_unlock(&tw_death_lock);
        tcp_timewait_kill(tw);
}
/* Short-time timewait calendar */

static int tcp_twcal_hand = -1;
static int tcp_twcal_jiffie;
static void tcp_twcal_tick(unsigned long);
static struct timer_list tcp_twcal_timer =
                TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
{
        struct hlist_head *list;
        int slot;

        /* timeout := RTO * 3.5
         *
         * 3.5 = 1+2+0.5 to wait for two retransmits.
         *
         * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
         * our ACK acking that FIN can be lost. If N subsequent retransmitted
         * FINs (or previous segments) are lost (probability of such event
         * is p^(N+1), where p is probability to lose single packet and
         * time to detect the loss is about RTO*(2^N - 1) with exponential
         * backoff). Normal timewait length is calculated so, that we
         * waited at least for one retransmitted FIN (maximal RTO is 120sec).
         * [ BTW Linux, following BSD, violates this requirement waiting
         *   only for 60sec, we should wait at least for 240 secs.
         *   Well, 240 consumes too much of resources 8)
         * ]
         * This interval is not reduced to catch old duplicates and
         * responses to our wandering segments living for two MSLs.
         * However, if we use PAWS to detect
         * old duplicates, we can reduce the interval to bounds required
         * by RTO, rather than MSL. So, if peer understands PAWS, we
         * kill tw bucket after 3.5*RTO (it is important that this number
         * is greater than TS tick!) and detect old duplicates with help
         * of PAWS.
         */
        slot = (timeo + (1 << TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;

        spin_lock(&tw_death_lock);

        /* Unlink it, if it was scheduled */
        if (tw_del_dead_node(tw))
                tcp_tw_count--;
        else
                atomic_inc(&tw->tw_refcnt);

        if (slot >= TCP_TW_RECYCLE_SLOTS) {
                /* Schedule to slow timer */
                if (timeo >= TCP_TIMEWAIT_LEN) {
                        slot = TCP_TWKILL_SLOTS - 1;
                } else {
                        slot = (timeo + TCP_TWKILL_PERIOD - 1) / TCP_TWKILL_PERIOD;
                        if (slot >= TCP_TWKILL_SLOTS)
                                slot = TCP_TWKILL_SLOTS - 1;
                }
                tw->tw_ttd = jiffies + timeo;
                slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
                list = &tcp_tw_death_row[slot];
        } else {
                tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);

                if (tcp_twcal_hand < 0) {
                        tcp_twcal_hand = 0;
                        tcp_twcal_jiffie = jiffies;
                        tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot << TCP_TW_RECYCLE_TICK);
                        add_timer(&tcp_twcal_timer);
                } else {
                        if (time_after(tcp_twcal_timer.expires, jiffies + (slot << TCP_TW_RECYCLE_TICK)))
                                mod_timer(&tcp_twcal_timer, jiffies + (slot << TCP_TW_RECYCLE_TICK));
                        slot = (tcp_twcal_hand + slot) & (TCP_TW_RECYCLE_SLOTS - 1);
                }
                list = &tcp_twcal_row[slot];
        }

        hlist_add_head(&tw->tw_death_node, list);

        if (tcp_tw_count++ == 0)
                mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
        spin_unlock(&tw_death_lock);
}
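/* Illustrative sketch, not part of the original file: the recycle slot
 * computed at the top of tcp_tw_schedule() above is simply a ceiling
 * division of the timeout (in jiffies) by the 2^TCP_TW_RECYCLE_TICK
 * granularity, done with shifts.  The hypothetical helper below
 * (editor's name) spells out the same arithmetic.
 */
static inline int example_recycle_slot(int timeo, int tick)
{
        /* slot = ceil(timeo / 2^tick) */
        return (timeo + (1 << tick) - 1) >> tick;
}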
void tcp_twcal_tick(unsigned long dummy)
{
        int n, slot;
        unsigned long j;
        unsigned long now = jiffies;
        int killed = 0;
        int adv = 0;

        spin_lock(&tw_death_lock);
        if (tcp_twcal_hand < 0)
                goto out;

        slot = tcp_twcal_hand;
        j = tcp_twcal_jiffie;

        for (n = 0; n < TCP_TW_RECYCLE_SLOTS; n++) {
                if (time_before_eq(j, now)) {
                        struct hlist_node *node, *safe;
                        struct tcp_tw_bucket *tw;

                        tw_for_each_inmate(tw, node, safe,
                                           &tcp_twcal_row[slot]) {
                                __tw_del_dead_node(tw);
                                tcp_timewait_kill(tw);
                                tcp_tw_put(tw);
                                killed++;
                        }
                } else {
                        if (!adv) {
                                adv = 1;
                                tcp_twcal_jiffie = j;
                                tcp_twcal_hand = slot;
                        }

                        if (!hlist_empty(&tcp_twcal_row[slot])) {
                                mod_timer(&tcp_twcal_timer, j);
                                goto out;
                        }
                }
                j += (1 << TCP_TW_RECYCLE_TICK);
                slot = (slot + 1) & (TCP_TW_RECYCLE_SLOTS - 1);
        }
        tcp_twcal_hand = -1;

out:
        if ((tcp_tw_count -= killed) == 0)
                del_timer(&tcp_tw_timer);
        NET_ADD_STATS_BH(TimeWaitKilled, killed);
        spin_unlock(&tw_death_lock);
}
/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid lots of memory writes here. tp of listening
 * socket contains all necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
{
        /* allocate the newsk from the same slab of the master sock,
         * if not, at sk_free time we'll try to free it from the wrong
         * slabcache (i.e. is it TCPv4 or v6?) -acme */
        struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0, sk->sk_slab);

        if (newsk != NULL) {
                struct tcp_opt *newtp;
                struct sk_filter *filter;

                memcpy(newsk, sk, sizeof(struct tcp_sock));
                newsk->sk_state = TCP_SYN_RECV;

                /* SANITY */
                sk_node_init(&newsk->sk_node);
                tcp_sk(newsk)->bind_hash = NULL;

                /* Clone the TCP header template */
                inet_sk(newsk)->dport = req->rmt_port;

                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                newsk->sk_dst_lock = RW_LOCK_UNLOCKED;
                atomic_set(&newsk->sk_rmem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                skb_queue_head_init(&newsk->sk_write_queue);
                atomic_set(&newsk->sk_omem_alloc, 0);
                newsk->sk_wmem_queued = 0;
                newsk->sk_forward_alloc = 0;

                sock_reset_flag(newsk, SOCK_DONE);
                newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
                newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
                newsk->sk_callback_lock = RW_LOCK_UNLOCKED;
                skb_queue_head_init(&newsk->sk_error_queue);
                newsk->sk_write_space = tcp_write_space;

                if ((filter = newsk->sk_filter) != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        return NULL;
                }

                /* Now setup tcp_opt */
                newtp = tcp_sk(newsk);
                newtp->pred_flags = 0;
                newtp->rcv_nxt = req->rcv_isn + 1;
                newtp->snd_nxt = req->snt_isn + 1;
                newtp->snd_una = req->snt_isn + 1;
                newtp->snd_sml = req->snt_isn + 1;

                tcp_prequeue_init(newtp);

                tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);

                newtp->retransmits = 0;
                newtp->backoff = 0;
                newtp->srtt = 0;
                newtp->mdev = TCP_TIMEOUT_INIT;
                newtp->rto = TCP_TIMEOUT_INIT;

                newtp->packets_out = 0;
                newtp->left_out = 0;
                newtp->retrans_out = 0;
                newtp->sacked_out = 0;
                newtp->fackets_out = 0;
                newtp->snd_ssthresh = 0x7fffffff;

                /* So many TCP implementations out there (incorrectly) count the
                 * initial SYN frame in their delayed-ACK and congestion control
                 * algorithms that we must have the following bandaid to talk
                 * efficiently to them.  -DaveM
                 */
                newtp->snd_cwnd = 2;
                newtp->snd_cwnd_cnt = 0;

                newtp->frto_counter = 0;
                newtp->frto_highmark = 0;

                newtp->ca_state = TCP_CA_Open;
                tcp_init_xmit_timers(newsk);
                skb_queue_head_init(&newtp->out_of_order_queue);
                newtp->send_head = NULL;
                newtp->rcv_wup = req->rcv_isn + 1;
                newtp->write_seq = req->snt_isn + 1;
                newtp->pushed_seq = newtp->write_seq;
                newtp->copied_seq = req->rcv_isn + 1;

                newtp->saw_tstamp = 0;

                newtp->dsack = 0;
                newtp->eff_sacks = 0;

                newtp->probes_out = 0;
                newtp->num_sacks = 0;
                newtp->urg_data = 0;
                newtp->listen_opt = NULL;
                newtp->accept_queue = newtp->accept_queue_tail = NULL;
                /* Deinitialize syn_wait_lock to trap illegal accesses. */
                memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));

                /* Back to base struct sock members. */
                newsk->sk_err = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);
#ifdef INET_REFCNT_DEBUG
                atomic_inc(&inet_sock_nr);
#endif
                atomic_inc(&tcp_sockets_allocated);

                if (sock_flag(newsk, SOCK_KEEPOPEN))
                        tcp_reset_keepalive_timer(newsk,
                                                  keepalive_time_when(newtp));
                newsk->sk_socket = NULL;
                newsk->sk_sleep = NULL;
                newsk->sk_owner = NULL;

                newtp->tstamp_ok = req->tstamp_ok;
                if ((newtp->sack_ok = req->sack_ok) != 0) {
                        if (sysctl_tcp_fack)
                                newtp->sack_ok |= 2;
                }
                newtp->window_clamp = req->window_clamp;
                newtp->rcv_ssthresh = req->rcv_wnd;
                newtp->rcv_wnd = req->rcv_wnd;
                newtp->wscale_ok = req->wscale_ok;
                if (newtp->wscale_ok) {
                        newtp->snd_wscale = req->snd_wscale;
                        newtp->rcv_wscale = req->rcv_wscale;
                } else {
                        newtp->snd_wscale = newtp->rcv_wscale = 0;
                        newtp->window_clamp = min(newtp->window_clamp, 65535U);
                }
                newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale;
                newtp->max_window = newtp->snd_wnd;

                if (newtp->tstamp_ok) {
                        newtp->ts_recent = req->ts_recent;
                        newtp->ts_recent_stamp = xtime.tv_sec;
                        newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
                } else {
                        newtp->ts_recent_stamp = 0;
                        newtp->tcp_header_len = sizeof(struct tcphdr);
                }
                if (skb->len >= TCP_MIN_RCVMSS + newtp->tcp_header_len)
                        newtp->ack.last_seg_size = skb->len - newtp->tcp_header_len;
                newtp->mss_clamp = req->mss;
                TCP_ECN_openreq_child(newtp, req);
                if (newtp->ecn_flags & TCP_ECN_OK)
                        newsk->sk_no_largesend = 1;

                TCP_INC_STATS_BH(TcpPassiveOpens);
        }
        return newsk;
}
/*
 *      Process an incoming packet for SYN_RECV sockets represented
 *      as an open_request.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                           struct open_request *req,
                           struct open_request **prev)
{
        struct tcphdr *th = skb->h.th;
        struct tcp_opt *tp = tcp_sk(sk);
        u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
        int paws_reject = 0;
        struct tcp_opt ttp;
        struct sock *child;

        ttp.saw_tstamp = 0;
        if (th->doff > (sizeof(struct tcphdr) >> 2)) {
                tcp_parse_options(skb, &ttp, 0);

                if (ttp.saw_tstamp) {
                        ttp.ts_recent = req->ts_recent;
                        /* We do not store true stamp, but it is not required,
                         * it can be estimated (approximately)
                         * from another data.
                         */
                        ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
                        paws_reject = tcp_paws_check(&ttp, th->rst);
                }
        }

        /* Check for pure retransmitted SYN. */
        if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
            flg == TCP_FLAG_SYN &&
            !paws_reject) {
                /*
                 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
                 * this case on figure 6 and figure 8, but formal
                 * protocol description says NOTHING.
                 * To be more exact, it says that we should send ACK,
                 * because this segment (at least, if it has no data)
                 * is out of window.
                 *
                 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
                 *  describe SYN-RECV state. All the description
                 *  is wrong, we cannot believe it and should
                 *  rely only on common sense and implementation
                 *  experience.
                 *
                 * Enforce "SYN-ACK" according to figure 8, figure 6
                 * of RFC793, fixed by RFC1122.
                 */
                req->class->rtx_syn_ack(sk, req, NULL);
                return NULL;
        }

        /* Further reproduces section "SEGMENT ARRIVES"
           for state SYN-RECEIVED of RFC793.
           It is broken, however, it fails only
           when SYNs are crossed.

           You would think that SYN crossing is impossible here, since
           we should have a SYN_SENT socket (from connect()) on our end,
           but this is not true if the crossed SYNs were sent to both
           ends by a malicious third party.  We must defend against this,
           and to do that we first verify the ACK (as per RFC793, page
           36) and reset if it is invalid.  Is this a true full defense?
           To convince ourselves, let us consider a way in which the ACK
           test can still pass in this 'malicious crossed SYNs' case.
           Malicious sender sends identical SYNs (and thus identical sequence
           numbers) to both A and B:

           A: gets SYN, seq=7
           B: gets SYN, seq=7

           By our good fortune, both A and B select the same initial
           send sequence number of seven :-)

           A: sends SYN|ACK, seq=7, ack_seq=8
           B: sends SYN|ACK, seq=7, ack_seq=8

           So we are now A eating this SYN|ACK, ACK test passes.  So
           does sequence test, SYN is truncated, and thus we consider
           it a bare ACK.

           If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
           we create an established connection.  Both ends (listening sockets)
           accept the new incoming connection and try to talk to each other. 8-)

           Note: This case is both harmless, and rare.  Possibility is about the
           same as us discovering intelligent life on another planet tomorrow.

           But generally, we should (RFC lies!) accept ACK
           from SYNACK both here and in tcp_rcv_state_process().
           tcp_rcv_state_process() does not, hence, we do not too.

           Note that the case is absolutely generic:
           we cannot optimize anything here without
           violating protocol. All the checks must be made
           before attempt to create socket.
         */

        /* RFC793 page 36: "If the connection is in any non-synchronized state ...
         * and the incoming segment acknowledges something not yet
         * sent (the segment carries an unacceptable ACK) ...
         * a reset is sent."
         *
         * Invalid ACK: reset will be sent by listening socket
         */
        if ((flg & TCP_FLAG_ACK) &&
            (TCP_SKB_CB(skb)->ack_seq != req->snt_isn + 1))
                return sk;

        /* Also, it would not be a bad idea to check rcv_tsecr, which
         * is essentially ACK extension and too early or too late values
         * should cause reset in unsynchronized states.
         */

        /* RFC793: "first check sequence number". */

        if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                                          req->rcv_isn + 1, req->rcv_isn + 1 + req->rcv_wnd)) {
                /* Out of window: send ACK and drop. */
                if (!(flg & TCP_FLAG_RST))
                        req->class->send_ack(skb, req);
                if (paws_reject)
                        NET_INC_STATS_BH(PAWSEstabRejected);
                return NULL;
        }

        /* In sequence, PAWS is OK. */

        if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn + 1))
                req->ts_recent = ttp.rcv_tsval;

        if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
                /* Truncate SYN, it is out of window starting
                   at req->rcv_isn+1. */
                flg &= ~TCP_FLAG_SYN;
        }

        /* RFC793: "second check the RST bit" and
         *         "fourth, check the SYN bit"
         */
        if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
                goto embryonic_reset;

        /* ACK sequence verified above, just make sure ACK is
         * set.  If ACK not set, just silently drop the packet.
         */
        if (!(flg & TCP_FLAG_ACK))
                return NULL;

        /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
        if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn + 1) {
                req->acked = 1;
                return NULL;
        }

        /* OK, ACK is valid, create big socket and
         * feed this segment to it. It will repeat all
         * the tests. THIS SEGMENT MUST MOVE SOCKET TO
         * ESTABLISHED STATE. If it will be dropped after
         * socket is created, wait for troubles.
         */
        child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
        if (child == NULL)
                goto listen_overflow;

        sk_set_owner(child, sk->sk_owner);
        tcp_synq_unlink(tp, req, prev);
        tcp_synq_removed(sk, req);

        tcp_acceptq_queue(sk, req, child);
        return child;

listen_overflow:
        if (!sysctl_tcp_abort_on_overflow) {
                req->acked = 1;
                return NULL;
        }

embryonic_reset:
        NET_INC_STATS_BH(EmbryonicRsts);
        if (!(flg & TCP_FLAG_RST))
                req->class->send_reset(skb);

        tcp_synq_drop(sk, req, prev);
        return NULL;
}
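/* Illustrative sketch, not part of the original file: tcp_check_req()
 * above does not store the time at which req->ts_recent was learned,
 * so it reconstructs an approximate ts_recent_stamp from the SYN-ACK
 * retransmit count, on the assumption of exponential backoff starting
 * at TCP_TIMEOUT_INIT.  The hypothetical helper below (editor's name)
 * isolates that estimate.
 */
static inline long example_req_ts_recent_stamp(long now_sec, int retrans)
{
        /* Roughly: each retransmission doubles the assumed elapsed time. */
        return now_sec - ((TCP_TIMEOUT_INIT / HZ) << retrans);
}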
/*
 * Queue segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
                      struct sk_buff *skb)
{
        int ret = 0;
        int state = child->sk_state;

        if (!sock_owned_by_user(child)) {
                ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);

                /* Wakeup parent, send SIGIO */
                if (state == TCP_SYN_RECV && child->sk_state != state)
                        parent->sk_data_ready(parent, 0);
        } else {
                /* Alas, it is possible again, because we do lookup
                 * in main socket hash table and lock on listening
                 * socket does not protect us more.
                 */
                sk_add_backlog(child, skb);
        }

        bh_unlock_sock(child);
        sock_put(child);
        return ret;
}