net/ipv4/tcp_ipv4.c (davej-history.git, Import 2.3.41pre2)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.197 2000/01/21 06:37:28 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/init.h>
56 #include <net/icmp.h>
57 #include <net/tcp.h>
58 #include <net/ipv6.h>
59 #include <net/inet_common.h>
61 #include <linux/inet.h>
62 #include <linux/stddef.h>
63 #include <linux/ipsec.h>
65 extern int sysctl_ip_dynaddr;
67 /* Check TCP sequence numbers in ICMP packets. */
68 #define ICMP_MIN_LENGTH 8
70 /* Socket used for sending RSTs */
71 struct inode tcp_inode;
72 struct socket *tcp_socket=&tcp_inode.u.socket_i;
74 static void tcp_v4_send_reset(struct sk_buff *skb);
76 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
77 struct sk_buff *skb);
79 /* This is for sockets with full identity only. Sockets here will always
80 * be without wildcards and will have the following invariant:
81 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
83 * First half of the table is for sockets not in TIME_WAIT, second half
84 * is for TIME_WAIT sockets only.
86 struct tcp_ehash_bucket *tcp_ehash = NULL;
88 /* Ok, let's try this, I give up, we do need a local binding
89 * TCP hash as well as the others for fast bind/connect.
91 struct tcp_bind_hashbucket *tcp_bhash = NULL;
93 int tcp_bhash_size = 0;
94 int tcp_ehash_size = 0;
96 /* All sockets in TCP_LISTEN state will be in here. This is the only table
97 * where wildcard'd TCP sockets can exist. Hash function here is just local
98 * port number.
100 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE] = { NULL, };
101 char __tcp_clean_cacheline_pad[(SMP_CACHE_BYTES -
102 (((sizeof(void *) * (TCP_LHTABLE_SIZE + 2)) +
103 (sizeof(int) * 2)) % SMP_CACHE_BYTES))] = { 0, };
105 rwlock_t tcp_lhash_lock = RW_LOCK_UNLOCKED;
106 atomic_t tcp_lhash_users = ATOMIC_INIT(0);
107 DECLARE_WAIT_QUEUE_HEAD(tcp_lhash_wait);
109 spinlock_t tcp_portalloc_lock = SPIN_LOCK_UNLOCKED;
112 * This array holds the first and last local port number.
113 * For high-usage systems, use sysctl to change this to
114 * 32768-61000
116 int sysctl_local_port_range[2] = { 1024, 4999 };
117 int tcp_port_rover = (1024 - 1);
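/*
 * sysctl_local_port_range above is exposed to userspace as
 * net.ipv4.ip_local_port_range, i.e. /proc/sys/net/ipv4/ip_local_port_range.
 * A minimal userspace sketch of widening the range as the comment above
 * suggests; illustrative only (and it needs root), not part of this file.
 */
#if 0	/* Illustrative userspace sketch, not kernel code. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "w");

	if (f == NULL) {
		perror("ip_local_port_range");
		return 1;
	}
	/* Widen the ephemeral port range for high-usage systems. */
	fprintf(f, "32768 61000\n");
	fclose(f);
	return 0;
}
#endif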
119 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
120 __u32 faddr, __u16 fport)
122 int h = ((laddr ^ lport) ^ (faddr ^ fport));
123 h ^= h>>16;
124 h ^= h>>8;
125 return h & (tcp_ehash_size - 1);
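/*
 * A standalone sketch of the XOR-fold used by tcp_hashfn() above. It assumes,
 * as the kernel does, that the table size is a power of two so the final mask
 * selects a bucket directly. Names and sample values are illustrative.
 */
#if 0	/* Illustrative userspace sketch of the XOR-fold above. */
#include <stdio.h>

static unsigned int ehash_fold(unsigned int laddr, unsigned short lport,
			       unsigned int faddr, unsigned short fport,
			       unsigned int ehash_size)
{
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;
	h ^= h >> 8;
	return h & (ehash_size - 1);	/* ehash_size must be a power of two */
}

int main(void)
{
	/* 127.0.0.1:12345 -> 127.0.0.1:80 with a 256-bucket table. */
	printf("bucket = %u\n",
	       ehash_fold(0x7f000001, 12345, 0x7f000001, 80, 256));
	return 0;
}
#endif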
128 static __inline__ int tcp_sk_hashfn(struct sock *sk)
130 __u32 laddr = sk->rcv_saddr;
131 __u16 lport = sk->num;
132 __u32 faddr = sk->daddr;
133 __u16 fport = sk->dport;
135 return tcp_hashfn(laddr, lport, faddr, fport);
138 /* Allocate and initialize a new TCP local port bind bucket.
139 * The bindhash mutex for snum's hash chain must be held here.
141 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
142 unsigned short snum)
144 struct tcp_bind_bucket *tb;
146 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
147 if(tb != NULL) {
148 tb->port = snum;
149 tb->fastreuse = 0;
150 tb->owners = NULL;
151 if((tb->next = head->chain) != NULL)
152 tb->next->pprev = &tb->next;
153 head->chain = tb;
154 tb->pprev = &head->chain;
156 return tb;
159 /* Caller must disable local BH processing. */
160 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
162 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
163 struct tcp_bind_bucket *tb;
165 spin_lock(&head->lock);
166 tb = (struct tcp_bind_bucket *)sk->prev;
167 if ((child->bind_next = tb->owners) != NULL)
168 tb->owners->bind_pprev = &child->bind_next;
169 tb->owners = child;
170 child->bind_pprev = &tb->owners;
171 child->prev = (struct sock *) tb;
172 spin_unlock(&head->lock);
175 __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
177 local_bh_disable();
178 __tcp_inherit_port(sk, child);
179 local_bh_enable();
182 /* Obtain a reference to a local port for the given sock,
183 * if snum is zero it means select any available local port.
185 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
187 struct tcp_bind_hashbucket *head;
188 struct tcp_bind_bucket *tb;
189 int ret;
191 local_bh_disable();
192 if (snum == 0) {
193 int low = sysctl_local_port_range[0];
194 int high = sysctl_local_port_range[1];
195 int remaining = (high - low) + 1;
196 int rover;
198 spin_lock(&tcp_portalloc_lock);
199 rover = tcp_port_rover;
200 do { rover++;
201 if ((rover < low) || (rover > high))
202 rover = low;
203 head = &tcp_bhash[tcp_bhashfn(rover)];
204 spin_lock(&head->lock);
205 for (tb = head->chain; tb; tb = tb->next)
206 if (tb->port == rover)
207 goto next;
208 break;
209 next:
210 spin_unlock(&head->lock);
211 } while (--remaining > 0);
212 tcp_port_rover = rover;
213 spin_unlock(&tcp_portalloc_lock);
215 /* Exhausted local port range during search? */
216 ret = 1;
217 if (remaining <= 0)
218 goto fail;
220 /* OK, here is the one we will use. HEAD is
221 * non-NULL and we hold its mutex.
223 snum = rover;
224 tb = NULL;
225 } else {
226 head = &tcp_bhash[tcp_bhashfn(snum)];
227 spin_lock(&head->lock);
228 for (tb = head->chain; tb != NULL; tb = tb->next)
229 if (tb->port == snum)
230 break;
232 if (tb != NULL && tb->owners != NULL) {
233 if (tb->fastreuse != 0 && sk->reuse != 0) {
234 goto success;
235 } else {
236 struct sock *sk2 = tb->owners;
237 int sk_reuse = sk->reuse;
239 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
240 if (sk->bound_dev_if == sk2->bound_dev_if) {
241 if (!sk_reuse ||
242 !sk2->reuse ||
243 sk2->state == TCP_LISTEN) {
244 if (!sk2->rcv_saddr ||
245 !sk->rcv_saddr ||
246 (sk2->rcv_saddr == sk->rcv_saddr))
247 break;
251 /* If we found a conflict, fail. */
252 ret = 1;
253 if (sk2 != NULL)
254 goto fail_unlock;
257 ret = 1;
258 if (tb == NULL &&
259 (tb = tcp_bucket_create(head, snum)) == NULL)
260 goto fail_unlock;
261 if (tb->owners == NULL) {
262 if (sk->reuse && sk->state != TCP_LISTEN)
263 tb->fastreuse = 1;
264 else
265 tb->fastreuse = 0;
266 } else if (tb->fastreuse &&
267 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
268 tb->fastreuse = 0;
269 success:
270 sk->num = snum;
271 if ((sk->bind_next = tb->owners) != NULL)
272 tb->owners->bind_pprev = &sk->bind_next;
273 tb->owners = sk;
274 sk->bind_pprev = &tb->owners;
275 sk->prev = (struct sock *) tb;
276 ret = 0;
278 fail_unlock:
279 spin_unlock(&head->lock);
280 fail:
281 local_bh_enable();
282 return ret;
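/*
 * A simplified userspace model of the rover search in tcp_v4_get_port()
 * above: start just after the last allocation, wrap at the top of the range,
 * and give up once the whole range has been scanned. Locking, bind-bucket
 * sharing and SO_REUSEADDR conflicts are omitted; the names here are
 * illustrative only.
 */
#if 0	/* Illustrative userspace sketch of the port rover above. */
#include <stdio.h>

#define PORT_LOW	1024
#define PORT_HIGH	4999

static unsigned char port_in_use[65536];	/* stand-in for the bind hash */
static int port_rover = PORT_LOW - 1;

/* Returns a free port in [PORT_LOW, PORT_HIGH], or -1 if exhausted. */
static int pick_local_port(void)
{
	int remaining = PORT_HIGH - PORT_LOW + 1;
	int rover = port_rover;

	do {
		rover++;
		if (rover < PORT_LOW || rover > PORT_HIGH)
			rover = PORT_LOW;
		if (!port_in_use[rover]) {
			port_in_use[rover] = 1;
			port_rover = rover;
			return rover;
		}
	} while (--remaining > 0);

	return -1;	/* local port range exhausted */
}

int main(void)
{
	printf("first port: %d\n", pick_local_port());
	printf("second port: %d\n", pick_local_port());
	return 0;
}
#endif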
285 /* Get rid of any references to a local port held by the
286 * given sock.
288 __inline__ void __tcp_put_port(struct sock *sk)
290 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
291 struct tcp_bind_bucket *tb;
293 spin_lock(&head->lock);
294 tb = (struct tcp_bind_bucket *) sk->prev;
295 if (sk->bind_next)
296 sk->bind_next->bind_pprev = sk->bind_pprev;
297 *(sk->bind_pprev) = sk->bind_next;
298 sk->prev = NULL;
299 if (tb->owners == NULL) {
300 if (tb->next)
301 tb->next->pprev = tb->pprev;
302 *(tb->pprev) = tb->next;
303 kmem_cache_free(tcp_bucket_cachep, tb);
305 spin_unlock(&head->lock);
308 void tcp_put_port(struct sock *sk)
310 local_bh_disable();
311 __tcp_put_port(sk);
312 local_bh_enable();
315 /* This lock without TASK_EXCLUSIVE is good on UP and it can be very bad on SMP.
316 * Look, when several writers sleep and a reader wakes them up, all but one
317 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
318 * this, _but_ remember, it adds useless work on UP machines (a wakeup on each
319 * exclusive lock release). It should really be ifdefed.
322 void tcp_listen_wlock(void)
324 write_lock(&tcp_lhash_lock);
326 if (atomic_read(&tcp_lhash_users)) {
327 DECLARE_WAITQUEUE(wait, current);
329 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
330 for (;;) {
331 set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE);
332 if (atomic_read(&tcp_lhash_users) == 0)
333 break;
334 write_unlock_bh(&tcp_lhash_lock);
335 schedule();
336 write_lock_bh(&tcp_lhash_lock);
339 __set_current_state(TASK_RUNNING);
340 remove_wait_queue(&tcp_lhash_wait, &wait);
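/*
 * A userspace analogue of the scheme above, assuming POSIX threads: writers
 * sleep until the reader count (cf. tcp_lhash_users) drops to zero, and
 * sleepers are woken one at a time (pthread_cond_signal), which is the moral
 * equivalent of the exclusive wakeup discussed in the comment. A sketch of
 * the idea only, not how the kernel lock is implemented.
 */
#if 0	/* Illustrative pthread sketch; not part of this file. */
#include <pthread.h>

static pthread_mutex_t lhash_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  lhash_cond  = PTHREAD_COND_INITIALIZER;
static int lhash_users;				/* analogous to tcp_lhash_users */

static void listen_rlock(void)
{
	pthread_mutex_lock(&lhash_mutex);
	lhash_users++;
	pthread_mutex_unlock(&lhash_mutex);
}

static void listen_runlock(void)
{
	pthread_mutex_lock(&lhash_mutex);
	if (--lhash_users == 0)
		pthread_cond_signal(&lhash_cond);	/* wake one waiter, not all */
	pthread_mutex_unlock(&lhash_mutex);
}

/* Writer: returns with lhash_mutex held and no readers active. */
static void listen_wlock(void)
{
	pthread_mutex_lock(&lhash_mutex);
	while (lhash_users != 0)
		pthread_cond_wait(&lhash_cond, &lhash_mutex);
}

static void listen_wunlock(void)
{
	pthread_cond_signal(&lhash_cond);	/* let the next queued writer in */
	pthread_mutex_unlock(&lhash_mutex);
}
#endif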
344 static __inline__ void __tcp_v4_hash(struct sock *sk)
346 struct sock **skp;
347 rwlock_t *lock;
349 BUG_TRAP(sk->pprev==NULL);
350 if(sk->state == TCP_LISTEN) {
351 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
352 lock = &tcp_lhash_lock;
353 tcp_listen_wlock();
354 } else {
355 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
356 lock = &tcp_ehash[sk->hashent].lock;
357 write_lock(lock);
359 if((sk->next = *skp) != NULL)
360 (*skp)->pprev = &sk->next;
361 *skp = sk;
362 sk->pprev = skp;
363 sock_prot_inc_use(sk->prot);
364 write_unlock(lock);
365 if (sk->state == TCP_LISTEN)
366 wake_up(&tcp_lhash_wait);
369 static void tcp_v4_hash(struct sock *sk)
371 if (sk->state != TCP_CLOSE) {
372 local_bh_disable();
373 __tcp_v4_hash(sk);
374 local_bh_enable();
378 void tcp_unhash(struct sock *sk)
380 rwlock_t *lock;
382 if (sk->state == TCP_LISTEN) {
383 local_bh_disable();
384 tcp_listen_wlock();
385 lock = &tcp_lhash_lock;
386 } else {
387 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
388 lock = &head->lock;
389 write_lock_bh(&head->lock);
392 if(sk->pprev) {
393 if(sk->next)
394 sk->next->pprev = sk->pprev;
395 *sk->pprev = sk->next;
396 sk->pprev = NULL;
397 sock_prot_dec_use(sk->prot);
399 write_unlock_bh(lock);
400 if (sk->state == TCP_LISTEN)
401 wake_up(&tcp_lhash_wait);
404 /* Don't inline this cruft. There are some nice properties to
405 * exploit here. The BSD API does not allow a listening TCP
406 * to specify the remote port nor the remote address for the
407 * connection. So always assume those are both wildcarded
408 * during the search since they can never be otherwise.
410 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
412 struct sock *result = NULL;
413 int score, hiscore;
415 hiscore=0;
416 for(; sk; sk = sk->next) {
417 if(sk->num == hnum) {
418 __u32 rcv_saddr = sk->rcv_saddr;
420 score = 1;
421 if(rcv_saddr) {
422 if (rcv_saddr != daddr)
423 continue;
424 score++;
426 if (sk->bound_dev_if) {
427 if (sk->bound_dev_if != dif)
428 continue;
429 score++;
431 if (score == 3)
432 return sk;
433 if (score > hiscore) {
434 hiscore = score;
435 result = sk;
439 return result;
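/*
 * A self-contained sketch of the scoring used by __tcp_v4_lookup_listener()
 * above: an exact local-address match and a matching bound device each add a
 * point, a mismatch disqualifies, and a perfect score (3) wins immediately.
 * The struct and the candidate array are illustrative, not kernel types.
 */
#if 0	/* Illustrative sketch of the listener scoring above. */
struct listener {
	unsigned short port;		/* local port (host order) */
	unsigned int   rcv_saddr;	/* 0 means "any address" */
	int            bound_dev_if;	/* 0 means "any device" */
};

static struct listener *best_listener(struct listener *tbl, int n,
				      unsigned int daddr, unsigned short hnum,
				      int dif)
{
	struct listener *result = NULL;
	int hiscore = 0, i;

	for (i = 0; i < n; i++) {
		struct listener *l = &tbl[i];
		int score;

		if (l->port != hnum)
			continue;
		score = 1;
		if (l->rcv_saddr) {
			if (l->rcv_saddr != daddr)
				continue;
			score++;
		}
		if (l->bound_dev_if) {
			if (l->bound_dev_if != dif)
				continue;
			score++;
		}
		if (score == 3)
			return l;	/* fully specified: best possible */
		if (score > hiscore) {
			hiscore = score;
			result = l;
		}
	}
	return result;
}
#endif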
442 /* Optimize the common listener case. */
443 __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
445 struct sock *sk;
447 read_lock(&tcp_lhash_lock);
448 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
449 if (sk) {
450 if (sk->num == hnum &&
451 sk->next == NULL &&
452 (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
453 !sk->bound_dev_if)
454 goto sherry_cache;
455 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
457 if (sk) {
458 sherry_cache:
459 sock_hold(sk);
461 read_unlock(&tcp_lhash_lock);
462 return sk;
465 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
466 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
468 * Local BH must be disabled here.
471 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
472 u32 daddr, u16 hnum, int dif)
474 struct tcp_ehash_bucket *head;
475 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
476 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
477 struct sock *sk;
478 int hash;
480 /* Optimize here for direct hit, only listening connections can
481 * have wildcards anyway.
483 hash = tcp_hashfn(daddr, hnum, saddr, sport);
484 head = &tcp_ehash[hash];
485 read_lock(&head->lock);
486 for(sk = head->chain; sk; sk = sk->next) {
487 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
488 goto hit; /* You sunk my battleship! */
491 /* Must check for a TIME_WAIT'er before going to listener hash. */
492 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
493 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
494 goto hit;
495 read_unlock(&head->lock);
497 return NULL;
499 hit:
500 sock_hold(sk);
501 read_unlock(&head->lock);
502 return sk;
505 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
506 u32 daddr, u16 hnum, int dif)
508 struct sock *sk;
510 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
512 if (sk)
513 return sk;
515 return tcp_v4_lookup_listener(daddr, hnum, dif);
518 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
520 struct sock *sk;
522 local_bh_disable();
523 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
524 local_bh_enable();
526 return sk;
529 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
531 return secure_tcp_sequence_number(skb->nh.iph->daddr,
532 skb->nh.iph->saddr,
533 skb->h.th->dest,
534 skb->h.th->source);
537 static int tcp_v4_check_established(struct sock *sk)
539 u32 daddr = sk->rcv_saddr;
540 u32 saddr = sk->daddr;
541 int dif = sk->bound_dev_if;
542 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
543 __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
544 int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
545 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
546 struct sock *sk2, **skp;
547 struct tcp_tw_bucket *tw;
549 write_lock_bh(&head->lock);
551 /* Check TIME-WAIT sockets first. */
552 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
553 skp = &sk2->next) {
554 tw = (struct tcp_tw_bucket*)sk2;
556 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
557 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
559 /* With PAWS, it is safe from the viewpoint
560 of data integrity. Even without PAWS it
561 is safe provided sequence spaces do not
562 overlap, i.e. at data rates <= 80 Mbit/sec.
564 Actually, the idea is close to VJ's one,
565 only the timestamp cache is held not per host,
566 but per port pair, and the TW bucket is used
567 as the state holder.
569 If the TW bucket has already been destroyed, we
570 fall back to VJ's scheme and use the initial
571 timestamp retrieved from the peer table.
573 if (tw->substate == TCP_TIME_WAIT &&
574 sysctl_tcp_tw_recycle && tw->ts_recent_stamp) {
575 if ((tp->write_seq = tw->snd_nxt + 2) == 0)
576 tp->write_seq = 1;
577 tp->ts_recent = tw->ts_recent;
578 tp->ts_recent_stamp = tw->ts_recent_stamp;
579 sock_hold(sk2);
580 skp = &head->chain;
581 goto unique;
582 } else
583 goto not_unique;
586 tw = NULL;
588 /* And established part... */
589 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
590 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
591 goto not_unique;
594 unique:
595 BUG_TRAP(sk->pprev==NULL);
596 if ((sk->next = *skp) != NULL)
597 (*skp)->pprev = &sk->next;
599 *skp = sk;
600 sk->pprev = skp;
601 sock_prot_inc_use(sk->prot);
602 write_unlock_bh(&head->lock);
604 if (tw) {
605 /* Silly. Should hash-dance instead... */
606 local_bh_disable();
607 tcp_tw_deschedule(tw);
608 tcp_timewait_kill(tw);
609 NET_INC_STATS_BH(TimeWaitRecycled);
610 local_bh_enable();
612 tcp_tw_put(tw);
615 return 0;
617 not_unique:
618 write_unlock_bh(&head->lock);
619 return -EADDRNOTAVAIL;
622 /* Hash SYN-SENT socket to established hash table after
623 * checking that it is unique. Note that without the kernel lock
624 * we MUST make these two operations atomic.
626 * Optimization: if it is bound and the tcp_bind_bucket has the only
627 * owner (us), we need not scan the established bucket.
630 int tcp_v4_hash_connecting(struct sock *sk)
632 unsigned short snum = sk->num;
633 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
634 struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
636 spin_lock_bh(&head->lock);
637 if (tb->owners == sk && sk->bind_next == NULL) {
638 __tcp_v4_hash(sk);
639 spin_unlock_bh(&head->lock);
640 return 0;
641 } else {
642 spin_unlock_bh(&head->lock);
644 /* No definite answer... Walk the established hash table. */
645 return tcp_v4_check_established(sk);
649 /* This will initiate an outgoing connection. */
650 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
652 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
653 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
654 struct sk_buff *buff;
655 struct rtable *rt;
656 u32 daddr, nexthop;
657 int tmp;
658 int err;
660 if (addr_len < sizeof(struct sockaddr_in))
661 return(-EINVAL);
663 if (usin->sin_family != AF_INET)
664 return(-EAFNOSUPPORT);
666 nexthop = daddr = usin->sin_addr.s_addr;
667 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
668 if (daddr == 0)
669 return -EINVAL;
670 nexthop = sk->protinfo.af_inet.opt->faddr;
673 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
674 RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
675 if (tmp < 0)
676 return tmp;
678 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
679 ip_rt_put(rt);
680 return -ENETUNREACH;
683 __sk_dst_set(sk, &rt->u.dst);
685 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
686 daddr = rt->rt_dst;
688 err = -ENOBUFS;
689 buff = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 0, GFP_KERNEL);
691 if (buff == NULL)
692 goto failure;
694 if (!sk->saddr)
695 sk->saddr = rt->rt_src;
696 sk->rcv_saddr = sk->saddr;
698 if (tp->ts_recent_stamp && sk->daddr != daddr) {
699 /* Reset inherited state */
700 tp->ts_recent = 0;
701 tp->ts_recent_stamp = 0;
702 tp->write_seq = 0;
705 if (sysctl_tcp_tw_recycle &&
706 !tp->ts_recent_stamp &&
707 rt->rt_dst == daddr) {
708 struct inet_peer *peer = rt_get_peer(rt);
710 /* VJ's idea. We save the last timestamp seen from
711 * the destination in the peer table when entering TIME-WAIT state,
712 * and initialize ts_recent from it when trying a new connection.
715 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
716 tp->ts_recent_stamp = peer->tcp_ts_stamp;
717 tp->ts_recent = peer->tcp_ts;
721 sk->dport = usin->sin_port;
722 sk->daddr = daddr;
724 if (!tp->write_seq)
725 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
726 sk->sport, usin->sin_port);
728 tp->ext_header_len = 0;
729 if (sk->protinfo.af_inet.opt)
730 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
732 tp->mss_clamp = 536;
734 err = tcp_connect(sk, buff);
735 if (err == 0)
736 return 0;
738 failure:
739 __sk_dst_reset(sk);
740 sk->dport = 0;
741 return err;
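/*
 * For reference, the userspace sequence that lands in tcp_v4_connect()
 * above: optionally bind() to pick a source address/port (which is what
 * tcp_v4_get_port() and tcp_v4_hash_connecting() later have to reconcile),
 * then connect(). A minimal sketch; the destination address is a made-up
 * TEST-NET example and error handling is compressed.
 */
#if 0	/* Illustrative userspace counterpart of the connect path above. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(80);			/* example destination */
	dst.sin_addr.s_addr = inet_addr("192.0.2.1");	/* TEST-NET address */

	/* This triggers tcp_v4_connect(): route lookup, source address and
	 * port selection, ISN choice, and the first SYN. */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");

	close(fd);
	return 0;
}
#endif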
744 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
746 return ((struct rtable*)skb->dst)->rt_iif;
749 static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
751 unsigned h = raddr ^ rport;
752 h ^= h>>16;
753 h ^= h>>8;
754 return h&(TCP_SYNQ_HSIZE-1);
757 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
758 struct iphdr *iph,
759 struct tcphdr *th,
760 struct open_request ***prevp)
762 struct tcp_listen_opt *lopt = tp->listen_opt;
763 struct open_request *req, **prev;
764 __u16 rport = th->source;
765 __u32 raddr = iph->saddr;
767 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
768 (req = *prev) != NULL;
769 prev = &req->dl_next) {
770 if (req->rmt_port == rport &&
771 req->af.v4_req.rmt_addr == raddr &&
772 req->af.v4_req.loc_addr == iph->daddr &&
773 TCP_INET_FAMILY(req->class->family)) {
774 BUG_TRAP(req->sk == NULL);
775 *prevp = prev;
776 return req;
780 return NULL;
783 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
785 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
786 struct tcp_listen_opt *lopt = tp->listen_opt;
787 unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
789 req->expires = jiffies + TCP_TIMEOUT_INIT;
790 req->retrans = 0;
791 req->sk = NULL;
792 req->index = h;
793 req->dl_next = lopt->syn_table[h];
795 write_lock(&tp->syn_wait_lock);
796 lopt->syn_table[h] = req;
797 write_unlock(&tp->syn_wait_lock);
799 tcp_synq_added(sk);
804 * This routine does path mtu discovery as defined in RFC1191.
806 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
808 struct dst_entry *dst;
809 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
811 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
812 * sent out by Linux are always < 576 bytes, so they should go through
813 * unfragmented).
815 if (sk->state == TCP_LISTEN)
816 return;
818 /* We don't check in the dst entry whether PMTU discovery is forbidden
819 * on this route. We just assume that no packet-too-big packets
820 * are sent back when PMTU discovery is not active.
821 * There is a small race when the user changes this flag in the
822 * route, but I think that's acceptable.
824 if ((dst = __sk_dst_check(sk, 0)) == NULL)
825 return;
827 ip_rt_update_pmtu(dst, mtu);
829 /* Something is about to go wrong... Remember the soft error
830 * in case this connection is not able to recover.
832 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
833 sk->err_soft = EMSGSIZE;
835 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
836 tp->pmtu_cookie > dst->pmtu) {
837 tcp_sync_mss(sk, dst->pmtu);
839 /* Resend the TCP packet because it's
840 * clear that the old packet has been
841 * dropped. This is the new "fast" path mtu
842 * discovery.
844 tcp_simple_retransmit(sk);
845 } /* else let the usual retransmit timer handle it */
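/*
 * The per-socket pmtudisc setting consulted above
 * (sk->protinfo.af_inet.pmtudisc) is controlled from userspace with the
 * IP_MTU_DISCOVER socket option, and the discovered value can be read back
 * with IP_MTU on a connected socket. A hedged sketch using the standard
 * Linux constants from <netinet/in.h>; 'fd' is assumed to be a connected
 * TCP socket.
 */
#if 0	/* Illustrative userspace sketch of the PMTU discovery knobs. */
#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>

static void show_pmtu(int fd)
{
	int val = IP_PMTUDISC_DO;	/* always set DF, rely on PMTU discovery */
	int mtu;
	socklen_t len = sizeof(mtu);

	if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val)) < 0)
		perror("IP_MTU_DISCOVER");

	if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
		printf("current path MTU: %d\n", mtu);
	else
		perror("IP_MTU");
}
#endif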
849 * This routine is called by the ICMP module when it gets some
850 * sort of error condition. If err < 0 then the socket should
851 * be closed and the error returned to the user. If err > 0
852 * it's just the icmp type << 8 | icmp code. After adjustment
853 * header points to the first 8 bytes of the tcp header. We need
854 * to find the appropriate port.
856 * The locking strategy used here is very "optimistic". When
857 * someone else accesses the socket the ICMP is just dropped
858 * and for some paths there is no check at all.
859 * A more general error queue to queue errors for later handling
860 * is probably better.
864 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
866 struct iphdr *iph = (struct iphdr*)dp;
867 struct tcphdr *th;
868 struct tcp_opt *tp;
869 int type = skb->h.icmph->type;
870 int code = skb->h.icmph->code;
871 #if ICMP_MIN_LENGTH < 14
872 int no_flags = 0;
873 #else
874 #define no_flags 0
875 #endif
876 struct sock *sk;
877 __u32 seq;
878 int err;
880 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
881 ICMP_INC_STATS_BH(IcmpInErrors);
882 return;
884 #if ICMP_MIN_LENGTH < 14
885 if (len < (iph->ihl << 2) + 14)
886 no_flags = 1;
887 #endif
889 th = (struct tcphdr*)(dp+(iph->ihl<<2));
891 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
892 if (sk == NULL) {
893 ICMP_INC_STATS_BH(IcmpInErrors);
894 return;
896 if (sk->state == TCP_TIME_WAIT) {
897 tcp_tw_put((struct tcp_tw_bucket*)sk);
898 return;
901 bh_lock_sock(sk);
902 /* If too many ICMPs get dropped on busy
903 * servers this needs to be solved differently.
905 if (sk->lock.users != 0)
906 NET_INC_STATS_BH(LockDroppedIcmps);
908 if (sk->state == TCP_CLOSE)
909 goto out;
911 tp = &sk->tp_pinfo.af_tcp;
912 seq = ntohl(th->seq);
913 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
914 NET_INC_STATS(OutOfWindowIcmps);
915 goto out;
918 switch (type) {
919 case ICMP_SOURCE_QUENCH:
920 /* This is deprecated, but if someone generated it,
921 * we have no reason to ignore it.
923 if (sk->lock.users == 0)
924 tcp_enter_cong_avoid(tp);
925 goto out;
926 case ICMP_PARAMETERPROB:
927 err = EPROTO;
928 break;
929 case ICMP_DEST_UNREACH:
930 if (code > NR_ICMP_UNREACH)
931 goto out;
933 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
934 if (sk->lock.users == 0)
935 do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
936 goto out;
939 err = icmp_err_convert[code].errno;
940 break;
941 case ICMP_TIME_EXCEEDED:
942 err = EHOSTUNREACH;
943 break;
944 default:
945 goto out;
948 switch (sk->state) {
949 struct open_request *req, **prev;
950 case TCP_LISTEN:
951 if (sk->lock.users != 0)
952 goto out;
954 /* The final ACK of the handshake should already be
955 * handled in the new socket context, not here.
956 * Strictly speaking - an ICMP error for the final
957 * ACK should set the opening flag, but that is too
958 * complicated right now.
960 if (!no_flags && !th->syn && !th->ack)
961 goto out;
963 req = tcp_v4_search_req(tp, iph, th, &prev);
964 if (!req)
965 goto out;
967 /* ICMPs are not backlogged, hence we cannot get
968 an established socket here.
970 BUG_TRAP(req->sk == NULL);
972 if (seq != req->snt_isn) {
973 NET_INC_STATS_BH(OutOfWindowIcmps);
974 goto out;
978 * Still in SYN_RECV, just remove it silently.
979 * There is no good way to pass the error to the newly
980 * created socket, and POSIX does not want network
981 * errors returned from accept().
983 tcp_synq_drop(sk, req, prev);
984 goto out;
986 case TCP_SYN_SENT:
987 case TCP_SYN_RECV: /* Cannot happen.
988 It can, e.g., if SYNs crossed.
990 if (!no_flags && !th->syn)
991 goto out;
992 if (sk->lock.users == 0) {
993 TCP_INC_STATS_BH(TcpAttemptFails);
994 sk->err = err;
996 sk->error_report(sk);
998 tcp_done(sk);
999 } else {
1000 sk->err_soft = err;
1002 goto out;
1005 /* If we've already connected we will keep trying
1006 * until we time out, or the user gives up.
1008 * RFC 1122 4.2.3.9 allows us to consider as hard errors
1009 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1010 * but it is obsoleted by PMTU discovery).
1012 * Note that in the modern internet, where routing is unreliable
1013 * and broken firewalls sit in every dark corner, sending random
1014 * errors ordered by their masters, even these two messages finally lose
1015 * their original sense (even Linux sends invalid PORT_UNREACHs).
1017 * Now we are in compliance with RFCs.
1018 * --ANK (980905)
1021 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1022 sk->err = err;
1023 sk->error_report(sk);
1024 } else { /* Only an error on timeout */
1025 sk->err_soft = err;
1028 out:
1029 bh_unlock_sock(sk);
1030 sock_put(sk);
1033 /* This routine computes an IPv4 TCP checksum. */
1034 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1035 struct sk_buff *skb)
1037 th->check = 0;
1038 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1039 csum_partial((char *)th, th->doff<<2, skb->csum));
1043 * This routine will send an RST to the other tcp.
1045 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1046 * for reset.
1047 * Answer: if a packet caused an RST, it is not for a socket
1048 * existing in our system; if it is matched to a socket,
1049 * it is just a duplicate segment or a bug in the other side's TCP.
1050 * So we build the reply based only on the parameters
1051 * that arrived with the segment.
1052 * Exception: precedence violation. We do not implement it in any case.
1055 static void tcp_v4_send_reset(struct sk_buff *skb)
1057 struct tcphdr *th = skb->h.th;
1058 struct tcphdr rth;
1059 struct ip_reply_arg arg;
1061 /* Never send a reset in response to a reset. */
1062 if (th->rst)
1063 return;
1065 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1066 return;
1068 /* Swap the send and the receive. */
1069 memset(&rth, 0, sizeof(struct tcphdr));
1070 rth.dest = th->source;
1071 rth.source = th->dest;
1072 rth.doff = sizeof(struct tcphdr)/4;
1073 rth.rst = 1;
1075 if (th->ack) {
1076 rth.seq = th->ack_seq;
1077 } else {
1078 rth.ack = 1;
1079 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1080 + skb->len - (th->doff<<2));
1083 memset(&arg, 0, sizeof arg);
1084 arg.iov[0].iov_base = (unsigned char *)&rth;
1085 arg.iov[0].iov_len = sizeof rth;
1086 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1087 skb->nh.iph->saddr, /*XXX*/
1088 sizeof(struct tcphdr),
1089 IPPROTO_TCP,
1090 0);
1091 arg.n_iov = 1;
1092 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1094 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1096 TCP_INC_STATS_BH(TcpOutSegs);
1097 TCP_INC_STATS_BH(TcpOutRsts);
1100 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1101 outside of socket context, is certainly ugly. What can I do?
1104 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1106 struct tcphdr *th = skb->h.th;
1107 struct {
1108 struct tcphdr th;
1109 u32 tsopt[3];
1110 } rep;
1111 struct ip_reply_arg arg;
1113 memset(&rep.th, 0, sizeof(struct tcphdr));
1114 memset(&arg, 0, sizeof arg);
1116 arg.iov[0].iov_base = (unsigned char *)&rep;
1117 arg.iov[0].iov_len = sizeof(rep.th);
1118 arg.n_iov = 1;
1119 if (ts) {
1120 rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
1121 (TCPOPT_NOP << 16) |
1122 (TCPOPT_TIMESTAMP << 8) |
1123 TCPOLEN_TIMESTAMP);
1124 rep.tsopt[1] = htonl(tcp_time_stamp);
1125 rep.tsopt[2] = htonl(ts);
1126 arg.iov[0].iov_len = sizeof(rep);
1129 /* Swap the send and the receive. */
1130 rep.th.dest = th->source;
1131 rep.th.source = th->dest;
1132 rep.th.doff = arg.iov[0].iov_len/4;
1133 rep.th.seq = htonl(seq);
1134 rep.th.ack_seq = htonl(ack);
1135 rep.th.ack = 1;
1136 rep.th.window = htons(win);
1138 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1139 skb->nh.iph->saddr, /*XXX*/
1140 arg.iov[0].iov_len,
1141 IPPROTO_TCP,
1143 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1145 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1147 TCP_INC_STATS_BH(TcpOutSegs);
1150 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1152 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1154 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1155 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1157 tcp_tw_put(tw);
1160 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1162 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1163 req->ts_recent);
1166 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1168 struct rtable *rt;
1169 struct ip_options *opt;
1171 opt = req->af.v4_req.opt;
1172 if(ip_route_output(&rt, ((opt && opt->srr) ?
1173 opt->faddr :
1174 req->af.v4_req.rmt_addr),
1175 req->af.v4_req.loc_addr,
1176 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1177 sk->bound_dev_if)) {
1178 IP_INC_STATS_BH(IpOutNoRoutes);
1179 return NULL;
1181 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1182 ip_rt_put(rt);
1183 IP_INC_STATS_BH(IpOutNoRoutes);
1184 return NULL;
1186 return &rt->u.dst;
1190 * Send a SYN-ACK after having received a SYN.
1191 * This still operates on an open_request only, not on a big
1192 * socket.
1194 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1195 struct dst_entry *dst)
1197 int err = -1;
1198 struct sk_buff * skb;
1200 /* First, grab a route. */
1201 if (dst == NULL &&
1202 (dst = tcp_v4_route_req(sk, req)) == NULL)
1203 goto out;
1205 skb = tcp_make_synack(sk, dst, req);
1207 if (skb) {
1208 struct tcphdr *th = skb->h.th;
1210 th->check = tcp_v4_check(th, skb->len,
1211 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1212 csum_partial((char *)th, skb->len, skb->csum));
1214 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1215 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1216 if (err == NET_XMIT_CN)
1217 err = 0;
1220 out:
1221 dst_release(dst);
1222 return err;
1226 * IPv4 open_request destructor.
1228 static void tcp_v4_or_free(struct open_request *req)
1230 if (req->af.v4_req.opt)
1231 kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
1234 static inline void syn_flood_warning(struct sk_buff *skb)
1236 static unsigned long warntime;
1238 if (jiffies - warntime > HZ*60) {
1239 warntime = jiffies;
1240 printk(KERN_INFO
1241 "possible SYN flooding on port %d. Sending cookies.\n",
1242 ntohs(skb->h.th->dest));
1247 * Save and compile IPv4 options into the open_request if needed.
1249 static inline struct ip_options *
1250 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1252 struct ip_options *opt = &(IPCB(skb)->opt);
1253 struct ip_options *dopt = NULL;
1255 if (opt && opt->optlen) {
1256 int opt_size = optlength(opt);
1257 dopt = kmalloc(opt_size, GFP_ATOMIC);
1258 if (dopt) {
1259 if (ip_options_echo(dopt, skb)) {
1260 kfree_s(dopt, opt_size);
1261 dopt = NULL;
1265 return dopt;
1269 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1270 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1271 * It would be better to replace it with a global counter for all sockets,
1272 * but then some measure against one socket starving all other sockets
1273 * would be needed.
1275 * It was 128 by default. Experiments with real servers show that
1276 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1277 * of the problems. This value is adjusted to 128 for very small machines
1278 * (<= 32 MB of memory) and to 1024 on normal or better ones (>= 256 MB).
1279 * Increasing it further requires changing the hash table size.
1281 int sysctl_max_syn_backlog = 256;
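/*
 * Both knobs used by tcp_v4_conn_request() below are exported through
 * procfs: sysctl_max_syn_backlog as /proc/sys/net/ipv4/tcp_max_syn_backlog
 * and sysctl_tcp_syncookies (only when CONFIG_SYN_COOKIES is set) as
 * /proc/sys/net/ipv4/tcp_syncookies. A small sketch that merely reads them;
 * illustrative only.
 */
#if 0	/* Illustrative userspace sketch: reading the SYN-queue knobs. */
#include <stdio.h>

static void show_sysctl(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (f == NULL)
		return;		/* e.g. tcp_syncookies absent without CONFIG_SYN_COOKIES */
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	fclose(f);
}

int main(void)
{
	show_sysctl("/proc/sys/net/ipv4/tcp_max_syn_backlog");
	show_sysctl("/proc/sys/net/ipv4/tcp_syncookies");
	return 0;
}
#endif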
1283 struct or_calltable or_ipv4 = {
1284 PF_INET,
1285 tcp_v4_send_synack,
1286 tcp_v4_or_send_ack,
1287 tcp_v4_or_free,
1288 tcp_v4_send_reset
1291 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1293 struct tcp_opt tp;
1294 struct open_request *req;
1295 struct tcphdr *th = skb->h.th;
1296 __u32 saddr = skb->nh.iph->saddr;
1297 __u32 daddr = skb->nh.iph->daddr;
1298 __u32 isn = TCP_SKB_CB(skb)->when;
1299 struct dst_entry *dst = NULL;
1300 #ifdef CONFIG_SYN_COOKIES
1301 int want_cookie = 0;
1302 #else
1303 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1304 #endif
1306 /* Never answer SYNs sent to broadcast or multicast. */
1307 if (((struct rtable *)skb->dst)->rt_flags &
1308 (RTCF_BROADCAST|RTCF_MULTICAST))
1309 goto drop;
1311 /* TW buckets are converted to open requests without
1312 * limitations; they conserve resources and the peer is
1313 * evidently a real one.
1315 if (tcp_synq_is_full(sk) && !isn) {
1316 #ifdef CONFIG_SYN_COOKIES
1317 if (sysctl_tcp_syncookies) {
1318 want_cookie = 1;
1319 } else
1320 #endif
1321 goto drop;
1324 /* Accept backlog is full. If we have already queued enough
1325 * warm entries in the SYN queue, drop the request. That is better than
1326 * clogging the SYN queue with openreqs with exponentially increasing
1327 * timeouts.
1329 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1330 goto drop;
1332 req = tcp_openreq_alloc();
1333 if (req == NULL)
1334 goto drop;
1336 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1337 tp.mss_clamp = 536;
1338 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1340 tcp_parse_options(NULL, th, &tp, want_cookie);
1342 tcp_openreq_init(req, &tp, skb);
1344 req->af.v4_req.loc_addr = daddr;
1345 req->af.v4_req.rmt_addr = saddr;
1346 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1347 req->class = &or_ipv4;
1349 if (want_cookie) {
1350 #ifdef CONFIG_SYN_COOKIES
1351 syn_flood_warning(skb);
1352 #endif
1353 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1354 } else if (isn == 0) {
1355 struct inet_peer *peer = NULL;
1357 /* VJ's idea. We save the last timestamp seen
1358 * from the destination in the peer table when entering
1359 * TIME-WAIT state, and check against it before
1360 * accepting a new connection request.
1362 * If "isn" is not zero, this request hit an alive
1363 * timewait bucket, so all the necessary checks
1364 * are made in the function processing the timewait state.
1366 if (tp.saw_tstamp &&
1367 sysctl_tcp_tw_recycle &&
1368 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1369 (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1370 peer->v4daddr == saddr) {
1371 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1372 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1373 NETDEBUG(printk(KERN_DEBUG "TW_REC: reject openreq %u/%u %08x/%u\n", peer->tcp_ts, req->ts_recent, saddr, ntohs(skb->h.th->source)));
1374 NET_INC_STATS_BH(PAWSPassiveRejected);
1375 dst_release(dst);
1376 goto drop_and_free;
1379 /* Kill the following clause, if you dislike this way. */
1380 else if (!sysctl_tcp_syncookies &&
1381 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1382 < (sysctl_max_syn_backlog>>2)) &&
1383 (!peer || !peer->tcp_ts_stamp) &&
1384 (!dst || !dst->rtt)) {
1385 /* Without syncookies the last quarter of the
1386 * backlog is filled only with destinations proven to be alive.
1387 * It means that we continue to communicate
1388 * with destinations already remembered
1389 * at the moment of the synflood.
1391 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: drop open request from %08x/%u\n", saddr, ntohs(skb->h.th->source)));
1392 TCP_INC_STATS_BH(TcpAttemptFails);
1393 dst_release(dst);
1394 goto drop_and_free;
1397 isn = tcp_v4_init_sequence(sk, skb);
1399 req->snt_isn = isn;
1401 if (tcp_v4_send_synack(sk, req, dst))
1402 goto drop_and_free;
1404 if (want_cookie) {
1405 tcp_openreq_free(req);
1406 } else {
1407 tcp_v4_synq_add(sk, req);
1409 return 0;
1411 drop_and_free:
1412 tcp_openreq_free(req);
1413 drop:
1414 TCP_INC_STATS_BH(TcpAttemptFails);
1415 return 0;
1420 * The three way handshake has completed - we got a valid synack -
1421 * now create the new socket.
1423 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1424 struct open_request *req,
1425 struct dst_entry *dst)
1427 struct tcp_opt *newtp;
1428 struct sock *newsk;
1430 if (tcp_acceptq_is_full(sk))
1431 goto exit_overflow;
1433 if (dst == NULL &&
1434 (dst = tcp_v4_route_req(sk, req)) == NULL)
1435 goto exit;
1437 newsk = tcp_create_openreq_child(sk, req, skb);
1438 if (!newsk)
1439 goto exit;
1441 newsk->dst_cache = dst;
1443 newtp = &(newsk->tp_pinfo.af_tcp);
1444 newsk->daddr = req->af.v4_req.rmt_addr;
1445 newsk->saddr = req->af.v4_req.loc_addr;
1446 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1447 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1448 req->af.v4_req.opt = NULL;
1449 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1450 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1451 newtp->ext_header_len = 0;
1452 if (newsk->protinfo.af_inet.opt)
1453 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1455 tcp_sync_mss(newsk, dst->pmtu);
1456 tcp_initialize_rcv_mss(newsk);
1457 newtp->advmss = dst->advmss;
1459 tcp_init_buffer_space(newsk);
1461 __tcp_v4_hash(newsk);
1462 __tcp_inherit_port(sk, newsk);
1464 return newsk;
1466 exit_overflow:
1467 NET_INC_STATS_BH(ListenOverflows);
1468 exit:
1469 NET_INC_STATS_BH(ListenDrops);
1470 dst_release(dst);
1471 return NULL;
1474 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1476 struct open_request *req, **prev;
1477 struct tcphdr *th = skb->h.th;
1478 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1480 /* Find possible connection requests. */
1481 req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);
1482 if (req)
1483 return tcp_check_req(sk, skb, req, prev);
1485 if (tp->accept_queue) {
1486 struct sock *nsk;
1488 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1489 th->source,
1490 skb->nh.iph->daddr,
1491 ntohs(th->dest),
1492 tcp_v4_iif(skb));
1494 if (nsk) {
1495 if (nsk->state != TCP_TIME_WAIT) {
1496 bh_lock_sock(nsk);
1497 return nsk;
1499 tcp_tw_put((struct tcp_tw_bucket*)sk);
1500 return NULL;
1504 #ifdef CONFIG_SYN_COOKIES
1505 if (!th->rst && (th->syn || th->ack))
1506 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1507 #endif
1508 return sk;
1511 static int tcp_v4_checksum_init(struct sk_buff *skb)
1513 if (skb->ip_summed == CHECKSUM_HW) {
1514 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1515 skb->nh.iph->daddr,skb->csum)) {
1516 NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1517 return -1;
1519 skb->ip_summed = CHECKSUM_UNNECESSARY;
1520 } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
1521 if (skb->len <= 68) {
1522 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1523 skb->nh.iph->daddr,
1524 csum_partial((char *)skb->h.th, skb->len, 0)))
1525 return -1;
1526 skb->ip_summed = CHECKSUM_UNNECESSARY;
1527 } else {
1528 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1529 skb->nh.iph->daddr,0);
1532 return 0;
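/*
 * What tcp_v4_check()/csum_tcpudp_nofold() compute above is the standard
 * RFC 1071 one's-complement sum over the pseudo-header plus the TCP segment.
 * A portable userspace sketch of the same arithmetic; the kernel versions
 * are optimized per-architecture, this is only meant to show the math. The
 * checksum field inside 'seg' must be zero before calling.
 */
#if 0	/* Illustrative sketch of the TCP checksum arithmetic (RFC 1071). */
#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Sum 16-bit big-endian words in one's complement; 'len' may be odd. */
static uint32_t csum_add(uint32_t sum, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;	/* pad the odd final byte */
	return sum;
}

/* saddr/daddr in network byte order, seg = TCP header + payload. */
static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
			     const void *seg, size_t len)
{
	uint32_t sum = 0;
	uint32_t s = ntohl(saddr), d = ntohl(daddr);

	/* Pseudo-header: src, dst, zero, protocol (6), TCP length. */
	sum += (s >> 16) + (s & 0xffff);
	sum += (d >> 16) + (d & 0xffff);
	sum += IPPROTO_TCP;
	sum += (uint32_t)len;

	sum = csum_add(sum, seg, len);
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
	return htons((uint16_t)~sum);
}
#endif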
1536 /* The socket must have its spinlock held when we get
1537 * here.
1539 * We have a potential double-lock case here, so even when
1540 * doing backlog processing we use the BH locking scheme.
1541 * This is because we cannot sleep with the original spinlock
1542 * held.
1544 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1546 #ifdef CONFIG_FILTER
1547 struct sk_filter *filter = sk->filter;
1548 if (filter && sk_filter(skb, filter))
1549 goto discard;
1550 #endif /* CONFIG_FILTER */
1552 IP_INC_STATS_BH(IpInDelivers);
1554 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1555 TCP_CHECK_TIMER(sk);
1556 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1557 goto reset;
1558 TCP_CHECK_TIMER(sk);
1559 return 0;
1562 if (tcp_checksum_complete(skb))
1563 goto csum_err;
1565 if (sk->state == TCP_LISTEN) {
1566 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1567 if (!nsk)
1568 goto discard;
1570 if (nsk != sk) {
1571 if (tcp_child_process(sk, nsk, skb))
1572 goto reset;
1573 return 0;
1577 TCP_CHECK_TIMER(sk);
1578 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1579 goto reset;
1580 TCP_CHECK_TIMER(sk);
1581 return 0;
1583 reset:
1584 tcp_v4_send_reset(skb);
1585 discard:
1586 kfree_skb(skb);
1587 /* Be careful here. If this function gets more complicated and
1588 * gcc suffers from register pressure on the x86, sk (in %ebx)
1589 * might be destroyed here. This current version compiles correctly,
1590 * but you have been warned.
1592 return 0;
1594 csum_err:
1595 TCP_INC_STATS_BH(TcpInErrs);
1596 goto discard;
1600 * From tcp_input.c
1603 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1605 struct tcphdr *th;
1606 struct sock *sk;
1607 int ret;
1609 if (skb->pkt_type!=PACKET_HOST)
1610 goto discard_it;
1612 th = skb->h.th;
1614 /* Pull up the IP header. */
1615 __skb_pull(skb, skb->h.raw - skb->data);
1617 /* Count it even if it's bad */
1618 TCP_INC_STATS_BH(TcpInSegs);
1620 if (len < sizeof(struct tcphdr))
1621 goto bad_packet;
1623 if (tcp_v4_checksum_init(skb) < 0)
1624 goto bad_packet;
1626 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1627 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1628 len - th->doff*4);
1629 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1630 TCP_SKB_CB(skb)->when = 0;
1631 skb->used = 0;
1633 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1634 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1636 if (!sk)
1637 goto no_tcp_socket;
1639 process:
1640 if(!ipsec_sk_policy(sk,skb))
1641 goto discard_and_relse;
1643 if (sk->state == TCP_TIME_WAIT)
1644 goto do_time_wait;
1646 bh_lock_sock(sk);
1647 ret = 0;
1648 if (!sk->lock.users) {
1649 if (!tcp_prequeue(sk, skb))
1650 ret = tcp_v4_do_rcv(sk, skb);
1651 } else
1652 sk_add_backlog(sk, skb);
1653 bh_unlock_sock(sk);
1655 sock_put(sk);
1657 return ret;
1659 no_tcp_socket:
1660 if (tcp_checksum_complete(skb)) {
1661 bad_packet:
1662 TCP_INC_STATS_BH(TcpInErrs);
1663 } else {
1664 tcp_v4_send_reset(skb);
1667 discard_it:
1668 /* Discard frame. */
1669 kfree_skb(skb);
1670 return 0;
1672 discard_and_relse:
1673 sock_put(sk);
1674 goto discard_it;
1676 do_time_wait:
1677 if (tcp_checksum_complete(skb)) {
1678 TCP_INC_STATS_BH(TcpInErrs);
1679 goto discard_and_relse;
1681 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1682 skb, th, skb->len)) {
1683 case TCP_TW_SYN:
1685 struct sock *sk2;
1687 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1688 if (sk2 != NULL) {
1689 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1690 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1691 tcp_tw_put((struct tcp_tw_bucket *)sk);
1692 sk = sk2;
1693 goto process;
1695 /* Fall through to ACK */
1697 case TCP_TW_ACK:
1698 tcp_v4_timewait_ack(sk, skb);
1699 break;
1700 case TCP_TW_RST:
1701 goto no_tcp_socket;
1702 case TCP_TW_SUCCESS:
1704 goto discard_it;
1707 /* With per-bucket locks this operation is not atomic, so
1708 * this version is no worse.
1710 static void __tcp_v4_rehash(struct sock *sk)
1712 sk->prot->unhash(sk);
1713 sk->prot->hash(sk);
1716 int tcp_v4_rebuild_header(struct sock *sk)
1718 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1719 __u32 new_saddr;
1720 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
1722 if (rt == NULL) {
1723 int err;
1725 u32 daddr = sk->daddr;
1727 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1728 daddr = sk->protinfo.af_inet.opt->faddr;
1730 err = ip_route_output(&rt, daddr, sk->saddr,
1731 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1732 sk->bound_dev_if);
1733 if (err) {
1734 sk->err_soft=-err;
1735 sk->error_report(sk);
1736 return -1;
1738 __sk_dst_set(sk, &rt->u.dst);
1741 /* Force route checking if want_rewrite.
1742 * The idea is good, the implementation is disgusting.
1743 * Well, if I bound this socket, you cannot randomly overwrite
1744 * its source address. --ANK
1746 if (want_rewrite) {
1747 int tmp;
1748 struct rtable *new_rt;
1749 __u32 old_saddr = rt->rt_src;
1751 /* Query new route using another rt buffer */
1752 tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
1753 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1754 sk->bound_dev_if);
1756 /* Only useful if different source addrs */
1757 if (tmp == 0) {
1759 * Only useful if different source addrs
1761 if (new_rt->rt_src != old_saddr ) {
1762 __sk_dst_set(sk, &new_rt->u.dst);
1763 rt = new_rt;
1764 goto do_rewrite;
1766 dst_release(&new_rt->u.dst);
1770 return 0;
1772 do_rewrite:
1773 new_saddr = rt->rt_src;
1775 /* Ouch! This should not happen. */
1776 if (!sk->saddr || !sk->rcv_saddr) {
1777 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
1778 "saddr=%08X rcv_saddr=%08X\n",
1779 ntohl(sk->saddr),
1780 ntohl(sk->rcv_saddr));
1781 return -1;
1784 if (new_saddr != sk->saddr) {
1785 if (sysctl_ip_dynaddr > 1) {
1786 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1787 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1788 NIPQUAD(sk->saddr),
1789 NIPQUAD(new_saddr));
1792 sk->saddr = new_saddr;
1793 sk->rcv_saddr = new_saddr;
1795 /* XXX The only one ugly spot where we need to
1796 * XXX really change the sockets identity after
1797 * XXX it has entered the hashes. -DaveM
1799 * Besides that, it does not check for connection
1800 * uniqueness. Wait for troubles.
1802 __tcp_v4_rehash(sk);
1805 return 0;
1808 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1810 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1812 sin->sin_family = AF_INET;
1813 sin->sin_addr.s_addr = sk->daddr;
1814 sin->sin_port = sk->dport;
1817 /* VJ's idea. Save the last timestamp seen from this destination
1818 * and hold it at least for the normal time-wait interval, to use for duplicate
1819 * segment detection in subsequent connections before they enter the synchronized
1820 * state.
1823 int tcp_v4_remember_stamp(struct sock *sk)
1825 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1826 struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1827 struct inet_peer *peer = NULL;
1828 int release_it = 0;
1830 if (rt == NULL || rt->rt_dst != sk->daddr) {
1831 peer = inet_getpeer(sk->daddr, 1);
1832 release_it = 1;
1833 } else {
1834 if (rt->peer == NULL)
1835 rt_bind_peer(rt, 1);
1836 peer = rt->peer;
1839 if (peer) {
1840 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1841 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1842 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1843 peer->tcp_ts_stamp = tp->ts_recent_stamp;
1844 peer->tcp_ts = tp->ts_recent;
1846 if (release_it)
1847 inet_putpeer(peer);
1848 return 1;
1851 return 0;
1854 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1856 struct inet_peer *peer = NULL;
1858 peer = inet_getpeer(tw->daddr, 1);
1860 if (peer) {
1861 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1862 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1863 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1864 peer->tcp_ts_stamp = tw->ts_recent_stamp;
1865 peer->tcp_ts = tw->ts_recent;
1867 inet_putpeer(peer);
1868 return 1;
1871 return 0;
1874 struct tcp_func ipv4_specific = {
1875 ip_queue_xmit,
1876 tcp_v4_send_check,
1877 tcp_v4_rebuild_header,
1878 tcp_v4_conn_request,
1879 tcp_v4_syn_recv_sock,
1880 tcp_v4_hash_connecting,
1881 tcp_v4_remember_stamp,
1882 sizeof(struct iphdr),
1884 ip_setsockopt,
1885 ip_getsockopt,
1886 v4_addr2sockaddr,
1887 sizeof(struct sockaddr_in)
1890 /* NOTE: A lot of things are set to zero explicitly by the call to
1891 * sk_alloc(), so they need not be done here.
1893 static int tcp_v4_init_sock(struct sock *sk)
1895 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1897 skb_queue_head_init(&tp->out_of_order_queue);
1898 tcp_init_xmit_timers(sk);
1899 tcp_prequeue_init(tp);
1901 tp->rto = TCP_TIMEOUT_INIT;
1902 tp->mdev = TCP_TIMEOUT_INIT;
1904 /* So many TCP implementations out there (incorrectly) count the
1905 * initial SYN frame in their delayed-ACK and congestion control
1906 * algorithms that we must have the following bandaid to talk
1907 * efficiently to them. -DaveM
1909 tp->snd_cwnd = 2;
1911 /* See draft-stevens-tcpca-spec-01 for discussion of the
1912 * initialization of these values.
1914 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1915 tp->snd_cwnd_clamp = ~0;
1916 tp->mss_cache = 536;
1918 sk->state = TCP_CLOSE;
1920 sk->write_space = tcp_write_space;
1922 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1924 return 0;
1927 static int tcp_v4_destroy_sock(struct sock *sk)
1929 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1931 tcp_clear_xmit_timers(sk);
1933 /* Clean up the write buffer. */
1934 __skb_queue_purge(&sk->write_queue);
1936 /* Clean up our, hopefully empty, out_of_order_queue. */
1937 __skb_queue_purge(&tp->out_of_order_queue);
1939 /* Clean the prequeue; it really must be empty. */
1940 __skb_queue_purge(&tp->ucopy.prequeue);
1942 /* Clean up a referenced TCP bind bucket. */
1943 if(sk->prev != NULL)
1944 tcp_put_port(sk);
1946 return 0;
1949 /* Proc filesystem TCP sock list dumping. */
1950 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i)
1952 int ttd = req->expires - jiffies;
1954 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1955 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
1957 req->af.v4_req.loc_addr,
1958 ntohs(sk->sport),
1959 req->af.v4_req.rmt_addr,
1960 ntohs(req->rmt_port),
1961 TCP_SYN_RECV,
1962 0,0, /* could print option size, but that is af dependent. */
1963 1, /* timers active (only the expire timer) */
1964 ttd,
1965 req->retrans,
1966 sk->socket ? sk->socket->inode->i_uid : 0,
1967 0, /* non standard timer */
1968 0, /* open_requests have no inode */
1969 atomic_read(&sk->refcnt),
1974 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
1976 unsigned int dest, src;
1977 __u16 destp, srcp;
1978 int timer_active;
1979 unsigned long timer_expires;
1980 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
1982 dest = sp->daddr;
1983 src = sp->rcv_saddr;
1984 destp = ntohs(sp->dport);
1985 srcp = ntohs(sp->sport);
1986 timer_active = 0;
1987 timer_expires = (unsigned) -1;
1988 if (tp->retransmit_timer.prev != NULL && tp->retransmit_timer.expires < timer_expires) {
1989 timer_active = 1;
1990 timer_expires = tp->retransmit_timer.expires;
1991 } else if (tp->probe_timer.prev != NULL && tp->probe_timer.expires < timer_expires) {
1992 timer_active = 4;
1993 timer_expires = tp->probe_timer.expires;
1995 if (sp->timer.prev != NULL && sp->timer.expires < timer_expires) {
1996 timer_active = 2;
1997 timer_expires = sp->timer.expires;
1999 if(timer_active == 0)
2000 timer_expires = jiffies;
2002 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2003 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p %u %u %u %u",
2004 i, src, srcp, dest, destp, sp->state,
2005 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2006 timer_active, timer_expires-jiffies,
2007 tp->retransmits,
2008 sp->socket ? sp->socket->inode->i_uid : 0,
2009 tp->probes_out,
2010 sp->socket ? sp->socket->inode->i_ino : 0,
2011 atomic_read(&sp->refcnt), sp,
2012 tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong
2016 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2018 unsigned int dest, src;
2019 __u16 destp, srcp;
2020 int ttd = tw->ttd - jiffies;
2022 if (ttd < 0)
2023 ttd = 0;
2025 dest = tw->daddr;
2026 src = tw->rcv_saddr;
2027 destp = ntohs(tw->dport);
2028 srcp = ntohs(tw->sport);
2030 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2031 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2032 i, src, srcp, dest, destp, tw->substate, 0, 0,
2033 3, ttd, 0, 0, 0, 0,
2034 atomic_read(&tw->refcnt), tw);
2037 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2039 int len = 0, num = 0, i;
2040 off_t begin, pos = 0;
2041 char tmpbuf[129];
2043 if (offset < 128)
2044 len += sprintf(buffer, "%-127s\n",
2045 " sl local_address rem_address st tx_queue "
2046 "rx_queue tr tm->when retrnsmt uid timeout inode");
2048 pos = 128;
2050 /* First, walk listening socket table. */
2051 tcp_listen_lock();
2052 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2053 struct sock *sk = tcp_listening_hash[i];
2054 struct tcp_listen_opt *lopt;
2055 int k;
2057 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2058 struct open_request *req;
2059 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2061 if (!TCP_INET_FAMILY(sk->family))
2062 goto skip_listen;
2064 pos += 128;
2065 if (pos >= offset) {
2066 get_tcp_sock(sk, tmpbuf, num);
2067 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2068 if (len >= length) {
2069 tcp_listen_unlock();
2070 goto out_no_bh;
2074 skip_listen:
2075 read_lock_bh(&tp->syn_wait_lock);
2076 lopt = tp->listen_opt;
2077 if (lopt && lopt->qlen != 0) {
2078 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2079 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2080 if (!TCP_INET_FAMILY(req->class->family))
2081 continue;
2083 pos += 128;
2084 if (pos < offset)
2085 continue;
2086 get_openreq(sk, req, tmpbuf, num);
2087 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2088 if(len >= length) {
2089 read_unlock_bh(&tp->syn_wait_lock);
2090 tcp_listen_unlock();
2091 goto out_no_bh;
2096 read_unlock_bh(&tp->syn_wait_lock);
2098 /* Completed requests are in normal socket hash table */
2101 tcp_listen_unlock();
2103 local_bh_disable();
2105 /* Next, walk established hash chain. */
2106 for (i = 0; i < tcp_ehash_size; i++) {
2107 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2108 struct sock *sk;
2109 struct tcp_tw_bucket *tw;
2111 read_lock(&head->lock);
2112 for(sk = head->chain; sk; sk = sk->next, num++) {
2113 if (!TCP_INET_FAMILY(sk->family))
2114 continue;
2115 pos += 128;
2116 if (pos < offset)
2117 continue;
2118 get_tcp_sock(sk, tmpbuf, num);
2119 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2120 if(len >= length) {
2121 read_unlock(&head->lock);
2122 goto out;
2125 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2126 tw != NULL;
2127 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2128 if (!TCP_INET_FAMILY(tw->family))
2129 continue;
2130 pos += 128;
2131 if (pos < offset)
2132 continue;
2133 get_timewait_sock(tw, tmpbuf, num);
2134 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2135 if(len >= length) {
2136 read_unlock(&head->lock);
2137 goto out;
2140 read_unlock(&head->lock);
2143 out:
2144 local_bh_enable();
2145 out_no_bh:
2147 begin = len - (pos - offset);
2148 *start = buffer + begin;
2149 len -= begin;
2150 if(len > length)
2151 len = length;
2152 if (len < 0)
2153 len = 0;
2154 return len;
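/*
 * The lines emitted by get_tcp_sock()/get_timewait_sock() above print the
 * 32-bit address exactly as it sits in memory (network byte order) with
 * %08X, and the port with %04X after ntohs(). A sketch of decoding one such
 * "local_address" field back on the same machine; the sample string is made
 * up, and on a little-endian box the hex address reads byte-reversed.
 */
#if 0	/* Illustrative sketch: decoding a /proc/net/tcp address field. */
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	const char *field = "0100007F:0016";	/* 127.0.0.1:22 on little-endian */
	unsigned int raw;
	unsigned int port;
	struct in_addr a;

	if (sscanf(field, "%x:%x", &raw, &port) != 2)
		return 1;

	/* %08X printed the raw u32 holding the address, so storing the
	 * parsed value back into a u32 on the same machine recovers the
	 * original network-order bytes. The port was printed in host order. */
	a.s_addr = raw;
	printf("%s:%u\n", inet_ntoa(a), port);
	return 0;
}
#endif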
2157 struct proto tcp_prot = {
2158 tcp_close, /* close */
2159 tcp_v4_connect, /* connect */
2160 tcp_disconnect, /* disconnect */
2161 tcp_accept, /* accept */
2162 tcp_ioctl, /* ioctl */
2163 tcp_v4_init_sock, /* init */
2164 tcp_v4_destroy_sock, /* destroy */
2165 tcp_shutdown, /* shutdown */
2166 tcp_setsockopt, /* setsockopt */
2167 tcp_getsockopt, /* getsockopt */
2168 tcp_sendmsg, /* sendmsg */
2169 tcp_recvmsg, /* recvmsg */
2170 NULL, /* bind */
2171 tcp_v4_do_rcv, /* backlog_rcv */
2172 tcp_v4_hash, /* hash */
2173 tcp_unhash, /* unhash */
2174 tcp_v4_get_port, /* get_port */
2175 "TCP", /* name */
2180 void __init tcp_v4_init(struct net_proto_family *ops)
2182 int err;
2184 tcp_inode.i_mode = S_IFSOCK;
2185 tcp_inode.i_sock = 1;
2186 tcp_inode.i_uid = 0;
2187 tcp_inode.i_gid = 0;
2188 init_waitqueue_head(&tcp_inode.i_wait);
2189 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2191 tcp_socket->inode = &tcp_inode;
2192 tcp_socket->state = SS_UNCONNECTED;
2193 tcp_socket->type=SOCK_RAW;
2195 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2196 panic("Failed to create the TCP control socket.\n");
2197 tcp_socket->sk->allocation=GFP_ATOMIC;
2198 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2200 /* Unhash it so that IP input processing does not even
2201 * see it, we do not wish this socket to see incoming
2202 * packets.
2204 tcp_socket->sk->prot->unhash(tcp_socket->sk);