Linux 2.4.0-test7-pre6
[davej-history.git] / net / ipv4 / tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.212 2000/08/18 17:10:04 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after a year in a coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/cache.h>
55 #include <linux/init.h>
57 #include <net/icmp.h>
58 #include <net/tcp.h>
59 #include <net/ipv6.h>
60 #include <net/inet_common.h>
62 #include <linux/inet.h>
63 #include <linux/stddef.h>
64 #include <linux/ipsec.h>
66 extern int sysctl_ip_dynaddr;
68 /* Check TCP sequence numbers in ICMP packets. */
69 #define ICMP_MIN_LENGTH 8
71 /* Socket used for sending RSTs */
72 static struct inode tcp_inode;
73 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
75 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
76 struct sk_buff *skb);
78 /* This is for sockets with full identity only. Sockets here will always
79 * be without wildcards and will have the following invariant:
80 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
82 * First half of the table is for sockets not in TIME_WAIT, second half
83 * is for TIME_WAIT sockets only.
85 struct tcp_ehash_bucket *tcp_ehash = NULL;
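/* A lookup therefore probes &tcp_ehash[hash] for live sockets first and,
 * failing that, &tcp_ehash[hash + tcp_ehash_size] for a TIME_WAIT bucket
 * that hashes to the same slot; see __tcp_v4_lookup_established() below.
 */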
87 /* Ok, let's try this, I give up, we do need a local binding
88 * TCP hash as well as the others for fast bind/connect.
90 struct tcp_bind_hashbucket *tcp_bhash = NULL;
92 int tcp_bhash_size = 0;
93 int tcp_ehash_size = 0;
95 /* All sockets in TCP_LISTEN state will be in here. This is the only table
96 * where wildcard'd TCP sockets can exist. Hash function here is just local
97 * port number.
99 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE] = { NULL, };
100 char __tcp_clean_cacheline_pad[(SMP_CACHE_BYTES -
101 (((sizeof(void *) * (TCP_LHTABLE_SIZE + 2)) +
102 (sizeof(int) * 2)) % SMP_CACHE_BYTES))] = { 0, };
104 rwlock_t tcp_lhash_lock = RW_LOCK_UNLOCKED;
105 atomic_t tcp_lhash_users = ATOMIC_INIT(0);
106 DECLARE_WAIT_QUEUE_HEAD(tcp_lhash_wait);
108 spinlock_t tcp_portalloc_lock = SPIN_LOCK_UNLOCKED;
111 * This array holds the first and last local port number.
112 * For high-usage systems, use sysctl to change this to
113 * 32768-61000
115 int sysctl_local_port_range[2] = { 1024, 4999 };
116 int tcp_port_rover = (1024 - 1);
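/* For example (assuming the standard procfs name for this sysctl), the
 * range can be widened at run time on a busy box with:
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * tcp_v4_get_port() below walks this interval with tcp_port_rover when the
 * caller asks for any free port, so a wider range simply gives the rover
 * more candidates before the search is declared exhausted.
 */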
118 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
119 __u32 faddr, __u16 fport)
121 int h = ((laddr ^ lport) ^ (faddr ^ fport));
122 h ^= h>>16;
123 h ^= h>>8;
124 return h & (tcp_ehash_size - 1);
127 static __inline__ int tcp_sk_hashfn(struct sock *sk)
129 __u32 laddr = sk->rcv_saddr;
130 __u16 lport = sk->num;
131 __u32 faddr = sk->daddr;
132 __u16 fport = sk->dport;
134 return tcp_hashfn(laddr, lport, faddr, fport);
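/* A small user-space sketch of the same fold (example values only):
 * because tcp_ehash_size is a power of two, the final mask picks the
 * bucket, and the two shifts fold the high bits of the XORed addresses
 * and ports down into the bits that survive that mask.
 *
 *	static int demo_ehashfn(unsigned int laddr, unsigned short lport,
 *				unsigned int faddr, unsigned short fport,
 *				int ehash_size)
 *	{
 *		int h = ((laddr ^ lport) ^ (faddr ^ fport));
 *		h ^= h>>16;
 *		h ^= h>>8;
 *		return h & (ehash_size - 1);
 *	}
 *
 * e.g. demo_ehashfn(0x0a000001, 80, 0x0a000002, 1025, 512) picks one of
 * 512 buckets for 10.0.0.1:80 <-> 10.0.0.2:1025.
 */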
137 /* Allocate and initialize a new TCP local port bind bucket.
138 * The bindhash mutex for snum's hash chain must be held here.
140 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
141 unsigned short snum)
143 struct tcp_bind_bucket *tb;
145 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
146 if(tb != NULL) {
147 tb->port = snum;
148 tb->fastreuse = 0;
149 tb->owners = NULL;
150 if((tb->next = head->chain) != NULL)
151 tb->next->pprev = &tb->next;
152 head->chain = tb;
153 tb->pprev = &head->chain;
155 return tb;
158 /* Caller must disable local BH processing. */
159 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
161 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
162 struct tcp_bind_bucket *tb;
164 spin_lock(&head->lock);
165 tb = (struct tcp_bind_bucket *)sk->prev;
166 if ((child->bind_next = tb->owners) != NULL)
167 tb->owners->bind_pprev = &child->bind_next;
168 tb->owners = child;
169 child->bind_pprev = &tb->owners;
170 child->prev = (struct sock *) tb;
171 spin_unlock(&head->lock);
174 __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
176 local_bh_disable();
177 __tcp_inherit_port(sk, child);
178 local_bh_enable();
181 /* Obtain a reference to a local port for the given sock;
182 * if snum is zero it means select any available local port.
184 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
186 struct tcp_bind_hashbucket *head;
187 struct tcp_bind_bucket *tb;
188 int ret;
190 local_bh_disable();
191 if (snum == 0) {
192 int low = sysctl_local_port_range[0];
193 int high = sysctl_local_port_range[1];
194 int remaining = (high - low) + 1;
195 int rover;
197 spin_lock(&tcp_portalloc_lock);
198 rover = tcp_port_rover;
199 do { rover++;
200 if ((rover < low) || (rover > high))
201 rover = low;
202 head = &tcp_bhash[tcp_bhashfn(rover)];
203 spin_lock(&head->lock);
204 for (tb = head->chain; tb; tb = tb->next)
205 if (tb->port == rover)
206 goto next;
207 break;
208 next:
209 spin_unlock(&head->lock);
210 } while (--remaining > 0);
211 tcp_port_rover = rover;
212 spin_unlock(&tcp_portalloc_lock);
214 /* Exhausted local port range during search? */
215 ret = 1;
216 if (remaining <= 0)
217 goto fail;
219 /* OK, here is the one we will use. HEAD is
220 * non-NULL and we hold its mutex.
222 snum = rover;
223 tb = NULL;
224 } else {
225 head = &tcp_bhash[tcp_bhashfn(snum)];
226 spin_lock(&head->lock);
227 for (tb = head->chain; tb != NULL; tb = tb->next)
228 if (tb->port == snum)
229 break;
231 if (tb != NULL && tb->owners != NULL) {
232 if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
233 goto success;
234 } else {
235 struct sock *sk2 = tb->owners;
236 int sk_reuse = sk->reuse;
238 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
239 if (sk != sk2 &&
240 sk->bound_dev_if == sk2->bound_dev_if) {
241 if (!sk_reuse ||
242 !sk2->reuse ||
243 sk2->state == TCP_LISTEN) {
244 if (!sk2->rcv_saddr ||
245 !sk->rcv_saddr ||
246 (sk2->rcv_saddr == sk->rcv_saddr))
247 break;
251 /* If we found a conflict, fail. */
252 ret = 1;
253 if (sk2 != NULL)
254 goto fail_unlock;
257 ret = 1;
258 if (tb == NULL &&
259 (tb = tcp_bucket_create(head, snum)) == NULL)
260 goto fail_unlock;
261 if (tb->owners == NULL) {
262 if (sk->reuse && sk->state != TCP_LISTEN)
263 tb->fastreuse = 1;
264 else
265 tb->fastreuse = 0;
266 } else if (tb->fastreuse &&
267 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
268 tb->fastreuse = 0;
269 success:
270 sk->num = snum;
271 if (sk->prev == NULL) {
272 if ((sk->bind_next = tb->owners) != NULL)
273 tb->owners->bind_pprev = &sk->bind_next;
274 tb->owners = sk;
275 sk->bind_pprev = &tb->owners;
276 sk->prev = (struct sock *) tb;
277 } else {
278 BUG_TRAP(sk->prev == (struct sock *) tb);
280 ret = 0;
282 fail_unlock:
283 spin_unlock(&head->lock);
284 fail:
285 local_bh_enable();
286 return ret;
289 /* Get rid of any references to a local port held by the
290 * given sock.
292 __inline__ void __tcp_put_port(struct sock *sk)
294 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
295 struct tcp_bind_bucket *tb;
297 spin_lock(&head->lock);
298 tb = (struct tcp_bind_bucket *) sk->prev;
299 if (sk->bind_next)
300 sk->bind_next->bind_pprev = sk->bind_pprev;
301 *(sk->bind_pprev) = sk->bind_next;
302 sk->prev = NULL;
303 if (tb->owners == NULL) {
304 if (tb->next)
305 tb->next->pprev = tb->pprev;
306 *(tb->pprev) = tb->next;
307 kmem_cache_free(tcp_bucket_cachep, tb);
309 spin_unlock(&head->lock);
312 void tcp_put_port(struct sock *sk)
314 local_bh_disable();
315 __tcp_put_port(sk);
316 local_bh_enable();
319 /* This lock without TASK_EXCLUSIVE is good on UP and it can be very bad on SMP.
320 * Look, when several writers sleep and a reader wakes them up, all but one
321 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
322 * this, _but_ remember, it adds useless work on UP machines (wake up on each
323 * exclusive lock release). It should really be ifdefed.
326 void tcp_listen_wlock(void)
328 write_lock(&tcp_lhash_lock);
330 if (atomic_read(&tcp_lhash_users)) {
331 DECLARE_WAITQUEUE(wait, current);
333 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
334 for (;;) {
335 set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE);
336 if (atomic_read(&tcp_lhash_users) == 0)
337 break;
338 write_unlock_bh(&tcp_lhash_lock);
339 schedule();
340 write_lock_bh(&tcp_lhash_lock);
343 __set_current_state(TASK_RUNNING);
344 remove_wait_queue(&tcp_lhash_wait, &wait);
348 static __inline__ void __tcp_v4_hash(struct sock *sk)
350 struct sock **skp;
351 rwlock_t *lock;
353 BUG_TRAP(sk->pprev==NULL);
354 if(sk->state == TCP_LISTEN) {
355 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
356 lock = &tcp_lhash_lock;
357 tcp_listen_wlock();
358 } else {
359 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
360 lock = &tcp_ehash[sk->hashent].lock;
361 write_lock(lock);
363 if((sk->next = *skp) != NULL)
364 (*skp)->pprev = &sk->next;
365 *skp = sk;
366 sk->pprev = skp;
367 sock_prot_inc_use(sk->prot);
368 write_unlock(lock);
369 if (sk->state == TCP_LISTEN)
370 wake_up(&tcp_lhash_wait);
373 static void tcp_v4_hash(struct sock *sk)
375 if (sk->state != TCP_CLOSE) {
376 local_bh_disable();
377 __tcp_v4_hash(sk);
378 local_bh_enable();
382 void tcp_unhash(struct sock *sk)
384 rwlock_t *lock;
386 if (sk->state == TCP_LISTEN) {
387 local_bh_disable();
388 tcp_listen_wlock();
389 lock = &tcp_lhash_lock;
390 } else {
391 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
392 lock = &head->lock;
393 write_lock_bh(&head->lock);
396 if(sk->pprev) {
397 if(sk->next)
398 sk->next->pprev = sk->pprev;
399 *sk->pprev = sk->next;
400 sk->pprev = NULL;
401 sock_prot_dec_use(sk->prot);
403 write_unlock_bh(lock);
404 if (sk->state == TCP_LISTEN)
405 wake_up(&tcp_lhash_wait);
408 /* Don't inline this cruft. There are some nice properties to
409 * exploit here. The BSD API does not allow a listening TCP
410 * to specify the remote port or the remote address for the
411 * connection. So always assume those are both wildcarded
412 * during the search since they can never be otherwise.
414 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
416 struct sock *result = NULL;
417 int score, hiscore;
419 hiscore=0;
420 for(; sk; sk = sk->next) {
421 if(sk->num == hnum) {
422 __u32 rcv_saddr = sk->rcv_saddr;
424 score = 1;
425 if(rcv_saddr) {
426 if (rcv_saddr != daddr)
427 continue;
428 score++;
430 if (sk->bound_dev_if) {
431 if (sk->bound_dev_if != dif)
432 continue;
433 score++;
435 if (score == 3)
436 return sk;
437 if (score > hiscore) {
438 hiscore = score;
439 result = sk;
443 return result;
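/* Worked example of the scoring above (addresses and ifindex are
 * illustrative values only): for a segment to 10.0.0.1 port 80 arriving
 * on ifindex 2, a listener bound to 10.0.0.1 and to that device scores 3
 * and is returned immediately; a wildcard listener bound only to the
 * port scores 1 and is kept merely as the fallback result.
 */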
446 /* Optimize the common listener case. */
447 __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
449 struct sock *sk;
451 read_lock(&tcp_lhash_lock);
452 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
453 if (sk) {
454 if (sk->num == hnum &&
455 sk->next == NULL &&
456 (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
457 !sk->bound_dev_if)
458 goto sherry_cache;
459 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
461 if (sk) {
462 sherry_cache:
463 sock_hold(sk);
465 read_unlock(&tcp_lhash_lock);
466 return sk;
469 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
470 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
472 * Local BH must be disabled here.
475 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
476 u32 daddr, u16 hnum, int dif)
478 struct tcp_ehash_bucket *head;
479 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
480 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
481 struct sock *sk;
482 int hash;
484 /* Optimize here for direct hit, only listening connections can
485 * have wildcards anyway.
487 hash = tcp_hashfn(daddr, hnum, saddr, sport);
488 head = &tcp_ehash[hash];
489 read_lock(&head->lock);
490 for(sk = head->chain; sk; sk = sk->next) {
491 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
492 goto hit; /* You sunk my battleship! */
495 /* Must check for a TIME_WAIT'er before going to listener hash. */
496 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
497 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
498 goto hit;
499 read_unlock(&head->lock);
501 return NULL;
503 hit:
504 sock_hold(sk);
505 read_unlock(&head->lock);
506 return sk;
509 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
510 u32 daddr, u16 hnum, int dif)
512 struct sock *sk;
514 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
516 if (sk)
517 return sk;
519 return tcp_v4_lookup_listener(daddr, hnum, dif);
522 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
524 struct sock *sk;
526 local_bh_disable();
527 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
528 local_bh_enable();
530 return sk;
533 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
535 return secure_tcp_sequence_number(skb->nh.iph->daddr,
536 skb->nh.iph->saddr,
537 skb->h.th->dest,
538 skb->h.th->source);
541 static int tcp_v4_check_established(struct sock *sk)
543 u32 daddr = sk->rcv_saddr;
544 u32 saddr = sk->daddr;
545 int dif = sk->bound_dev_if;
546 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
547 __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
548 int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
549 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
550 struct sock *sk2, **skp;
551 struct tcp_tw_bucket *tw;
553 write_lock_bh(&head->lock);
555 /* Check TIME-WAIT sockets first. */
556 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
557 skp = &sk2->next) {
558 tw = (struct tcp_tw_bucket*)sk2;
560 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
561 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
563 /* With PAWS, it is safe from the viewpoint
564 of data integrity. Even without PAWS it
565 is safe provided sequence spaces do not
566 overlap, i.e. at data rates <= 80Mbit/sec.
568 Actually, the idea is close to VJ's, except
569 that the timestamp cache is held not per host
570 but per port pair, and the TW bucket is used
571 as the state holder.
573 If the TW bucket has already been destroyed we
574 fall back to VJ's scheme and use the initial
575 timestamp retrieved from the peer table.
577 if (tw->ts_recent_stamp) {
578 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
579 tp->write_seq = 1;
580 tp->ts_recent = tw->ts_recent;
581 tp->ts_recent_stamp = tw->ts_recent_stamp;
582 sock_hold(sk2);
583 skp = &head->chain;
584 goto unique;
585 } else
586 goto not_unique;
589 tw = NULL;
591 /* And established part... */
592 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
593 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
594 goto not_unique;
597 unique:
598 BUG_TRAP(sk->pprev==NULL);
599 if ((sk->next = *skp) != NULL)
600 (*skp)->pprev = &sk->next;
602 *skp = sk;
603 sk->pprev = skp;
604 sk->hashent = hash;
605 sock_prot_inc_use(sk->prot);
606 write_unlock_bh(&head->lock);
608 if (tw) {
609 /* Silly. Should hash-dance instead... */
610 local_bh_disable();
611 tcp_tw_deschedule(tw);
612 tcp_timewait_kill(tw);
613 NET_INC_STATS_BH(TimeWaitRecycled);
614 local_bh_enable();
616 tcp_tw_put(tw);
619 return 0;
621 not_unique:
622 write_unlock_bh(&head->lock);
623 return -EADDRNOTAVAIL;
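/* On the reuse path above, write_seq starts at tw->snd_nxt + 65535 + 2,
 * i.e. comfortably past anything the old incarnation could have sent into
 * an unscaled 64K window, so new segments cannot be mistaken for old
 * duplicates; the copied ts_recent covers the rest via PAWS.
 */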
626 /* Hash SYN-SENT socket to established hash table after
627 * checking that it is unique. Note that without the kernel lock
628 * we MUST make these two operations atomic.
630 * Optimization: if it is bound and the tcp_bind_bucket has only one
631 * owner (us), we need not scan the established bucket.
634 int tcp_v4_hash_connecting(struct sock *sk)
636 unsigned short snum = sk->num;
637 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
638 struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
640 spin_lock_bh(&head->lock);
641 if (tb->owners == sk && sk->bind_next == NULL) {
642 __tcp_v4_hash(sk);
643 spin_unlock_bh(&head->lock);
644 return 0;
645 } else {
646 spin_unlock_bh(&head->lock);
648 /* No definite answer... Walk to established hash table */
649 return tcp_v4_check_established(sk);
653 /* This will initiate an outgoing connection. */
654 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
656 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
657 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
658 struct sk_buff *buff;
659 struct rtable *rt;
660 u32 daddr, nexthop;
661 int tmp;
662 int err;
664 if (addr_len < sizeof(struct sockaddr_in))
665 return(-EINVAL);
667 if (usin->sin_family != AF_INET)
668 return(-EAFNOSUPPORT);
670 nexthop = daddr = usin->sin_addr.s_addr;
671 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
672 if (daddr == 0)
673 return -EINVAL;
674 nexthop = sk->protinfo.af_inet.opt->faddr;
677 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
678 RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
679 if (tmp < 0)
680 return tmp;
682 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
683 ip_rt_put(rt);
684 return -ENETUNREACH;
687 __sk_dst_set(sk, &rt->u.dst);
689 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
690 daddr = rt->rt_dst;
692 err = -ENOBUFS;
693 buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);
695 if (buff == NULL)
696 goto failure;
698 if (!sk->saddr)
699 sk->saddr = rt->rt_src;
700 sk->rcv_saddr = sk->saddr;
702 if (tp->ts_recent_stamp && sk->daddr != daddr) {
703 /* Reset inherited state */
704 tp->ts_recent = 0;
705 tp->ts_recent_stamp = 0;
706 tp->write_seq = 0;
709 if (sysctl_tcp_tw_recycle &&
710 !tp->ts_recent_stamp &&
711 rt->rt_dst == daddr) {
712 struct inet_peer *peer = rt_get_peer(rt);
714 /* VJ's idea. We save the last timestamp seen from
715 * the destination in the peer table when entering TIME-WAIT state,
716 * and initialize ts_recent from it when trying a new connection.
719 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
720 tp->ts_recent_stamp = peer->tcp_ts_stamp;
721 tp->ts_recent = peer->tcp_ts;
725 sk->dport = usin->sin_port;
726 sk->daddr = daddr;
728 if (!tp->write_seq)
729 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
730 sk->sport, usin->sin_port);
732 tp->ext_header_len = 0;
733 if (sk->protinfo.af_inet.opt)
734 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
736 tp->mss_clamp = 536;
738 err = tcp_connect(sk, buff);
739 if (err == 0)
740 return 0;
742 failure:
743 __sk_dst_reset(sk);
744 sk->dport = 0;
745 return err;
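/* What reaches this function from user space is an ordinary connect() on
 * a SOCK_STREAM socket; a minimal caller sketch (example address and port,
 * error handling trimmed, needs <sys/socket.h>, <netinet/in.h>,
 * <arpa/inet.h> and <string.h>):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin;
 *
 *	memset(&sin, 0, sizeof(sin));
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(80);
 *	sin.sin_addr.s_addr = inet_addr("10.0.0.1");
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * The addr_len and sin_family checks at the top of this function are the
 * kernel-side counterpart of that call.
 */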
748 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
750 return ((struct rtable*)skb->dst)->rt_iif;
753 static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
755 unsigned h = raddr ^ rport;
756 h ^= h>>16;
757 h ^= h>>8;
758 return h&(TCP_SYNQ_HSIZE-1);
761 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
762 struct iphdr *iph,
763 struct tcphdr *th,
764 struct open_request ***prevp)
766 struct tcp_listen_opt *lopt = tp->listen_opt;
767 struct open_request *req, **prev;
768 __u16 rport = th->source;
769 __u32 raddr = iph->saddr;
771 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
772 (req = *prev) != NULL;
773 prev = &req->dl_next) {
774 if (req->rmt_port == rport &&
775 req->af.v4_req.rmt_addr == raddr &&
776 req->af.v4_req.loc_addr == iph->daddr &&
777 TCP_INET_FAMILY(req->class->family)) {
778 BUG_TRAP(req->sk == NULL);
779 *prevp = prev;
780 return req;
784 return NULL;
787 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
789 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
790 struct tcp_listen_opt *lopt = tp->listen_opt;
791 unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
793 req->expires = jiffies + TCP_TIMEOUT_INIT;
794 req->retrans = 0;
795 req->sk = NULL;
796 req->index = h;
797 req->dl_next = lopt->syn_table[h];
799 write_lock(&tp->syn_wait_lock);
800 lopt->syn_table[h] = req;
801 write_unlock(&tp->syn_wait_lock);
803 tcp_synq_added(sk);
808 * This routine does path mtu discovery as defined in RFC1191.
810 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
812 struct dst_entry *dst;
813 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
815 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
816 * sent out by Linux are always < 576 bytes, so they should go through
817 * unfragmented).
819 if (sk->state == TCP_LISTEN)
820 return;
822 /* We don't check in the dst entry whether pmtu discovery is forbidden
823 * on this route. We just assume that no packet-too-big packets
824 * are sent back when pmtu discovery is not active.
825 * There is a small race when the user changes this flag in the
826 * route, but I think that's acceptable.
828 if ((dst = __sk_dst_check(sk, 0)) == NULL)
829 return;
831 ip_rt_update_pmtu(dst, mtu);
833 /* Something is about to go wrong... Remember the soft error
834 * in case this connection is not able to recover.
836 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
837 sk->err_soft = EMSGSIZE;
839 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
840 tp->pmtu_cookie > dst->pmtu) {
841 tcp_sync_mss(sk, dst->pmtu);
843 /* Resend the TCP packet because it's
844 * clear that the old packet has been
845 * dropped. This is the new "fast" path mtu
846 * discovery.
848 tcp_simple_retransmit(sk);
849 } /* else let the usual retransmit timer handle it */
853 * This routine is called by the ICMP module when it gets some
854 * sort of error condition. If err < 0 then the socket should
855 * be closed and the error returned to the user. If err > 0
856 * it's just the icmp type << 8 | icmp code. After adjustment
857 * header points to the first 8 bytes of the tcp header. We need
858 * to find the appropriate port.
860 * The locking strategy used here is very "optimistic". When
861 * someone else accesses the socket the ICMP is just dropped
862 * and for some paths there is no check at all.
863 * A more general error queue to queue errors for later handling
864 * is probably better.
868 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
870 struct iphdr *iph = (struct iphdr*)dp;
871 struct tcphdr *th;
872 struct tcp_opt *tp;
873 int type = skb->h.icmph->type;
874 int code = skb->h.icmph->code;
875 #if ICMP_MIN_LENGTH < 14
876 int no_flags = 0;
877 #else
878 #define no_flags 0
879 #endif
880 struct sock *sk;
881 __u32 seq;
882 int err;
884 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
885 ICMP_INC_STATS_BH(IcmpInErrors);
886 return;
888 #if ICMP_MIN_LENGTH < 14
889 if (len < (iph->ihl << 2) + 14)
890 no_flags = 1;
891 #endif
893 th = (struct tcphdr*)(dp+(iph->ihl<<2));
895 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
896 if (sk == NULL) {
897 ICMP_INC_STATS_BH(IcmpInErrors);
898 return;
900 if (sk->state == TCP_TIME_WAIT) {
901 tcp_tw_put((struct tcp_tw_bucket*)sk);
902 return;
905 bh_lock_sock(sk);
906 /* If too many ICMPs get dropped on busy
907 * servers this needs to be solved differently.
909 if (sk->lock.users != 0)
910 NET_INC_STATS_BH(LockDroppedIcmps);
912 if (sk->state == TCP_CLOSE)
913 goto out;
915 tp = &sk->tp_pinfo.af_tcp;
916 seq = ntohl(th->seq);
917 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
918 NET_INC_STATS(OutOfWindowIcmps);
919 goto out;
922 switch (type) {
923 case ICMP_SOURCE_QUENCH:
924 /* This is deprecated, but if someone generated it,
925 * we have no reason to ignore it.
927 if (sk->lock.users == 0)
928 tcp_enter_cwr(tp);
929 goto out;
930 case ICMP_PARAMETERPROB:
931 err = EPROTO;
932 break;
933 case ICMP_DEST_UNREACH:
934 if (code > NR_ICMP_UNREACH)
935 goto out;
937 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
938 if (sk->lock.users == 0)
939 do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
940 goto out;
943 err = icmp_err_convert[code].errno;
944 break;
945 case ICMP_TIME_EXCEEDED:
946 err = EHOSTUNREACH;
947 break;
948 default:
949 goto out;
952 switch (sk->state) {
953 struct open_request *req, **prev;
954 case TCP_LISTEN:
955 if (sk->lock.users != 0)
956 goto out;
958 /* The final ACK of the handshake should already be
959 * handled in the new socket context, not here.
960 * Strictly speaking - an ICMP error for the final
961 * ACK should set the opening flag, but that is too
962 * complicated right now.
964 if (!no_flags && !th->syn && !th->ack)
965 goto out;
967 req = tcp_v4_search_req(tp, iph, th, &prev);
968 if (!req)
969 goto out;
971 /* ICMPs are not backlogged, hence we cannot get
972 an established socket here.
974 BUG_TRAP(req->sk == NULL);
976 if (seq != req->snt_isn) {
977 NET_INC_STATS_BH(OutOfWindowIcmps);
978 goto out;
982 * Still in SYN_RECV, just remove it silently.
983 * There is no good way to pass the error to the newly
984 * created socket, and POSIX does not want network
985 * errors returned from accept().
987 tcp_synq_drop(sk, req, prev);
988 goto out;
990 case TCP_SYN_SENT:
991 case TCP_SYN_RECV: /* Cannot happen.
992 It can happen, e.g., if SYNs crossed.
994 if (!no_flags && !th->syn)
995 goto out;
996 if (sk->lock.users == 0) {
997 TCP_INC_STATS_BH(TcpAttemptFails);
998 sk->err = err;
1000 sk->error_report(sk);
1002 tcp_done(sk);
1003 } else {
1004 sk->err_soft = err;
1006 goto out;
1009 /* If we've already connected we will keep trying
1010 * until we time out, or the user gives up.
1012 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
1013 * to be considered hard errors (well, FRAG_FAILED too,
1014 * but it is obsoleted by pmtu discovery).
1016 * Note that in the modern internet, where routing is unreliable
1017 * and broken firewalls sit in every dark corner sending random
1018 * errors as ordered by their masters, even these two messages have finally
1019 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
1021 * Now we are in compliance with RFCs.
1022 * --ANK (980905)
1025 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1026 sk->err = err;
1027 sk->error_report(sk);
1028 } else { /* Only an error on timeout */
1029 sk->err_soft = err;
1032 out:
1033 bh_unlock_sock(sk);
1034 sock_put(sk);
1037 /* This routine computes an IPv4 TCP checksum. */
1038 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1039 struct sk_buff *skb)
1041 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1042 csum_partial((char *)th, th->doff<<2, skb->csum));
1046 * This routine will send an RST to the other tcp.
1048 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1049 * for a reset?
1050 * Answer: if a packet caused the RST, it is not for a socket
1051 * existing in our system; if it is matched to a socket,
1052 * it is just a duplicate segment or a bug in the other side's TCP.
1053 * So we build the reply based only on the parameters
1054 * that arrived with the segment.
1055 * Exception: precedence violation. We do not implement it in any case.
1058 static void tcp_v4_send_reset(struct sk_buff *skb)
1060 struct tcphdr *th = skb->h.th;
1061 struct tcphdr rth;
1062 struct ip_reply_arg arg;
1064 /* Never send a reset in response to a reset. */
1065 if (th->rst)
1066 return;
1068 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1069 return;
1071 /* Swap the send and the receive. */
1072 memset(&rth, 0, sizeof(struct tcphdr));
1073 rth.dest = th->source;
1074 rth.source = th->dest;
1075 rth.doff = sizeof(struct tcphdr)/4;
1076 rth.rst = 1;
1078 if (th->ack) {
1079 rth.seq = th->ack_seq;
1080 } else {
1081 rth.ack = 1;
1082 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1083 + skb->len - (th->doff<<2));
1086 memset(&arg, 0, sizeof arg);
1087 arg.iov[0].iov_base = (unsigned char *)&rth;
1088 arg.iov[0].iov_len = sizeof rth;
1089 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1090 skb->nh.iph->saddr, /*XXX*/
1091 sizeof(struct tcphdr),
1092 IPPROTO_TCP,
1093 0);
1094 arg.n_iov = 1;
1095 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1097 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1099 TCP_INC_STATS_BH(TcpOutSegs);
1100 TCP_INC_STATS_BH(TcpOutRsts);
1103 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1104 outside socket context, is certainly ugly. What can I do?
1107 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1109 struct tcphdr *th = skb->h.th;
1110 struct {
1111 struct tcphdr th;
1112 u32 tsopt[3];
1113 } rep;
1114 struct ip_reply_arg arg;
1116 memset(&rep.th, 0, sizeof(struct tcphdr));
1117 memset(&arg, 0, sizeof arg);
1119 arg.iov[0].iov_base = (unsigned char *)&rep;
1120 arg.iov[0].iov_len = sizeof(rep.th);
1121 arg.n_iov = 1;
1122 if (ts) {
1123 rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
1124 (TCPOPT_NOP << 16) |
1125 (TCPOPT_TIMESTAMP << 8) |
1126 TCPOLEN_TIMESTAMP);
1127 rep.tsopt[1] = htonl(tcp_time_stamp);
1128 rep.tsopt[2] = htonl(ts);
1129 arg.iov[0].iov_len = sizeof(rep);
1132 /* Swap the send and the receive. */
1133 rep.th.dest = th->source;
1134 rep.th.source = th->dest;
1135 rep.th.doff = arg.iov[0].iov_len/4;
1136 rep.th.seq = htonl(seq);
1137 rep.th.ack_seq = htonl(ack);
1138 rep.th.ack = 1;
1139 rep.th.window = htons(win);
1141 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1142 skb->nh.iph->saddr, /*XXX*/
1143 arg.iov[0].iov_len,
1144 IPPROTO_TCP,
1146 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1148 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1150 TCP_INC_STATS_BH(TcpOutSegs);
1153 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1155 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1157 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1158 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1160 tcp_tw_put(tw);
1163 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1165 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1166 req->ts_recent);
1169 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1171 struct rtable *rt;
1172 struct ip_options *opt;
1174 opt = req->af.v4_req.opt;
1175 if(ip_route_output(&rt, ((opt && opt->srr) ?
1176 opt->faddr :
1177 req->af.v4_req.rmt_addr),
1178 req->af.v4_req.loc_addr,
1179 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1180 sk->bound_dev_if)) {
1181 IP_INC_STATS_BH(IpOutNoRoutes);
1182 return NULL;
1184 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1185 ip_rt_put(rt);
1186 IP_INC_STATS_BH(IpOutNoRoutes);
1187 return NULL;
1189 return &rt->u.dst;
1193 * Send a SYN-ACK after having received a SYN.
1194 * This still operates on an open_request only, not on a big
1195 * socket.
1197 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1198 struct dst_entry *dst)
1200 int err = -1;
1201 struct sk_buff * skb;
1203 /* First, grab a route. */
1204 if (dst == NULL &&
1205 (dst = tcp_v4_route_req(sk, req)) == NULL)
1206 goto out;
1208 skb = tcp_make_synack(sk, dst, req);
1210 if (skb) {
1211 struct tcphdr *th = skb->h.th;
1213 th->check = tcp_v4_check(th, skb->len,
1214 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1215 csum_partial((char *)th, skb->len, skb->csum));
1217 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1218 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1219 if (err == NET_XMIT_CN)
1220 err = 0;
1223 out:
1224 dst_release(dst);
1225 return err;
1229 * IPv4 open_request destructor.
1231 static void tcp_v4_or_free(struct open_request *req)
1233 if (req->af.v4_req.opt)
1234 kfree(req->af.v4_req.opt);
1237 static inline void syn_flood_warning(struct sk_buff *skb)
1239 static unsigned long warntime;
1241 if (jiffies - warntime > HZ*60) {
1242 warntime = jiffies;
1243 printk(KERN_INFO
1244 "possible SYN flooding on port %d. Sending cookies.\n",
1245 ntohs(skb->h.th->dest));
1250 * Save and compile IPv4 options into the open_request if needed.
1252 static inline struct ip_options *
1253 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1255 struct ip_options *opt = &(IPCB(skb)->opt);
1256 struct ip_options *dopt = NULL;
1258 if (opt && opt->optlen) {
1259 int opt_size = optlength(opt);
1260 dopt = kmalloc(opt_size, GFP_ATOMIC);
1261 if (dopt) {
1262 if (ip_options_echo(dopt, skb)) {
1263 kfree(dopt);
1264 dopt = NULL;
1268 return dopt;
1272 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1273 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1274 * It would be better to replace it with a global counter for all sockets
1275 * but then some measure against one socket starving all other sockets
1276 * would be needed.
1278 * It was 128 by default. Experiments with real servers show that
1279 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1280 * of the problems. This value is adjusted to 128 for very small machines
1281 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1282 * Increasing it further requires changing the hash table size.
1284 int sysctl_max_syn_backlog = 256;
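/* Assuming the usual procfs name for this sysctl, the limit can be raised
 * on busy servers with, e.g.:
 *
 *	echo 1024 > /proc/sys/net/ipv4/tcp_max_syn_backlog
 *
 * (going far beyond 1024 also calls for a larger SYN queue hash, as the
 * comment above notes).
 */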
1286 struct or_calltable or_ipv4 = {
1287 PF_INET,
1288 tcp_v4_send_synack,
1289 tcp_v4_or_send_ack,
1290 tcp_v4_or_free,
1291 tcp_v4_send_reset
1294 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1296 struct tcp_opt tp;
1297 struct open_request *req;
1298 __u32 saddr = skb->nh.iph->saddr;
1299 __u32 daddr = skb->nh.iph->daddr;
1300 __u32 isn = TCP_SKB_CB(skb)->when;
1301 struct dst_entry *dst = NULL;
1302 #ifdef CONFIG_SYN_COOKIES
1303 int want_cookie = 0;
1304 #else
1305 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1306 #endif
1308 /* Never answer SYNs sent to broadcast or multicast */
1309 if (((struct rtable *)skb->dst)->rt_flags &
1310 (RTCF_BROADCAST|RTCF_MULTICAST))
1311 goto drop;
1313 /* TW buckets are converted to open requests without
1314 * limitation; they conserve resources and the peer is
1315 * evidently a real one.
1317 if (tcp_synq_is_full(sk) && !isn) {
1318 #ifdef CONFIG_SYN_COOKIES
1319 if (sysctl_tcp_syncookies) {
1320 want_cookie = 1;
1321 } else
1322 #endif
1323 goto drop;
1326 /* The accept backlog is full. If we have already queued enough
1327 * warm entries in the syn queue, drop the request. It is better than
1328 * clogging the syn queue with openreqs with exponentially increasing
1329 * timeout.
1331 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1332 goto drop;
1334 req = tcp_openreq_alloc();
1335 if (req == NULL)
1336 goto drop;
1338 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1339 tp.mss_clamp = 536;
1340 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1342 tcp_parse_options(skb, &tp);
1344 if (want_cookie) {
1345 tp.sack_ok = 0;
1346 tp.wscale_ok = 0;
1347 tp.snd_wscale = 0;
1348 tp.tstamp_ok = 0;
1349 tp.saw_tstamp = 0;
1352 if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1353 /* Some OSes (unknown ones, but I see them on a web server which
1354 * contains information interesting only to Windows
1355 * users) do not send their stamp in the SYN. It is the easy case.
1356 * We simply do not advertise TS support.
1358 tp.saw_tstamp = 0;
1359 tp.tstamp_ok = 0;
1362 tcp_openreq_init(req, &tp, skb);
1364 req->af.v4_req.loc_addr = daddr;
1365 req->af.v4_req.rmt_addr = saddr;
1366 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1367 req->class = &or_ipv4;
1368 if (!want_cookie)
1369 TCP_ECN_create_request(req, skb->h.th);
1371 if (want_cookie) {
1372 #ifdef CONFIG_SYN_COOKIES
1373 syn_flood_warning(skb);
1374 #endif
1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1376 } else if (isn == 0) {
1377 struct inet_peer *peer = NULL;
1379 /* VJ's idea. We save the last timestamp seen
1380 * from the destination in the peer table when entering
1381 * TIME-WAIT state, and check against it before
1382 * accepting a new connection request.
1384 * If "isn" is not zero, this request hit an alive
1385 * timewait bucket, so all the necessary checks
1386 * are made in the function processing the timewait state.
1388 if (tp.saw_tstamp &&
1389 sysctl_tcp_tw_recycle &&
1390 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1391 (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1392 peer->v4daddr == saddr) {
1393 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1394 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1395 NET_INC_STATS_BH(PAWSPassiveRejected);
1396 dst_release(dst);
1397 goto drop_and_free;
1400 /* Kill the following clause, if you dislike this way. */
1401 else if (!sysctl_tcp_syncookies &&
1402 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1403 < (sysctl_max_syn_backlog>>2)) &&
1404 (!peer || !peer->tcp_ts_stamp) &&
1405 (!dst || !dst->rtt)) {
1406 /* Without syncookies the last quarter of the
1407 * backlog is filled with destinations proven to be alive.
1408 * It means that we continue to communicate
1409 * with destinations already remembered
1410 * at the moment of the synflood.
1412 NETDEBUG(if (net_ratelimit()) \
1413 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1414 NIPQUAD(saddr), ntohs(skb->h.th->source)));
1415 TCP_INC_STATS_BH(TcpAttemptFails);
1416 dst_release(dst);
1417 goto drop_and_free;
1420 isn = tcp_v4_init_sequence(sk, skb);
1422 req->snt_isn = isn;
1424 if (tcp_v4_send_synack(sk, req, dst))
1425 goto drop_and_free;
1427 if (want_cookie) {
1428 tcp_openreq_free(req);
1429 } else {
1430 tcp_v4_synq_add(sk, req);
1432 return 0;
1434 drop_and_free:
1435 tcp_openreq_free(req);
1436 drop:
1437 TCP_INC_STATS_BH(TcpAttemptFails);
1438 return 0;
1443 * The three way handshake has completed - we got a valid synack -
1444 * now create the new socket.
1446 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1447 struct open_request *req,
1448 struct dst_entry *dst)
1450 struct tcp_opt *newtp;
1451 struct sock *newsk;
1453 if (tcp_acceptq_is_full(sk))
1454 goto exit_overflow;
1456 if (dst == NULL &&
1457 (dst = tcp_v4_route_req(sk, req)) == NULL)
1458 goto exit;
1460 newsk = tcp_create_openreq_child(sk, req, skb);
1461 if (!newsk)
1462 goto exit;
1464 newsk->dst_cache = dst;
1466 newtp = &(newsk->tp_pinfo.af_tcp);
1467 newsk->daddr = req->af.v4_req.rmt_addr;
1468 newsk->saddr = req->af.v4_req.loc_addr;
1469 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1470 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1471 req->af.v4_req.opt = NULL;
1472 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1473 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1474 newtp->ext_header_len = 0;
1475 if (newsk->protinfo.af_inet.opt)
1476 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1478 tcp_sync_mss(newsk, dst->pmtu);
1479 newtp->advmss = dst->advmss;
1480 tcp_initialize_rcv_mss(newsk);
1482 __tcp_v4_hash(newsk);
1483 __tcp_inherit_port(sk, newsk);
1485 return newsk;
1487 exit_overflow:
1488 NET_INC_STATS_BH(ListenOverflows);
1489 exit:
1490 NET_INC_STATS_BH(ListenDrops);
1491 dst_release(dst);
1492 return NULL;
1495 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1497 struct open_request *req, **prev;
1498 struct tcphdr *th = skb->h.th;
1499 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1500 struct sock *nsk;
1502 /* Find possible connection requests. */
1503 req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);
1504 if (req)
1505 return tcp_check_req(sk, skb, req, prev);
1507 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1508 th->source,
1509 skb->nh.iph->daddr,
1510 ntohs(th->dest),
1511 tcp_v4_iif(skb));
1513 if (nsk) {
1514 if (nsk->state != TCP_TIME_WAIT) {
1515 bh_lock_sock(nsk);
1516 return nsk;
1518 tcp_tw_put((struct tcp_tw_bucket*)nsk);
1519 return NULL;
1522 #ifdef CONFIG_SYN_COOKIES
1523 if (!th->rst && !th->syn && th->ack)
1524 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1525 #endif
1526 return sk;
1529 static int tcp_v4_checksum_init(struct sk_buff *skb)
1531 if (skb->ip_summed == CHECKSUM_HW) {
1532 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1533 skb->nh.iph->daddr,skb->csum)) {
1534 NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1535 return -1;
1537 skb->ip_summed = CHECKSUM_UNNECESSARY;
1538 } else {
1539 if (skb->len <= 76) {
1540 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1541 skb->nh.iph->daddr,
1542 csum_partial((char *)skb->h.th, skb->len, 0)))
1543 return -1;
1544 skb->ip_summed = CHECKSUM_UNNECESSARY;
1545 } else {
1546 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1547 skb->nh.iph->daddr,0);
1550 return 0;
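/* For reference, a portable user-space sketch of the checksum being
 * verified here: the RFC 793 ones'-complement sum over the IPv4
 * pseudo-header plus the TCP segment. This is illustrative only and is
 * not the kernel's csum_partial()/tcp_v4_check() implementation; the
 * addresses are passed as plain 32-bit numbers (e.g. 0x0a000001 for
 * 10.0.0.1).
 *
 *	static unsigned short tcp_csum_sketch(unsigned int saddr,
 *					      unsigned int daddr,
 *					      const unsigned char *seg,
 *					      unsigned int len)
 *	{
 *		unsigned long sum = 0;
 *		unsigned int i;
 *
 *		sum += (saddr >> 16) + (saddr & 0xffff);
 *		sum += (daddr >> 16) + (daddr & 0xffff);
 *		sum += 6 + len;			// IPPROTO_TCP + TCP length
 *		for (i = 0; i + 1 < len; i += 2)
 *			sum += (seg[i] << 8) | seg[i + 1];
 *		if (len & 1)
 *			sum += seg[len - 1] << 8;
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (unsigned short)~sum;
 *	}
 *
 * A correct segment sums to zero when its own check field is included,
 * which is exactly the test tcp_v4_check() makes above.
 */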
1554 /* The socket must have its spinlock held when we get
1555 * here.
1557 * We have a potential double-lock case here, so even when
1558 * doing backlog processing we use the BH locking scheme.
1559 * This is because we cannot sleep with the original spinlock
1560 * held.
1562 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1564 #ifdef CONFIG_FILTER
1565 struct sk_filter *filter = sk->filter;
1566 if (filter && sk_filter(skb, filter))
1567 goto discard;
1568 #endif /* CONFIG_FILTER */
1570 IP_INC_STATS_BH(IpInDelivers);
1572 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1573 TCP_CHECK_TIMER(sk);
1574 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1575 goto reset;
1576 TCP_CHECK_TIMER(sk);
1577 return 0;
1580 if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1581 goto csum_err;
1583 if (sk->state == TCP_LISTEN) {
1584 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1585 if (!nsk)
1586 goto discard;
1588 if (nsk != sk) {
1589 if (tcp_child_process(sk, nsk, skb))
1590 goto reset;
1591 return 0;
1595 TCP_CHECK_TIMER(sk);
1596 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1597 goto reset;
1598 TCP_CHECK_TIMER(sk);
1599 return 0;
1601 reset:
1602 tcp_v4_send_reset(skb);
1603 discard:
1604 kfree_skb(skb);
1605 /* Be careful here. If this function gets more complicated and
1606 * gcc suffers from register pressure on the x86, sk (in %ebx)
1607 * might be destroyed here. This current version compiles correctly,
1608 * but you have been warned.
1610 return 0;
1612 csum_err:
1613 TCP_INC_STATS_BH(TcpInErrs);
1614 goto discard;
1618 * From tcp_input.c
1621 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1623 struct tcphdr *th;
1624 struct sock *sk;
1625 int ret;
1627 if (skb->pkt_type!=PACKET_HOST)
1628 goto discard_it;
1630 th = skb->h.th;
1632 /* Pull up the IP header. */
1633 __skb_pull(skb, skb->h.raw - skb->data);
1635 /* Count it even if it's bad */
1636 TCP_INC_STATS_BH(TcpInSegs);
1638 /* An explanation is required here, I think.
1639 * Packet length and doff are validated by header prediction,
1640 * provided the case of th->doff==0 is eliminated.
1641 * So, we defer the checks. */
1642 if (th->doff < sizeof(struct tcphdr)/4 ||
1643 (skb->ip_summed != CHECKSUM_UNNECESSARY &&
1644 tcp_v4_checksum_init(skb) < 0))
1645 goto bad_packet;
1647 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1648 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1649 len - th->doff*4);
1650 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1651 TCP_SKB_CB(skb)->when = 0;
1652 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1653 TCP_SKB_CB(skb)->sacked = 0;
1654 skb->used = 0;
1656 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1657 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1659 if (!sk)
1660 goto no_tcp_socket;
1662 process:
1663 if(!ipsec_sk_policy(sk,skb))
1664 goto discard_and_relse;
1666 if (sk->state == TCP_TIME_WAIT)
1667 goto do_time_wait;
1669 bh_lock_sock(sk);
1670 ret = 0;
1671 if (!sk->lock.users) {
1672 if (!tcp_prequeue(sk, skb))
1673 ret = tcp_v4_do_rcv(sk, skb);
1674 } else
1675 sk_add_backlog(sk, skb);
1676 bh_unlock_sock(sk);
1678 sock_put(sk);
1680 return ret;
1682 no_tcp_socket:
1683 if (len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1684 bad_packet:
1685 TCP_INC_STATS_BH(TcpInErrs);
1686 } else {
1687 tcp_v4_send_reset(skb);
1690 discard_it:
1691 /* Discard frame. */
1692 kfree_skb(skb);
1693 return 0;
1695 discard_and_relse:
1696 sock_put(sk);
1697 goto discard_it;
1699 do_time_wait:
1700 if (len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1701 TCP_INC_STATS_BH(TcpInErrs);
1702 goto discard_and_relse;
1704 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1705 skb, th, skb->len)) {
1706 case TCP_TW_SYN:
1708 struct sock *sk2;
1710 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1711 if (sk2 != NULL) {
1712 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1713 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1714 tcp_tw_put((struct tcp_tw_bucket *)sk);
1715 sk = sk2;
1716 goto process;
1718 /* Fall through to ACK */
1720 case TCP_TW_ACK:
1721 tcp_v4_timewait_ack(sk, skb);
1722 break;
1723 case TCP_TW_RST:
1724 goto no_tcp_socket;
1725 case TCP_TW_SUCCESS:
1727 goto discard_it;
1730 /* With per-bucket locks this operation is not atomic, so
1731 * this version is not worse.
1733 static void __tcp_v4_rehash(struct sock *sk)
1735 sk->prot->unhash(sk);
1736 sk->prot->hash(sk);
1739 int tcp_v4_rebuild_header(struct sock *sk)
1741 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1742 __u32 new_saddr;
1743 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT &&
1744 !(sk->userlocks & SOCK_BINDADDR_LOCK);
1746 if (rt == NULL) {
1747 int err;
1749 u32 daddr = sk->daddr;
1751 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1752 daddr = sk->protinfo.af_inet.opt->faddr;
1754 err = ip_route_output(&rt, daddr, sk->saddr,
1755 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1756 sk->bound_dev_if);
1757 if (err) {
1758 sk->err_soft=-err;
1759 sk->error_report(sk);
1760 return -1;
1762 __sk_dst_set(sk, &rt->u.dst);
1765 /* Force route checking if want_rewrite. */
1766 if (want_rewrite) {
1767 int tmp;
1768 struct rtable *new_rt;
1769 __u32 old_saddr = rt->rt_src;
1771 /* Query new route using another rt buffer */
1772 tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
1773 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1774 sk->bound_dev_if);
1776 /* Only useful if different source addrs */
1777 if (tmp == 0) {
1781 if (new_rt->rt_src != old_saddr ) {
1782 __sk_dst_set(sk, &new_rt->u.dst);
1783 rt = new_rt;
1784 goto do_rewrite;
1786 dst_release(&new_rt->u.dst);
1790 return 0;
1792 do_rewrite:
1793 new_saddr = rt->rt_src;
1795 /* Ouch!, this should not happen. */
1796 if (!sk->saddr || !sk->rcv_saddr) {
1797 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
1798 "saddr=%08X rcv_saddr=%08X\n",
1799 ntohl(sk->saddr),
1800 ntohl(sk->rcv_saddr));
1801 return -1;
1804 if (new_saddr != sk->saddr) {
1805 if (sysctl_ip_dynaddr > 1) {
1806 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1807 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1808 NIPQUAD(sk->saddr),
1809 NIPQUAD(new_saddr));
1812 sk->saddr = new_saddr;
1813 sk->rcv_saddr = new_saddr;
1815 /* XXX The one ugly spot where we need to
1816 * XXX really change the socket's identity after
1817 * XXX it has entered the hashes. -DaveM
1819 * Besides that, it does not check for connection
1820 * uniqueness. Wait for trouble.
1822 __tcp_v4_rehash(sk);
1825 return 0;
1828 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1830 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1832 sin->sin_family = AF_INET;
1833 sin->sin_addr.s_addr = sk->daddr;
1834 sin->sin_port = sk->dport;
1837 /* VJ's idea. Save the last timestamp seen from this destination
1838 * and hold it at least for the normal timewait interval, to use for duplicate
1839 * segment detection in subsequent connections before they enter the synchronized
1840 * state.
1843 int tcp_v4_remember_stamp(struct sock *sk)
1845 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1846 struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1847 struct inet_peer *peer = NULL;
1848 int release_it = 0;
1850 if (rt == NULL || rt->rt_dst != sk->daddr) {
1851 peer = inet_getpeer(sk->daddr, 1);
1852 release_it = 1;
1853 } else {
1854 if (rt->peer == NULL)
1855 rt_bind_peer(rt, 1);
1856 peer = rt->peer;
1859 if (peer) {
1860 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1861 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1862 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1863 peer->tcp_ts_stamp = tp->ts_recent_stamp;
1864 peer->tcp_ts = tp->ts_recent;
1866 if (release_it)
1867 inet_putpeer(peer);
1868 return 1;
1871 return 0;
1874 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1876 struct inet_peer *peer = NULL;
1878 peer = inet_getpeer(tw->daddr, 1);
1880 if (peer) {
1881 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1882 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1883 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1884 peer->tcp_ts_stamp = tw->ts_recent_stamp;
1885 peer->tcp_ts = tw->ts_recent;
1887 inet_putpeer(peer);
1888 return 1;
1891 return 0;
1894 struct tcp_func ipv4_specific = {
1895 ip_queue_xmit,
1896 tcp_v4_send_check,
1897 tcp_v4_rebuild_header,
1898 tcp_v4_conn_request,
1899 tcp_v4_syn_recv_sock,
1900 tcp_v4_hash_connecting,
1901 tcp_v4_remember_stamp,
1902 sizeof(struct iphdr),
1904 ip_setsockopt,
1905 ip_getsockopt,
1906 v4_addr2sockaddr,
1907 sizeof(struct sockaddr_in)
1910 /* NOTE: A lot of things are set to zero explicitly by the call to
1911 * sk_alloc(), so they need not be done here.
1913 static int tcp_v4_init_sock(struct sock *sk)
1915 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1917 skb_queue_head_init(&tp->out_of_order_queue);
1918 tcp_init_xmit_timers(sk);
1919 tcp_prequeue_init(tp);
1921 tp->rto = TCP_TIMEOUT_INIT;
1922 tp->mdev = TCP_TIMEOUT_INIT;
1924 /* So many TCP implementations out there (incorrectly) count the
1925 * initial SYN frame in their delayed-ACK and congestion control
1926 * algorithms that we must have the following bandaid to talk
1927 * efficiently to them. -DaveM
1929 tp->snd_cwnd = 2;
1931 /* See draft-stevens-tcpca-spec-01 for discussion of the
1932 * initialization of these values.
1934 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1935 tp->snd_cwnd_clamp = ~0;
1936 tp->mss_cache = 536;
1938 tp->reordering = sysctl_tcp_reordering;
1940 sk->state = TCP_CLOSE;
1942 sk->write_space = tcp_write_space;
1944 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1946 sk->sndbuf = sysctl_tcp_wmem[1];
1947 sk->rcvbuf = sysctl_tcp_rmem[1];
1949 atomic_inc(&tcp_sockets_allocated);
1951 return 0;
1954 static int tcp_v4_destroy_sock(struct sock *sk)
1956 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1958 tcp_clear_xmit_timers(sk);
1960 /* Clean up the write buffer. */
1961 tcp_writequeue_purge(sk);
1963 /* Clean up our, hopefully empty, out_of_order_queue. */
1964 __skb_queue_purge(&tp->out_of_order_queue);
1966 /* Clean the prequeue; it really must be empty. */
1967 __skb_queue_purge(&tp->ucopy.prequeue);
1969 /* Clean up a referenced TCP bind bucket. */
1970 if(sk->prev != NULL)
1971 tcp_put_port(sk);
1973 atomic_dec(&tcp_sockets_allocated);
1975 return 0;
1978 /* Proc filesystem TCP sock list dumping. */
1979 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
1981 int ttd = req->expires - jiffies;
1983 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1984 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
1986 req->af.v4_req.loc_addr,
1987 ntohs(sk->sport),
1988 req->af.v4_req.rmt_addr,
1989 ntohs(req->rmt_port),
1990 TCP_SYN_RECV,
1991 0,0, /* could print option size, but that is af dependent. */
1992 1, /* timers active (only the expire timer) */
1993 ttd,
1994 req->retrans,
1995 uid,
1996 0, /* non standard timer */
1997 0, /* open_requests have no inode */
1998 atomic_read(&sk->refcnt),
2003 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2005 unsigned int dest, src;
2006 __u16 destp, srcp;
2007 int timer_active;
2008 unsigned long timer_expires;
2009 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2011 dest = sp->daddr;
2012 src = sp->rcv_saddr;
2013 destp = ntohs(sp->dport);
2014 srcp = ntohs(sp->sport);
2015 if (tp->pending == TCP_TIME_RETRANS) {
2016 timer_active = 1;
2017 timer_expires = tp->timeout;
2018 } else if (tp->pending == TCP_TIME_PROBE0) {
2019 timer_active = 4;
2020 timer_expires = tp->timeout;
2021 } else if (timer_pending(&sp->timer)) {
2022 timer_active = 2;
2023 timer_expires = sp->timer.expires;
2024 } else {
2025 timer_active = 0;
2026 timer_expires = jiffies;
2029 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2030 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2031 i, src, srcp, dest, destp, sp->state,
2032 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2033 timer_active, timer_expires-jiffies,
2034 tp->retransmits,
2035 sock_i_uid(sp),
2036 tp->probes_out,
2037 sock_i_ino(sp),
2038 atomic_read(&sp->refcnt), sp,
2039 tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong, sp->sndbuf
2043 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2045 unsigned int dest, src;
2046 __u16 destp, srcp;
2047 int ttd = tw->ttd - jiffies;
2049 if (ttd < 0)
2050 ttd = 0;
2052 dest = tw->daddr;
2053 src = tw->rcv_saddr;
2054 destp = ntohs(tw->dport);
2055 srcp = ntohs(tw->sport);
2057 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2058 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2059 i, src, srcp, dest, destp, tw->substate, 0, 0,
2060 3, ttd, 0, 0, 0, 0,
2061 atomic_read(&tw->refcnt), tw);
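/* The three helpers above each emit one /proc/net/tcp line; a line for an
 * established socket looks roughly like this (illustrative values only,
 * trailing tcp_opt fields elided):
 *
 *	0: 0100007F:0016 0100007F:8F3C 01 00000000:00000000 00:00000000 00000000     0        0 12345 2 c1a2b3c4 ...
 *
 * i.e. hex local and remote address:port, hex TCP state, tx/rx queue
 * sizes, timer type and expiry, retransmits, uid, probes, inode, refcount
 * and socket pointer, followed by the rto/ato/quick/pingpong/sndbuf
 * values printed by get_tcp_sock().
 */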
2064 #define TMPSZ 150
2066 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2068 int len = 0, num = 0, i;
2069 off_t begin, pos = 0;
2070 char tmpbuf[TMPSZ+1];
2072 if (offset < TMPSZ)
2073 len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2074 " sl local_address rem_address st tx_queue "
2075 "rx_queue tr tm->when retrnsmt uid timeout inode");
2077 pos = TMPSZ;
2079 /* First, walk listening socket table. */
2080 tcp_listen_lock();
2081 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2082 struct sock *sk = tcp_listening_hash[i];
2083 struct tcp_listen_opt *lopt;
2084 int k;
2086 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2087 struct open_request *req;
2088 int uid;
2089 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2091 if (!TCP_INET_FAMILY(sk->family))
2092 goto skip_listen;
2094 pos += TMPSZ;
2095 if (pos >= offset) {
2096 get_tcp_sock(sk, tmpbuf, num);
2097 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2098 if (len >= length) {
2099 tcp_listen_unlock();
2100 goto out_no_bh;
2104 skip_listen:
2105 uid = sock_i_uid(sk);
2106 read_lock_bh(&tp->syn_wait_lock);
2107 lopt = tp->listen_opt;
2108 if (lopt && lopt->qlen != 0) {
2109 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2110 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2111 if (!TCP_INET_FAMILY(req->class->family))
2112 continue;
2114 pos += TMPSZ;
2115 if (pos < offset)
2116 continue;
2117 get_openreq(sk, req, tmpbuf, num, uid);
2118 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2119 if(len >= length) {
2120 read_unlock_bh(&tp->syn_wait_lock);
2121 tcp_listen_unlock();
2122 goto out_no_bh;
2127 read_unlock_bh(&tp->syn_wait_lock);
2129 /* Completed requests are in normal socket hash table */
2132 tcp_listen_unlock();
2134 local_bh_disable();
2136 /* Next, walk established hash chain. */
2137 for (i = 0; i < tcp_ehash_size; i++) {
2138 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2139 struct sock *sk;
2140 struct tcp_tw_bucket *tw;
2142 read_lock(&head->lock);
2143 for(sk = head->chain; sk; sk = sk->next, num++) {
2144 if (!TCP_INET_FAMILY(sk->family))
2145 continue;
2146 pos += TMPSZ;
2147 if (pos < offset)
2148 continue;
2149 get_tcp_sock(sk, tmpbuf, num);
2150 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2151 if(len >= length) {
2152 read_unlock(&head->lock);
2153 goto out;
2156 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2157 tw != NULL;
2158 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2159 if (!TCP_INET_FAMILY(tw->family))
2160 continue;
2161 pos += TMPSZ;
2162 if (pos < offset)
2163 continue;
2164 get_timewait_sock(tw, tmpbuf, num);
2165 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2166 if(len >= length) {
2167 read_unlock(&head->lock);
2168 goto out;
2171 read_unlock(&head->lock);
2174 out:
2175 local_bh_enable();
2176 out_no_bh:
2178 begin = len - (pos - offset);
2179 *start = buffer + begin;
2180 len -= begin;
2181 if(len > length)
2182 len = length;
2183 if (len < 0)
2184 len = 0;
2185 return len;
2188 struct proto tcp_prot = {
2189 name: "TCP",
2190 close: tcp_close,
2191 connect: tcp_v4_connect,
2192 disconnect: tcp_disconnect,
2193 accept: tcp_accept,
2194 ioctl: tcp_ioctl,
2195 init: tcp_v4_init_sock,
2196 destroy: tcp_v4_destroy_sock,
2197 shutdown: tcp_shutdown,
2198 setsockopt: tcp_setsockopt,
2199 getsockopt: tcp_getsockopt,
2200 sendmsg: tcp_sendmsg,
2201 recvmsg: tcp_recvmsg,
2202 backlog_rcv: tcp_v4_do_rcv,
2203 hash: tcp_v4_hash,
2204 unhash: tcp_unhash,
2205 get_port: tcp_v4_get_port,
2210 void __init tcp_v4_init(struct net_proto_family *ops)
2212 int err;
2214 tcp_inode.i_mode = S_IFSOCK;
2215 tcp_inode.i_sock = 1;
2216 tcp_inode.i_uid = 0;
2217 tcp_inode.i_gid = 0;
2218 init_waitqueue_head(&tcp_inode.i_wait);
2219 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2221 tcp_socket->inode = &tcp_inode;
2222 tcp_socket->state = SS_UNCONNECTED;
2223 tcp_socket->type=SOCK_RAW;
2225 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2226 panic("Failed to create the TCP control socket.\n");
2227 tcp_socket->sk->allocation=GFP_ATOMIC;
2228 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2230 /* Unhash it so that IP input processing does not even
2231 * see it; we do not wish this socket to see incoming
2232 * packets.
2234 tcp_socket->sk->prot->unhash(tcp_socket->sk);