2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.189 1999/09/07 02:31:33 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/init.h>
55 #include <linux/ipsec.h>
60 #include <net/inet_common.h>
62 #include <asm/segment.h>
64 #include <linux/inet.h>
65 #include <linux/stddef.h>
67 extern int sysctl_tcp_timestamps
;
68 extern int sysctl_tcp_window_scaling
;
69 extern int sysctl_tcp_sack
;
70 extern int sysctl_tcp_syncookies
;
71 extern int sysctl_tcp_tw_recycle
;
72 extern int sysctl_ip_dynaddr
;
73 extern __u32 sysctl_wmem_max
;
74 extern __u32 sysctl_rmem_max
;
76 /* Check TCP sequence numbers in ICMP packets. */
77 #define ICMP_MIN_LENGTH 8
79 /* Socket used for sending RSTs */
80 struct inode tcp_inode
;
81 struct socket
*tcp_socket
=&tcp_inode
.u
.socket_i
;
83 static void tcp_v4_send_reset(struct sk_buff
*skb
);
85 void tcp_v4_send_check(struct sock
*sk
, struct tcphdr
*th
, int len
,
88 /* This is for sockets with full identity only. Sockets here will always
89 * be without wildcards and will have the following invariant:
90 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
92 * First half of the table is for sockets not in TIME_WAIT, second half
93 * is for TIME_WAIT sockets only.
95 struct tcp_ehash_bucket
*tcp_ehash
= NULL
;
97 /* Ok, let's try this, I give up, we do need a local binding
98 * TCP hash as well as the others for fast bind/connect.
100 struct tcp_bind_hashbucket
*tcp_bhash
= NULL
;
102 int tcp_bhash_size
= 0;
103 int tcp_ehash_size
= 0;
105 /* All sockets in TCP_LISTEN state will be in here. This is the only table
106 * where wildcard'd TCP sockets can exist. Hash function here is just local
109 struct sock
*tcp_listening_hash
[TCP_LHTABLE_SIZE
] = { NULL
, };
110 char __tcp_clean_cacheline_pad
[(SMP_CACHE_BYTES
-
111 (((sizeof(void *) * (TCP_LHTABLE_SIZE
+ 2)) +
112 (sizeof(int) * 2)) % SMP_CACHE_BYTES
))] = { 0, };
114 rwlock_t tcp_lhash_lock
= RW_LOCK_UNLOCKED
;
115 atomic_t tcp_lhash_users
= ATOMIC_INIT(0);
116 DECLARE_WAIT_QUEUE_HEAD(tcp_lhash_wait
);
118 spinlock_t tcp_portalloc_lock
= SPIN_LOCK_UNLOCKED
;
121 * This array holds the first and last local port number.
122 * For high-usage systems, use sysctl to change this to
125 int sysctl_local_port_range
[2] = { 1024, 4999 };
126 int tcp_port_rover
= (1024 - 1);
128 static __inline__
int tcp_hashfn(__u32 laddr
, __u16 lport
,
129 __u32 faddr
, __u16 fport
)
131 int h
= ((laddr
^ lport
) ^ (faddr
^ fport
));
134 return h
& (tcp_ehash_size
- 1);
137 static __inline__
int tcp_sk_hashfn(struct sock
*sk
)
139 __u32 laddr
= sk
->rcv_saddr
;
140 __u16 lport
= sk
->num
;
141 __u32 faddr
= sk
->daddr
;
142 __u16 fport
= sk
->dport
;
144 return tcp_hashfn(laddr
, lport
, faddr
, fport
);
147 /* Allocate and initialize a new TCP local port bind bucket.
148 * The bindhash mutex for snum's hash chain must be held here.
150 struct tcp_bind_bucket
*tcp_bucket_create(struct tcp_bind_hashbucket
*head
,
153 struct tcp_bind_bucket
*tb
;
155 tb
= kmem_cache_alloc(tcp_bucket_cachep
, SLAB_ATOMIC
);
160 if((tb
->next
= head
->chain
) != NULL
)
161 tb
->next
->pprev
= &tb
->next
;
163 tb
->pprev
= &head
->chain
;
168 /* Caller must disable local BH processing. */
169 static __inline__
void __tcp_inherit_port(struct sock
*sk
, struct sock
*child
)
171 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(child
->num
)];
172 struct tcp_bind_bucket
*tb
;
174 spin_lock(&head
->lock
);
175 tb
= (struct tcp_bind_bucket
*)sk
->prev
;
176 if ((child
->bind_next
= tb
->owners
) != NULL
)
177 tb
->owners
->bind_pprev
= &child
->bind_next
;
179 child
->bind_pprev
= &tb
->owners
;
180 child
->prev
= (struct sock
*) tb
;
181 spin_unlock(&head
->lock
);
184 __inline__
void tcp_inherit_port(struct sock
*sk
, struct sock
*child
)
187 __tcp_inherit_port(sk
, child
);
191 /* Obtain a reference to a local port for the given sock,
192 * if snum is zero it means select any available local port.
194 static int tcp_v4_get_port(struct sock
*sk
, unsigned short snum
)
196 struct tcp_bind_hashbucket
*head
;
197 struct tcp_bind_bucket
*tb
;
202 int low
= sysctl_local_port_range
[0];
203 int high
= sysctl_local_port_range
[1];
204 int remaining
= (high
- low
) + 1;
207 spin_lock(&tcp_portalloc_lock
);
208 rover
= tcp_port_rover
;
210 if ((rover
< low
) || (rover
> high
))
212 head
= &tcp_bhash
[tcp_bhashfn(rover
)];
213 spin_lock(&head
->lock
);
214 for (tb
= head
->chain
; tb
; tb
= tb
->next
)
215 if (tb
->port
== rover
)
219 spin_unlock(&head
->lock
);
220 } while (--remaining
> 0);
221 tcp_port_rover
= rover
;
222 spin_unlock(&tcp_portalloc_lock
);
224 /* Exhausted local port range during search? */
229 /* OK, here is the one we will use. HEAD is
230 * non-NULL and we hold it's mutex.
235 head
= &tcp_bhash
[tcp_bhashfn(snum
)];
236 spin_lock(&head
->lock
);
237 for (tb
= head
->chain
; tb
!= NULL
; tb
= tb
->next
)
238 if (tb
->port
== snum
)
241 if (tb
!= NULL
&& tb
->owners
!= NULL
) {
242 if (tb
->fastreuse
!= 0 && sk
->reuse
!= 0) {
245 struct sock
*sk2
= tb
->owners
;
246 int sk_reuse
= sk
->reuse
;
248 for( ; sk2
!= NULL
; sk2
= sk2
->bind_next
) {
249 if (sk
->bound_dev_if
== sk2
->bound_dev_if
) {
252 sk2
->state
== TCP_LISTEN
) {
253 if (!sk2
->rcv_saddr
||
255 (sk2
->rcv_saddr
== sk
->rcv_saddr
))
260 /* If we found a conflict, fail. */
268 (tb
= tcp_bucket_create(head
, snum
)) == NULL
)
270 if (tb
->owners
== NULL
) {
271 if (sk
->reuse
&& sk
->state
!= TCP_LISTEN
)
275 } else if (tb
->fastreuse
&&
276 ((sk
->reuse
== 0) || (sk
->state
== TCP_LISTEN
)))
280 if ((sk
->bind_next
= tb
->owners
) != NULL
)
281 tb
->owners
->bind_pprev
= &sk
->bind_next
;
283 sk
->bind_pprev
= &tb
->owners
;
284 sk
->prev
= (struct sock
*) tb
;
288 spin_unlock(&head
->lock
);
294 /* Get rid of any references to a local port held by the
297 __inline__
void __tcp_put_port(struct sock
*sk
)
299 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(sk
->num
)];
300 struct tcp_bind_bucket
*tb
;
302 spin_lock(&head
->lock
);
303 tb
= (struct tcp_bind_bucket
*) sk
->prev
;
305 sk
->bind_next
->bind_pprev
= sk
->bind_pprev
;
306 *(sk
->bind_pprev
) = sk
->bind_next
;
308 if (tb
->owners
== NULL
) {
310 tb
->next
->pprev
= tb
->pprev
;
311 *(tb
->pprev
) = tb
->next
;
312 kmem_cache_free(tcp_bucket_cachep
, tb
);
314 spin_unlock(&head
->lock
);
317 void tcp_put_port(struct sock
*sk
)
324 #ifdef CONFIG_TCP_TW_RECYCLE
326 Very stupid pseudo-"algorithm". If the approach will be successful
327 (and it will!), we have to make it more reasonable.
328 Now it eats lots of CPU, when we are tough on ports.
330 Apparently, it should be hash table indexed by daddr/dport.
332 How does it work? We allow to truncate time-wait state, if:
334 2. timewait bucket did not receive data for timeout:
335 - initially timeout := 2*RTO, so that if our ACK to first
336 transmitted peer's FIN is lost, we will see first retransmit.
337 - if we receive anything, the timeout is increased exponentially
338 to follow normal TCP backoff pattern.
339 It is important that minimal RTO (HZ/5) > minimal timestamp
341 3. When creating new socket, we inherit sequence number
342 and ts_recent of time-wait bucket, increasing them a bit.
344 These two conditions guarantee, that data will not be corrupted
345 both by retransmitted and by delayed segments. They do not guarantee
346 that peer will leave LAST-ACK/CLOSING state gracefully, it will be
347 reset sometimes, namely, when more than two our ACKs to its FINs are lost.
348 This reset is harmless and even good.
351 int tcp_v4_tw_recycle(struct sock
*sk
, u32 daddr
, u16 dport
)
355 struct tcp_tw_bucket
*tw
;
356 struct tcp_bind_hashbucket
*head
;
357 struct tcp_bind_bucket
*tb
;
359 int low
= sysctl_local_port_range
[0];
360 int high
= sysctl_local_port_range
[1];
361 unsigned long now
= jiffies
;
367 for (i
=0; i
<tcp_bhash_size
; i
++, rover
++) {
368 rover
&= (tcp_bhash_size
-1);
369 head
= &tcp_bhash
[rover
];
371 spin_lock(&head
->lock
);
372 for (tb
= head
->chain
; tb
; tb
= tb
->next
) {
373 tw
= (struct tcp_tw_bucket
*)tb
->owners
;
375 if (tw
->state
!= TCP_TIME_WAIT
||
376 tw
->dport
!= dport
||
377 tw
->daddr
!= daddr
||
378 tw
->rcv_saddr
!= sk
->rcv_saddr
||
381 !TCP_INET_FAMILY(tw
->family
) ||
382 tw
->ts_recent_stamp
== 0 ||
383 (long)(now
- tw
->ttd
) <= 0)
388 spin_unlock(&head
->lock
);
396 if ((sk
->bind_next
= tb
->owners
) != NULL
)
397 tb
->owners
->bind_pprev
= &sk
->bind_next
;
399 sk
->bind_pprev
= &tb
->owners
;
400 sk
->prev
= (struct sock
*) tb
;
401 spin_unlock_bh(&head
->lock
);
407 void tcp_listen_wlock(void)
409 write_lock(&tcp_lhash_lock
);
411 if (atomic_read(&tcp_lhash_users
)) {
412 DECLARE_WAITQUEUE(wait
, current
);
414 add_wait_queue(&tcp_lhash_wait
, &wait
);
416 set_current_state(TASK_UNINTERRUPTIBLE
);
417 if (atomic_read(&tcp_lhash_users
) == 0)
419 write_unlock_bh(&tcp_lhash_lock
);
421 write_lock_bh(&tcp_lhash_lock
);
424 __set_current_state(TASK_RUNNING
);
425 remove_wait_queue(&tcp_lhash_wait
, &wait
);
429 static __inline__
void __tcp_v4_hash(struct sock
*sk
)
434 BUG_TRAP(sk
->pprev
==NULL
);
435 if(sk
->state
== TCP_LISTEN
) {
436 skp
= &tcp_listening_hash
[tcp_sk_listen_hashfn(sk
)];
437 lock
= &tcp_lhash_lock
;
440 skp
= &tcp_ehash
[(sk
->hashent
= tcp_sk_hashfn(sk
))].chain
;
441 lock
= &tcp_ehash
[sk
->hashent
].lock
;
444 if((sk
->next
= *skp
) != NULL
)
445 (*skp
)->pprev
= &sk
->next
;
449 if(sk
->prot
->highestinuse
< sk
->prot
->inuse
)
450 sk
->prot
->highestinuse
= sk
->prot
->inuse
;
454 static void tcp_v4_hash(struct sock
*sk
)
456 if (sk
->state
!= TCP_CLOSE
) {
463 void tcp_unhash(struct sock
*sk
)
467 if (sk
->state
== TCP_LISTEN
) {
470 lock
= &tcp_lhash_lock
;
472 struct tcp_ehash_bucket
*head
= &tcp_ehash
[sk
->hashent
];
474 write_lock_bh(&head
->lock
);
479 sk
->next
->pprev
= sk
->pprev
;
480 *sk
->pprev
= sk
->next
;
484 write_unlock_bh(lock
);
487 /* Don't inline this cruft. Here are some nice properties to
488 * exploit here. The BSD API does not allow a listening TCP
489 * to specify the remote port nor the remote address for the
490 * connection. So always assume those are both wildcarded
491 * during the search since they can never be otherwise.
493 static struct sock
*__tcp_v4_lookup_listener(struct sock
*sk
, u32 daddr
, unsigned short hnum
, int dif
)
495 struct sock
*result
= NULL
;
499 for(; sk
; sk
= sk
->next
) {
500 if(sk
->num
== hnum
) {
501 __u32 rcv_saddr
= sk
->rcv_saddr
;
505 if (rcv_saddr
!= daddr
)
509 if (sk
->bound_dev_if
) {
510 if (sk
->bound_dev_if
!= dif
)
516 if (score
> hiscore
) {
525 /* Optimize the common listener case. */
526 __inline__
struct sock
*tcp_v4_lookup_listener(u32 daddr
, unsigned short hnum
, int dif
)
530 read_lock(&tcp_lhash_lock
);
531 sk
= tcp_listening_hash
[tcp_lhashfn(hnum
)];
533 if (sk
->num
== hnum
&& sk
->next
== NULL
)
535 sk
= __tcp_v4_lookup_listener(sk
, daddr
, hnum
, dif
);
541 read_unlock(&tcp_lhash_lock
);
545 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
546 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
548 * Local BH must be disabled here.
550 static inline struct sock
*__tcp_v4_lookup(u32 saddr
, u16 sport
,
551 u32 daddr
, u16 hnum
, int dif
)
553 struct tcp_ehash_bucket
*head
;
554 TCP_V4_ADDR_COOKIE(acookie
, saddr
, daddr
)
555 __u32 ports
= TCP_COMBINED_PORTS(sport
, hnum
);
559 /* Optimize here for direct hit, only listening connections can
560 * have wildcards anyways.
562 hash
= tcp_hashfn(daddr
, hnum
, saddr
, sport
);
563 head
= &tcp_ehash
[hash
];
564 read_lock(&head
->lock
);
565 for(sk
= head
->chain
; sk
; sk
= sk
->next
) {
566 if(TCP_IPV4_MATCH(sk
, acookie
, saddr
, daddr
, ports
, dif
))
567 goto hit
; /* You sunk my battleship! */
570 /* Must check for a TIME_WAIT'er before going to listener hash. */
571 for(sk
= (head
+ tcp_ehash_size
)->chain
; sk
; sk
= sk
->next
)
572 if(TCP_IPV4_MATCH(sk
, acookie
, saddr
, daddr
, ports
, dif
))
574 read_unlock(&head
->lock
);
576 return tcp_v4_lookup_listener(daddr
, hnum
, dif
);
580 read_unlock(&head
->lock
);
584 __inline__
struct sock
*tcp_v4_lookup(u32 saddr
, u16 sport
, u32 daddr
, u16 dport
, int dif
)
589 sk
= __tcp_v4_lookup(saddr
, sport
, daddr
, ntohs(dport
), dif
);
595 static inline __u32
tcp_v4_init_sequence(struct sock
*sk
, struct sk_buff
*skb
)
597 return secure_tcp_sequence_number(sk
->saddr
, sk
->daddr
,
602 static int tcp_v4_check_established(struct sock
*sk
)
604 u32 daddr
= sk
->rcv_saddr
;
605 u32 saddr
= sk
->daddr
;
606 int dif
= sk
->bound_dev_if
;
607 TCP_V4_ADDR_COOKIE(acookie
, saddr
, daddr
)
608 __u32 ports
= TCP_COMBINED_PORTS(sk
->dport
, sk
->num
);
609 int hash
= tcp_hashfn(daddr
, sk
->num
, saddr
, sk
->dport
);
610 struct tcp_ehash_bucket
*head
= &tcp_ehash
[hash
];
611 struct sock
*sk2
, **skp
;
612 #ifdef CONFIG_TCP_TW_RECYCLE
613 struct tcp_tw_bucket
*tw
;
616 write_lock_bh(&head
->lock
);
618 /* Check TIME-WAIT sockets first. */
619 for(skp
= &(head
+ tcp_ehash_size
)->chain
; (sk2
=*skp
) != NULL
;
621 #ifdef CONFIG_TCP_TW_RECYCLE
622 tw
= (struct tcp_tw_bucket
*)sk2
;
625 if(TCP_IPV4_MATCH(sk2
, acookie
, saddr
, daddr
, ports
, dif
)) {
626 #ifdef CONFIG_TCP_TW_RECYCLE
627 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
629 /* With PAWS, it is safe from the viewpoint
630 of data integrity. Even without PAWS it
631 is safe provided sequence spaces do not
632 overlap i.e. at data rates <= 80Mbit/sec.
634 Actually, the idea is close to VJ's (rfc1332)
635 one, only timestamp cache is held not per host,
636 but per port pair and TW bucket is used
639 if (sysctl_tcp_tw_recycle
&& tw
->ts_recent_stamp
) {
640 if ((tp
->write_seq
= tw
->snd_nxt
+ 2) == 0)
642 tp
->ts_recent
= tw
->ts_recent
;
643 tp
->ts_recent_stamp
= tw
->ts_recent_stamp
;
652 #ifdef CONFIG_TCP_TW_RECYCLE
656 /* And established part... */
657 for(skp
= &head
->chain
; (sk2
=*skp
)!=NULL
; skp
= &sk2
->next
) {
658 if(TCP_IPV4_MATCH(sk2
, acookie
, saddr
, daddr
, ports
, dif
))
662 #ifdef CONFIG_TCP_TW_RECYCLE
665 BUG_TRAP(sk
->pprev
==NULL
);
666 if ((sk
->next
= *skp
) != NULL
)
667 (*skp
)->pprev
= &sk
->next
;
672 if(sk
->prot
->highestinuse
< sk
->prot
->inuse
)
673 sk
->prot
->highestinuse
= sk
->prot
->inuse
;
674 write_unlock_bh(&head
->lock
);
676 #ifdef CONFIG_TCP_TW_RECYCLE
678 /* Silly. Should hash-dance instead... */
680 tcp_tw_deschedule(tw
);
681 tcp_timewait_kill(tw
);
690 write_unlock_bh(&head
->lock
);
691 return -EADDRNOTAVAIL
;
694 /* Hash SYN-SENT socket to established hash table after
695 * checking that it is unique. Note, that without kernel lock
696 * we MUST make these two operations atomically.
698 * Optimization: if it is bound and tcp_bind_bucket has the only
699 * owner (us), we need not to scan established bucket.
702 int tcp_v4_hash_connecting(struct sock
*sk
)
704 unsigned short snum
= sk
->num
;
705 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(snum
)];
706 struct tcp_bind_bucket
*tb
= (struct tcp_bind_bucket
*)sk
->prev
;
708 spin_lock_bh(&head
->lock
);
709 if (tb
->owners
== sk
&& sk
->bind_next
== NULL
) {
711 spin_unlock_bh(&head
->lock
);
714 spin_unlock_bh(&head
->lock
);
716 /* No definite answer... Walk to established hash table */
717 return tcp_v4_check_established(sk
);
721 /* This will initiate an outgoing connection. */
722 int tcp_v4_connect(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
724 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
725 struct sockaddr_in
*usin
= (struct sockaddr_in
*) uaddr
;
726 struct sk_buff
*buff
;
732 if (sk
->state
!= TCP_CLOSE
)
735 if (addr_len
< sizeof(struct sockaddr_in
))
738 if (usin
->sin_family
!= AF_INET
)
739 return(-EAFNOSUPPORT
);
741 nexthop
= daddr
= usin
->sin_addr
.s_addr
;
742 if (sk
->protinfo
.af_inet
.opt
&& sk
->protinfo
.af_inet
.opt
->srr
) {
745 nexthop
= sk
->protinfo
.af_inet
.opt
->faddr
;
748 tmp
= ip_route_connect(&rt
, nexthop
, sk
->saddr
,
749 RT_TOS(sk
->protinfo
.af_inet
.tos
)|RTO_CONN
|sk
->localroute
, sk
->bound_dev_if
);
753 if (rt
->rt_flags
&(RTCF_MULTICAST
|RTCF_BROADCAST
)) {
758 __sk_dst_set(sk
, &rt
->u
.dst
);
760 if (!sk
->protinfo
.af_inet
.opt
|| !sk
->protinfo
.af_inet
.opt
->srr
)
764 buff
= sock_wmalloc(sk
, (MAX_HEADER
+ sk
->prot
->max_header
),
771 sk
->saddr
= rt
->rt_src
;
772 sk
->rcv_saddr
= sk
->saddr
;
775 if (sk
->prot
->get_port(sk
, 0)
776 #ifdef CONFIG_TCP_TW_RECYCLE
777 && (!sysctl_tcp_tw_recycle
||
778 tcp_v4_tw_recycle(sk
, daddr
, usin
->sin_port
))
785 sk
->sport
= htons(sk
->num
);
787 #ifdef CONFIG_TCP_TW_RECYCLE
788 else if (tp
->ts_recent_stamp
&& sk
->daddr
!= daddr
) {
789 /* Reset inherited state */
791 tp
->ts_recent_stamp
= 0;
796 sk
->dport
= usin
->sin_port
;
800 tp
->write_seq
= secure_tcp_sequence_number(sk
->saddr
, sk
->daddr
,
801 sk
->sport
, usin
->sin_port
);
803 tp
->ext_header_len
= 0;
804 if (sk
->protinfo
.af_inet
.opt
)
805 tp
->ext_header_len
= sk
->protinfo
.af_inet
.opt
->optlen
;
809 err
= tcp_connect(sk
, buff
);
819 static int tcp_v4_sendmsg(struct sock
*sk
, struct msghdr
*msg
, int len
)
821 int retval
= -EINVAL
;
825 /* Do sanity checking for sendmsg/sendto/send. */
826 if (msg
->msg_flags
& ~(MSG_OOB
|MSG_DONTROUTE
|MSG_DONTWAIT
|MSG_NOSIGNAL
))
829 struct sockaddr_in
*addr
=(struct sockaddr_in
*)msg
->msg_name
;
831 if (msg
->msg_namelen
< sizeof(*addr
))
833 if (addr
->sin_family
&& addr
->sin_family
!= AF_INET
)
836 if(sk
->state
== TCP_CLOSE
)
839 if (addr
->sin_port
!= sk
->dport
)
841 if (addr
->sin_addr
.s_addr
!= sk
->daddr
)
844 retval
= tcp_do_sendmsg(sk
, msg
);
853 * Do a linear search in the socket open_request list.
854 * This should be replaced with a global hash table.
856 static struct open_request
*tcp_v4_search_req(struct tcp_opt
*tp
,
859 struct open_request
**prevp
)
861 struct open_request
*req
, *prev
;
862 __u16 rport
= th
->source
;
864 /* assumption: the socket is not in use.
865 * as we checked the user count on tcp_rcv and we're
866 * running from a soft interrupt.
868 prev
= (struct open_request
*) (&tp
->syn_wait_queue
);
869 for (req
= prev
->dl_next
; req
; req
= req
->dl_next
) {
870 if (req
->af
.v4_req
.rmt_addr
== iph
->saddr
&&
871 req
->af
.v4_req
.loc_addr
== iph
->daddr
&&
872 req
->rmt_port
== rport
&&
873 TCP_INET_FAMILY(req
->class->family
)) {
875 /* Weird case: connection was established
876 and then killed by RST before user accepted
877 it. This connection is dead, but we cannot
878 kill openreq to avoid blocking in accept().
880 accept() will collect this garbage,
881 but such reqs must be ignored, when talking
884 bh_lock_sock(req
->sk
);
885 BUG_TRAP(req
->sk
->lock
.users
==0);
886 if (req
->sk
->state
== TCP_CLOSE
) {
887 bh_unlock_sock(req
->sk
);
902 * This routine does path mtu discovery as defined in RFC1191.
904 static inline void do_pmtu_discovery(struct sock
*sk
, struct iphdr
*ip
, unsigned mtu
)
906 struct dst_entry
*dst
;
907 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
909 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
910 * send out by Linux are always <576bytes so they should go through
913 if (sk
->state
== TCP_LISTEN
)
916 /* We don't check in the destentry if pmtu discovery is forbidden
917 * on this route. We just assume that no packet_to_big packets
918 * are send back when pmtu discovery is not active.
919 * There is a small race when the user changes this flag in the
920 * route, but I think that's acceptable.
922 if ((dst
= __sk_dst_check(sk
, 0)) == NULL
)
925 ip_rt_update_pmtu(dst
, mtu
);
927 /* Something is about to be wrong... Remember soft error
928 * for the case, if this connection will not able to recover.
930 if (mtu
< dst
->pmtu
&& ip_dont_fragment(sk
, dst
))
931 sk
->err_soft
= EMSGSIZE
;
933 if (sk
->protinfo
.af_inet
.pmtudisc
!= IP_PMTUDISC_DONT
&&
934 tp
->pmtu_cookie
> dst
->pmtu
) {
935 tcp_sync_mss(sk
, dst
->pmtu
);
937 /* Resend the TCP packet because it's
938 * clear that the old packet has been
939 * dropped. This is the new "fast" path mtu
942 tcp_simple_retransmit(sk
);
943 } /* else let the usual retransmit timer handle it */
947 * This routine is called by the ICMP module when it gets some
948 * sort of error condition. If err < 0 then the socket should
949 * be closed and the error returned to the user. If err > 0
950 * it's just the icmp type << 8 | icmp code. After adjustment
951 * header points to the first 8 bytes of the tcp header. We need
952 * to find the appropriate port.
954 * The locking strategy used here is very "optimistic". When
955 * someone else accesses the socket the ICMP is just dropped
956 * and for some paths there is no check at all.
957 * A more general error queue to queue errors for later handling
958 * is probably better.
962 void tcp_v4_err(struct sk_buff
*skb
, unsigned char *dp
, int len
)
964 struct iphdr
*iph
= (struct iphdr
*)dp
;
967 int type
= skb
->h
.icmph
->type
;
968 int code
= skb
->h
.icmph
->code
;
969 #if ICMP_MIN_LENGTH < 14
978 if (len
< (iph
->ihl
<< 2) + ICMP_MIN_LENGTH
) {
979 icmp_statistics
.IcmpInErrors
++;
982 #if ICMP_MIN_LENGTH < 14
983 if (len
< (iph
->ihl
<< 2) + 14)
987 th
= (struct tcphdr
*)(dp
+(iph
->ihl
<<2));
989 sk
= tcp_v4_lookup(iph
->daddr
, th
->dest
, iph
->saddr
, th
->source
, skb
->dev
->ifindex
);
991 icmp_statistics
.IcmpInErrors
++;
994 if (sk
->state
== TCP_TIME_WAIT
) {
995 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
1000 /* If too many ICMPs get dropped on busy
1001 * servers this needs to be solved differently.
1003 if (sk
->lock
.users
!= 0)
1004 net_statistics
.LockDroppedIcmps
++;
1006 tp
= &sk
->tp_pinfo
.af_tcp
;
1007 seq
= ntohl(th
->seq
);
1008 if (sk
->state
!= TCP_LISTEN
&& !between(seq
, tp
->snd_una
, tp
->snd_nxt
)) {
1009 net_statistics
.OutOfWindowIcmps
++;
1014 case ICMP_SOURCE_QUENCH
:
1015 #ifndef OLD_SOURCE_QUENCH /* This is deprecated */
1016 if (sk
->lock
.users
== 0) {
1017 tp
->snd_ssthresh
= tcp_recalc_ssthresh(tp
);
1018 tp
->snd_cwnd
= tp
->snd_ssthresh
;
1019 tp
->snd_cwnd_cnt
= 0;
1020 tp
->high_seq
= tp
->snd_nxt
;
1024 case ICMP_PARAMETERPROB
:
1027 case ICMP_DEST_UNREACH
:
1028 if (code
> NR_ICMP_UNREACH
)
1031 if (code
== ICMP_FRAG_NEEDED
) { /* PMTU discovery (RFC1191) */
1032 if (sk
->lock
.users
== 0)
1033 do_pmtu_discovery(sk
, iph
, ntohs(skb
->h
.icmph
->un
.frag
.mtu
));
1037 err
= icmp_err_convert
[code
].errno
;
1039 case ICMP_TIME_EXCEEDED
:
1046 switch (sk
->state
) {
1047 struct open_request
*req
, *prev
;
1049 if (sk
->lock
.users
!= 0)
1052 /* The final ACK of the handshake should be already
1053 * handled in the new socket context, not here.
1054 * Strictly speaking - an ICMP error for the final
1055 * ACK should set the opening flag, but that is too
1056 * complicated right now.
1058 if (!no_flags
&& !th
->syn
&& !th
->ack
)
1061 req
= tcp_v4_search_req(tp
, iph
, th
, &prev
);
1066 struct sock
*nsk
= req
->sk
;
1069 * Already in ESTABLISHED and a big socket is created,
1070 * set error code there.
1071 * The error will _not_ be reported in the accept(),
1072 * but only with the next operation on the socket after
1080 BUG_TRAP(sk
->lock
.users
== 0);
1081 tp
= &sk
->tp_pinfo
.af_tcp
;
1082 if (!between(seq
, tp
->snd_una
, tp
->snd_nxt
)) {
1083 net_statistics
.OutOfWindowIcmps
++;
1087 if (seq
!= req
->snt_isn
) {
1088 net_statistics
.OutOfWindowIcmps
++;
1093 * Still in SYN_RECV, just remove it silently.
1094 * There is no good way to pass the error to the newly
1095 * created socket, and POSIX does not want network
1096 * errors returned from accept().
1099 tcp_synq_unlink(tp
, req
, prev
);
1100 tcp_dec_slow_timer(TCP_SLT_SYNACK
);
1101 req
->class->destructor(req
);
1102 tcp_openreq_free(req
);
1107 case TCP_SYN_RECV
: /* Cannot happen.
1108 It can f.e. if SYNs crossed.
1110 if (!no_flags
&& !th
->syn
)
1112 if (sk
->lock
.users
== 0) {
1113 tcp_statistics
.TcpAttemptFails
++;
1115 /* Wake people up to see the error (see connect in sock.c) */
1116 sk
->error_report(sk
);
1118 tcp_set_state(sk
, TCP_CLOSE
);
1126 /* If we've already connected we will keep trying
1127 * until we time out, or the user gives up.
1129 * rfc1122 4.2.3.9 allows to consider as hard errors
1130 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1131 * but it is obsoleted by pmtu discovery).
1133 * Note, that in modern internet, where routing is unreliable
1134 * and in each dark corner broken firewalls sit, sending random
1135 * errors ordered by their masters even these two messages finally lose
1136 * their original sense (even Linux sends invalid PORT_UNREACHs)
1138 * Now we are in compliance with RFCs.
1142 if (sk
->lock
.users
== 0 && sk
->protinfo
.af_inet
.recverr
) {
1144 sk
->error_report(sk
);
1145 } else { /* Only an error on timeout */
1154 /* This routine computes an IPv4 TCP checksum. */
1155 void tcp_v4_send_check(struct sock
*sk
, struct tcphdr
*th
, int len
,
1156 struct sk_buff
*skb
)
1159 th
->check
= tcp_v4_check(th
, len
, sk
->saddr
, sk
->daddr
,
1160 csum_partial((char *)th
, th
->doff
<<2, skb
->csum
));
1164 * This routine will send an RST to the other tcp.
1166 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1168 * Answer: if a packet caused RST, it is not for a socket
1169 * existing in our system, if it is matched to a socket,
1170 * it is just duplicate segment or bug in other side's TCP.
1171 * So that we build reply only basing on parameters
1172 * arrived with segment.
1173 * Exception: precedence violation. We do not implement it in any case.
1176 static void tcp_v4_send_reset(struct sk_buff
*skb
)
1178 struct tcphdr
*th
= skb
->h
.th
;
1180 struct ip_reply_arg arg
;
1182 /* Never send a reset in response to a reset. */
1186 if (((struct rtable
*)skb
->dst
)->rt_type
!= RTN_LOCAL
)
1189 /* Swap the send and the receive. */
1190 memset(&rth
, 0, sizeof(struct tcphdr
));
1191 rth
.dest
= th
->source
;
1192 rth
.source
= th
->dest
;
1193 rth
.doff
= sizeof(struct tcphdr
)/4;
1197 rth
.seq
= th
->ack_seq
;
1200 rth
.ack_seq
= htonl(ntohl(th
->seq
) + th
->syn
+ th
->fin
1201 + skb
->len
- (th
->doff
<<2));
1204 memset(&arg
, 0, sizeof arg
);
1205 arg
.iov
[0].iov_base
= (unsigned char *)&rth
;
1206 arg
.iov
[0].iov_len
= sizeof rth
;
1207 arg
.csum
= csum_tcpudp_nofold(skb
->nh
.iph
->daddr
,
1208 skb
->nh
.iph
->saddr
, /*XXX*/
1209 sizeof(struct tcphdr
),
1213 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
1215 ip_send_reply(tcp_socket
->sk
, skb
, &arg
, sizeof rth
);
1217 tcp_statistics
.TcpOutSegs
++;
1218 tcp_statistics
.TcpOutRsts
++;
1221 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1222 outside socket context is ugly, certainly. What can I do?
1225 static void tcp_v4_send_ack(struct sk_buff
*skb
, u32 seq
, u32 ack
, u32 win
, u32 ts
)
1227 struct tcphdr
*th
= skb
->h
.th
;
1232 struct ip_reply_arg arg
;
1234 memset(&rep
.th
, 0, sizeof(struct tcphdr
));
1235 memset(&arg
, 0, sizeof arg
);
1237 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
1238 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
1241 rep
.tsopt
[0] = __constant_htonl((TCPOPT_NOP
<< 24) |
1242 (TCPOPT_NOP
<< 16) |
1243 (TCPOPT_TIMESTAMP
<< 8) |
1245 rep
.tsopt
[1] = htonl(tcp_time_stamp
);
1246 rep
.tsopt
[2] = htonl(ts
);
1247 arg
.iov
[0].iov_len
= sizeof(rep
);
1250 /* Swap the send and the receive. */
1251 rep
.th
.dest
= th
->source
;
1252 rep
.th
.source
= th
->dest
;
1253 rep
.th
.doff
= arg
.iov
[0].iov_len
/4;
1254 rep
.th
.seq
= htonl(seq
);
1255 rep
.th
.ack_seq
= htonl(ack
);
1257 rep
.th
.window
= htons(win
);
1259 arg
.csum
= csum_tcpudp_nofold(skb
->nh
.iph
->daddr
,
1260 skb
->nh
.iph
->saddr
, /*XXX*/
1264 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
1266 ip_send_reply(tcp_socket
->sk
, skb
, &arg
, arg
.iov
[0].iov_len
);
1268 tcp_statistics
.TcpOutSegs
++;
1271 static void tcp_v4_timewait_ack(struct sock
*sk
, struct sk_buff
*skb
)
1273 struct tcp_tw_bucket
*tw
= (struct tcp_tw_bucket
*)sk
;
1275 tcp_v4_send_ack(skb
, tw
->snd_nxt
, tw
->rcv_nxt
, 0, tw
->ts_recent
);
1280 static void tcp_v4_or_send_ack(struct sk_buff
*skb
, struct open_request
*req
)
1282 tcp_v4_send_ack(skb
, req
->snt_isn
+1, req
->rcv_isn
+1, req
->rcv_wnd
, req
->ts_recent
);
1286 * Send a SYN-ACK after having received an ACK.
1287 * This still operates on a open_request only, not on a big
1290 static void tcp_v4_send_synack(struct sock
*sk
, struct open_request
*req
)
1293 struct ip_options
*opt
;
1294 struct sk_buff
* skb
;
1296 /* First, grab a route. */
1297 opt
= req
->af
.v4_req
.opt
;
1298 if(ip_route_output(&rt
, ((opt
&& opt
->srr
) ?
1300 req
->af
.v4_req
.rmt_addr
),
1301 req
->af
.v4_req
.loc_addr
,
1302 RT_TOS(sk
->protinfo
.af_inet
.tos
) | RTO_CONN
| sk
->localroute
,
1303 sk
->bound_dev_if
)) {
1304 ip_statistics
.IpOutNoRoutes
++;
1307 if(opt
&& opt
->is_strictroute
&& rt
->rt_dst
!= rt
->rt_gateway
) {
1309 ip_statistics
.IpOutNoRoutes
++;
1313 skb
= tcp_make_synack(sk
, &rt
->u
.dst
, req
);
1316 struct tcphdr
*th
= skb
->h
.th
;
1318 th
->check
= tcp_v4_check(th
, skb
->len
,
1319 req
->af
.v4_req
.loc_addr
, req
->af
.v4_req
.rmt_addr
,
1320 csum_partial((char *)th
, skb
->len
, skb
->csum
));
1322 ip_build_and_send_pkt(skb
, sk
, req
->af
.v4_req
.loc_addr
,
1323 req
->af
.v4_req
.rmt_addr
, req
->af
.v4_req
.opt
);
1329 * IPv4 open_request destructor.
1331 static void tcp_v4_or_free(struct open_request
*req
)
1333 if(!req
->sk
&& req
->af
.v4_req
.opt
)
1334 kfree_s(req
->af
.v4_req
.opt
, optlength(req
->af
.v4_req
.opt
));
1337 static inline void syn_flood_warning(struct sk_buff
*skb
)
1339 static unsigned long warntime
;
1341 if (jiffies
- warntime
> HZ
*60) {
1344 "possible SYN flooding on port %d. Sending cookies.\n",
1345 ntohs(skb
->h
.th
->dest
));
1350 * Save and compile IPv4 options into the open_request if needed.
1352 static inline struct ip_options
*
1353 tcp_v4_save_options(struct sock
*sk
, struct sk_buff
*skb
)
1355 struct ip_options
*opt
= &(IPCB(skb
)->opt
);
1356 struct ip_options
*dopt
= NULL
;
1358 if (opt
&& opt
->optlen
) {
1359 int opt_size
= optlength(opt
);
1360 dopt
= kmalloc(opt_size
, GFP_ATOMIC
);
1362 if (ip_options_echo(dopt
, skb
)) {
1363 kfree_s(dopt
, opt_size
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1372 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1373 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1374 * It would be better to replace it with a global counter for all sockets
1375 * but then some measure against one socket starving all other sockets
/* Tunable: per-listener SYN backlog limit, read by BACKLOGMAX() below. */
1378 int sysctl_max_syn_backlog
= 128;
/*
 * or_calltable for IPv4 open_requests.  NOTE(review): the initializer
 * body (original lines 1381-1387) was lost in extraction; presumably it
 * wires up tcp_v4_send_synack / tcp_v4_or_free etc. — verify upstream.
 */
1380 struct or_calltable or_ipv4
= {
/* Accessors for the per-listener SYN backlog counter and its cap. */
1388 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1389 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_conn_request(): handle an incoming SYN on a LISTEN socket.
 * Allocates an open_request, parses TCP options into it, optionally
 * generates a SYN-cookie ISN under backlog pressure, sends the SYN-ACK
 * and queues the request on the listener's SYN queue.
 */
1391 int tcp_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
1394 struct open_request
*req
;
1395 struct tcphdr
*th
= skb
->h
.th
;
1396 __u32 saddr
= skb
->nh
.iph
->saddr
;
1397 __u32 daddr
= skb
->nh
.iph
->daddr
;
/* isn arrives via the skb control block (set by TIME_WAIT recycling). */
1398 __u32 isn
= TCP_SKB_CB(skb
)->when
;
1399 #ifdef CONFIG_SYN_COOKIES
1400 int want_cookie
= 0;
1402 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1405 /* Never answer to SYNs send to broadcast or multicast */
1406 if (((struct rtable
*)skb
->dst
)->rt_flags
&
1407 (RTCF_BROADCAST
|RTCF_MULTICAST
))
1410 /* XXX: Check against a global syn pool counter. */
/* Backlog full: fall back to SYN cookies if enabled, else (dropped lines) drop. */
1411 if (BACKLOG(sk
) > BACKLOGMAX(sk
)) {
1412 #ifdef CONFIG_SYN_COOKIES
1413 if (sysctl_tcp_syncookies
&& !isn
) {
1414 syn_flood_warning(skb
);
1421 isn
= tcp_v4_init_sequence(sk
, skb
);
1425 req
= tcp_openreq_alloc();
1430 req
->rcv_wnd
= 0; /* So that tcp_send_synack() knows! */
1432 req
->rcv_isn
= TCP_SKB_CB(skb
)->seq
;
/* Reset option state before parsing; tp here is a local tcp_opt
 * (its declaration is among the dropped lines). */
1433 tp
.tstamp_ok
= tp
.sack_ok
= tp
.wscale_ok
= tp
.snd_wscale
= 0;
1436 tp
.user_mss
= sk
->tp_pinfo
.af_tcp
.user_mss
;
1438 tcp_parse_options(NULL
, th
, &tp
, want_cookie
);
/* Copy the negotiated option results into the open_request. */
1440 req
->mss
= tp
.mss_clamp
;
1441 req
->ts_recent
= tp
.saw_tstamp
? tp
.rcv_tsval
: 0;
1442 req
->tstamp_ok
= tp
.tstamp_ok
;
1443 req
->sack_ok
= tp
.sack_ok
;
1444 req
->snd_wscale
= tp
.snd_wscale
;
1445 req
->wscale_ok
= tp
.wscale_ok
;
1446 req
->rmt_port
= th
->source
;
1447 req
->af
.v4_req
.loc_addr
= daddr
;
1448 req
->af
.v4_req
.rmt_addr
= saddr
;
1450 /* Note that we ignore the isn passed from the TIME_WAIT
1451 * state here. That's the price we pay for cookies.
1453 * RED-PEN. The price is high... Then we cannot kill TIME-WAIT
1454 * and should reject connection attempt, duplicates with random
1455 * sequence number can corrupt data. Right?
1456 * I disabled sending cookie to request matching to a timewait
1460 isn
= cookie_v4_init_sequence(sk
, skb
, &req
->mss
);
1464 req
->af
.v4_req
.opt
= tcp_v4_save_options(sk
, skb
);
1466 req
->class = &or_ipv4
;
1470 tcp_v4_send_synack(sk
, req
);
/* Cookie path: the request is not queued, so tear it down now.
 * NOTE(review): opt is kfree'd here and tcp_v4_or_free() frees it
 * again when req->sk is unset — looks like a double free; the lines
 * lost in extraction may clear the pointer first.  Verify upstream. */
1473 if (req
->af
.v4_req
.opt
)
1474 kfree(req
->af
.v4_req
.opt
)
;
1475 tcp_v4_or_free(req
);
1476 tcp_openreq_free(req
);
/* Non-cookie path: arm the SYN-ACK retransmit and queue the request. */
1478 req
->expires
= jiffies
+ TCP_TIMEOUT_INIT
;
1479 tcp_inc_slow_timer(TCP_SLT_SYNACK
);
1480 tcp_synq_queue(&sk
->tp_pinfo
.af_tcp
, req
);
1489 tcp_statistics
.TcpAttemptFails
++;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1495 * The three way handshake has completed - we got a valid synack -
1496 * now create the new socket.
/*
 * tcp_v4_syn_recv_sock(): build the child socket for a completed
 * handshake: resolve a route (honouring strict source routing via
 * opt->faddr), clone the listener with tcp_create_openreq_child(),
 * copy addresses/IP options from the open_request, size the buffers,
 * then hash the child and inherit the listener's port.
 */
1498 struct sock
* tcp_v4_syn_recv_sock(struct sock
*sk
, struct sk_buff
*skb
,
1499 struct open_request
*req
,
1500 struct dst_entry
*dst
)
1502 struct ip_options
*opt
= req
->af
.v4_req
.opt
;
1503 struct tcp_opt
*newtp
;
/* Accept-queue overflow: refuse to create the child. */
1506 if (sk
->ack_backlog
> sk
->max_ack_backlog
)
1507 goto exit
; /* head drop */
/* Source-routed connections route to the first-hop address instead. */
1511 if (ip_route_output(&rt
,
1512 opt
&& opt
->srr
? opt
->faddr
: req
->af
.v4_req
.rmt_addr
,
1513 req
->af
.v4_req
.loc_addr
, sk
->protinfo
.af_inet
.tos
|RTO_CONN
, 0))
1518 newsk
= tcp_create_openreq_child(sk
, req
, skb
);
1522 sk
->tp_pinfo
.af_tcp
.syn_backlog
--;
1525 newsk
->dst_cache
= dst
;
1527 newtp
= &(newsk
->tp_pinfo
.af_tcp
);
/* The child takes the request's 4-tuple and saved IP options. */
1528 newsk
->daddr
= req
->af
.v4_req
.rmt_addr
;
1529 newsk
->saddr
= req
->af
.v4_req
.loc_addr
;
1530 newsk
->rcv_saddr
= req
->af
.v4_req
.loc_addr
;
1531 newsk
->protinfo
.af_inet
.opt
= req
->af
.v4_req
.opt
;
1532 newsk
->protinfo
.af_inet
.mc_index
= ((struct rtable
*)skb
->dst
)->rt_iif
;
1533 newsk
->protinfo
.af_inet
.mc_ttl
= skb
->nh
.iph
->ttl
;
/* IP options enlarge every header; account for them in the MSS math. */
1534 newtp
->ext_header_len
= 0;
1535 if (newsk
->protinfo
.af_inet
.opt
)
1536 newtp
->ext_header_len
= newsk
->protinfo
.af_inet
.opt
->optlen
;
1538 tcp_sync_mss(newsk
, dst
->pmtu
);
1539 tcp_initialize_rcv_mss(newsk
);
/* Ensure buffers hold at least ~3 full segments, capped by sysctls. */
1541 if (newsk
->rcvbuf
< (3 * (dst
->advmss
+40+MAX_HEADER
+15)))
1542 newsk
->rcvbuf
= min ((3 * (dst
->advmss
+40+MAX_HEADER
+15)), sysctl_rmem_max
);
1543 if (newsk
->sndbuf
< (3 * (newtp
->mss_clamp
+40+MAX_HEADER
+15)))
1544 newsk
->sndbuf
= min ((3 * (newtp
->mss_clamp
+40+MAX_HEADER
+15)), sysctl_wmem_max
);
1546 bh_lock_sock(newsk
);
1548 __tcp_v4_hash(newsk
);
1549 __tcp_inherit_port(sk
, newsk
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_hnd_req(): for a segment arriving on a LISTEN socket, look up
 * a matching pending open_request and let tcp_check_req() advance the
 * handshake; otherwise, with SYN cookies compiled in, try to validate
 * the segment as a cookie ACK.  Returns the socket to process against.
 */
1559 static struct sock
*tcp_v4_hnd_req(struct sock
*sk
,struct sk_buff
*skb
)
1561 struct open_request
*req
, *prev
;
1562 struct tcphdr
*th
= skb
->h
.th
;
1563 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1565 /* Find possible connection requests. */
1566 req
= tcp_v4_search_req(tp
, skb
->nh
.iph
, th
, &prev
);
1568 return tcp_check_req(sk
, skb
, req
, prev
);
1570 #ifdef CONFIG_SYN_COOKIES
/* No queued request: a non-RST SYN/ACK may still be a valid cookie. */
1571 if (!th
->rst
&& (th
->syn
|| th
->ack
))
1572 sk
= cookie_v4_check(sk
, skb
, &(IPCB(skb
)->opt
));
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_csum_verify(): validate the TCP checksum according to
 * skb->ip_summed.  Computes the partial sum if the hardware did not,
 * folds it with the pseudo-header via tcp_v4_check(), logs bad
 * checksums under NETDEBUG, and marks good skbs CHECKSUM_UNNECESSARY.
 * Non-zero return (in dropped lines) indicates a bad checksum.
 */
1577 static int tcp_csum_verify(struct sk_buff
*skb
)
1579 switch (skb
->ip_summed
) {
/* No hardware assist: sum the TCP header+payload from scratch. */
1581 skb
->csum
= csum_partial((char *)skb
->h
.th
, skb
->len
, 0);
1583 if (tcp_v4_check(skb
->h
.th
,skb
->len
,skb
->nh
.iph
->saddr
,skb
->nh
.iph
->daddr
,skb
->csum
)) {
1584 NETDEBUG(printk(KERN_DEBUG
"TCPv4 bad checksum "
1585 "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
1587 NIPQUAD(skb
->nh
.iph
->saddr
),
1588 ntohs(skb
->h
.th
->source
),
1589 NIPQUAD(skb
->nh
.iph
->daddr
),
1590 ntohs(skb
->h
.th
->dest
),
1592 ntohs(skb
->nh
.iph
->tot_len
)));
/* Verified once; later paths can skip re-checking. */
1595 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
1597 /* CHECKSUM_UNNECESSARY */
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1603 /* The socket must have it's spinlock held when we get
1606 * We have a potential double-lock case here, so even when
1607 * doing backlog processing we use the BH locking scheme.
1608 * This is because we cannot sleep with the original spinlock
/*
 * tcp_v4_do_rcv(): per-socket receive dispatch.  Fast path for
 * ESTABLISHED sockets, open_request / SYN-cookie handling for LISTEN
 * sockets, generic state processing otherwise.  Bad segments get a RST.
 */
1611 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1613 #ifdef CONFIG_FILTER
/* Socket filter (BPF) may veto the packet before any TCP work. */
1614 struct sk_filter
*filter
= sk
->filter
;
1615 if (filter
&& sk_filter(skb
, filter
))
1617 #endif /* CONFIG_FILTER */
1620 * This doesn't check if the socket has enough room for the packet.
1621 * Either process the packet _without_ queueing it and then free it,
1622 * or do the check later.
1624 skb_set_owner_r(skb
, sk
);
1626 if (sk
->state
== TCP_ESTABLISHED
) { /* Fast path */
1627 /* Ready to move deeper ... */
1628 if (tcp_csum_verify(skb
))
1630 if (tcp_rcv_established(sk
, skb
, skb
->h
.th
, skb
->len
))
/* Slow path: verify checksum before any state processing. */
1635 if (tcp_csum_verify(skb
))
1638 if (sk
->state
== TCP_LISTEN
) {
1641 nsk
= tcp_v4_hnd_req(sk
, skb
);
1646 * Queue it on the new socket if the new socket is active,
1647 * otherwise we just shortcircuit this and continue with
/* Handshake produced a child socket: process the segment against it. */
1652 int state
= nsk
->state
;
1656 BUG_TRAP(nsk
->lock
.users
== 0);
1657 skb_set_owner_r(skb
, nsk
);
1658 ret
= tcp_rcv_state_process(nsk
, skb
, skb
->h
.th
, skb
->len
);
1660 /* Wakeup parent, send SIGIO, if this packet changed
1661 socket state from SYN-RECV.
1663 It still looks ugly, however it is much better
1664 than miracleous double wakeup in syn_recv_sock()
1665 and tcp_rcv_state_process().
1667 if (state
== TCP_SYN_RECV
&& nsk
->state
!= state
)
1668 sk
->data_ready(sk
, 0);
1670 bh_unlock_sock(nsk
);
1677 if (tcp_rcv_state_process(sk
, skb
, skb
->h
.th
, skb
->len
))
/* Invalid segment for this socket: answer with a reset. */
1682 tcp_v4_send_reset(skb
);
1685 /* Be careful here. If this function gets more complicated and
1686 * gcc suffers from register pressure on the x86, sk (in %ebx)
1687 * might be destroyed here. This current version compiles correctly,
1688 * but you have been warned.
1693 tcp_statistics
.TcpInErrs
++;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_rcv(): entry point from the IP layer.  Parses the header into
 * the skb control block, looks up the owning socket by 4-tuple, applies
 * ipsec policy, and either processes immediately (socket unlocked),
 * backlogs (socket in use by a process), sends a RST (no socket), or
 * runs the TIME_WAIT state machine.
 */
1701 int tcp_v4_rcv(struct sk_buff
*skb
, unsigned short len
)
/* Only segments addressed to this host are processed. */
1707 if (skb
->pkt_type
!=PACKET_HOST
)
1712 /* Pull up the IP header. */
1713 __skb_pull(skb
, skb
->h
.raw
- skb
->data
);
1715 /* Count it even if it's bad */
1716 tcp_statistics
.TcpInSegs
++;
1718 if (len
< sizeof(struct tcphdr
))
/* Pre-decode sequence numbers once into the skb control block. */
1721 TCP_SKB_CB(skb
)->seq
= ntohl(th
->seq
);
1722 TCP_SKB_CB(skb
)->end_seq
= (TCP_SKB_CB(skb
)->seq
+ th
->syn
+ th
->fin
+
1724 TCP_SKB_CB(skb
)->ack_seq
= ntohl(th
->ack_seq
);
1725 TCP_SKB_CB(skb
)->when
= 0;
1728 sk
= __tcp_v4_lookup(skb
->nh
.iph
->saddr
, th
->source
,
1729 skb
->nh
.iph
->daddr
, ntohs(th
->dest
), skb
->dev
->ifindex
);
1735 if(!ipsec_sk_policy(sk
,skb
))
1736 goto discard_and_relse
;
1738 if (sk
->state
== TCP_TIME_WAIT
)
/* Process now if no user holds the socket, else queue on backlog. */
1743 if (!sk
->lock
.users
)
1744 ret
= tcp_v4_do_rcv(sk
, skb
);
1746 sk_add_backlog(sk
, skb
);
/* No owning socket: verify checksum, then reply with a reset. */
1754 if (tcp_csum_verify(skb
)) {
1756 tcp_statistics
.TcpInErrs
++;
1758 tcp_v4_send_reset(skb
);
1762 /* Discard frame. */
/* TIME_WAIT handling starts here. */
1771 if (tcp_csum_verify(skb
)) {
1772 tcp_statistics
.TcpInErrs
++;
1773 goto discard_and_relse
;
1775 switch(tcp_timewait_state_process((struct tcp_tw_bucket
*)sk
,
1776 skb
, th
, skb
->len
)) {
/* TW recycling: a new SYN may legitimately reuse the old 4-tuple. */
1781 sk2
= tcp_v4_lookup_listener(skb
->nh
.iph
->daddr
, ntohs(th
->dest
), skb
->dev
->ifindex
);
1783 tcp_tw_deschedule((struct tcp_tw_bucket
*)sk
);
1784 tcp_timewait_kill((struct tcp_tw_bucket
*)sk
);
1785 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
1789 /* Fall through to ACK */
1792 tcp_v4_timewait_ack(sk
, skb
);
1796 case TCP_TW_SUCCESS
:
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * __tcp_v4_rehash(): move a socket to its new established-hash bucket
 * after its identity (source address) changed.  Unlinks from the old
 * chain, then links at the head of the new chain, holding the two
 * bucket rwlocks in sequence with BH disabled across the whole move.
 */
1801 static void __tcp_v4_rehash(struct sock
*sk
)
1803 struct tcp_ehash_bucket
*oldhead
= &tcp_ehash
[sk
->hashent
];
/* Recompute the hash slot; assignment to sk->hashent is intentional. */
1804 struct tcp_ehash_bucket
*head
= &tcp_ehash
[(sk
->hashent
= tcp_sk_hashfn(sk
))];
1805 struct sock
**skp
= &head
->chain
;
1807 write_lock_bh(&oldhead
->lock
);
/* Unlink from the old doubly-linked pprev/next chain. */
1810 sk
->next
->pprev
= sk
->pprev
;
1811 *sk
->pprev
= sk
->next
;
/* BH stays disabled between the unlock of old and lock of new bucket. */
1814 write_unlock(&oldhead
->lock
);
1815 write_lock(&head
->lock
);
1816 if((sk
->next
= *skp
) != NULL
)
1817 (*skp
)->pprev
= &sk
->next
;
1820 write_unlock_bh(&head
->lock
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_rebuild_header(): refresh the cached route before transmit.
 * With ip_dynaddr set and the socket still in SYN_SENT, the source
 * address may be rewritten to follow a dynamic-IP change, which forces
 * a rehash of the socket.  Obsolete routes are re-resolved in place.
 */
1823 int tcp_v4_rebuild_header(struct sock
*sk
)
1825 struct rtable
*rt
= (struct rtable
*)__sk_dst_get(sk
);
1827 int want_rewrite
= sysctl_ip_dynaddr
&& sk
->state
== TCP_SYN_SENT
;
1832 /* Force route checking if want_rewrite.
1833 * The idea is good, the implementation is disguisting.
1834 * Well, if I made bind on this socket, you cannot randomly ovewrite
1835 * its source address. --ANK
1839 struct rtable
*new_rt
;
1840 __u32 old_saddr
= rt
->rt_src
;
1842 /* Query new route using another rt buffer */
1843 tmp
= ip_route_connect(&new_rt
, rt
->rt_dst
, 0,
1844 RT_TOS(sk
->protinfo
.af_inet
.tos
)|sk
->localroute
,
1847 /* Only useful if different source addrs */
1850 * Only useful if different source addrs
1852 if (new_rt
->rt_src
!= old_saddr
) {
1853 __sk_dst_set(sk
, &new_rt
->u
.dst
);
/* Same source address: the new route is redundant, drop the ref. */
1857 dst_release(&new_rt
->u
.dst
);
1860 if (rt
->u
.dst
.obsolete
) {
1862 err
= ip_route_output(&rt
, rt
->rt_dst
, rt
->rt_src
, rt
->key
.tos
|RTO_CONN
, rt
->key
.oif
);
/* Route lookup failed: surface the error to the socket owner. */
1865 sk
->error_report(sk
);
1868 __sk_dst_set(sk
, &rt
->u
.dst
);
1874 new_saddr
= rt
->rt_src
;
1876 /* Ouch!, this should not happen. */
1877 if (!sk
->saddr
|| !sk
->rcv_saddr
) {
1878 printk(KERN_WARNING
"tcp_v4_rebuild_header(): not valid sock addrs: "
1879 "saddr=%08lX rcv_saddr=%08lX\n",
1881 ntohl(sk
->rcv_saddr
));
1885 if (new_saddr
!= sk
->saddr
) {
1886 if (sysctl_ip_dynaddr
> 1) {
1887 printk(KERN_INFO
"tcp_v4_rebuild_header(): shifting sk->saddr "
1888 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1890 NIPQUAD(new_saddr
));
1893 sk
->saddr
= new_saddr
;
1894 sk
->rcv_saddr
= new_saddr
;
1896 /* XXX The only one ugly spot where we need to
1897 * XXX really change the sockets identity after
1898 * XXX it has entered the hashes. -DaveM
1900 * Besides that, it does not check for connetion
1901 * uniqueness. Wait for troubles.
1903 __tcp_v4_rehash(sk
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * v4_addr2sockaddr(): fill a sockaddr_in with the socket's peer
 * address and port (both already in network byte order on the sock).
 */
1909 static void v4_addr2sockaddr(struct sock
*sk
, struct sockaddr
* uaddr
)
1911 struct sockaddr_in
*sin
= (struct sockaddr_in
*) uaddr
;
1913 sin
->sin_family
= AF_INET
;
1914 sin
->sin_addr
.s_addr
= sk
->daddr
;
1915 sin
->sin_port
= sk
->dport
;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_func ops table wiring the af-independent TCP code to the IPv4
 * implementations in this file.  NOTE(review): several initializer
 * lines (original 1919-1920, 1926-1929) were lost in extraction.
 */
1918 struct tcp_func ipv4_specific
= {
1921 tcp_v4_rebuild_header
,
1922 tcp_v4_conn_request
,
1923 tcp_v4_syn_recv_sock
,
1924 tcp_v4_hash_connecting
,
1925 sizeof(struct iphdr
),
1930 sizeof(struct sockaddr_in
)
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1933 /* NOTE: A lot of things set to zero explicitly by call to
1934 * sk_alloc() so need not be done here.
/*
 * tcp_v4_init_sock(): per-socket TCP state initialisation at socket
 * creation: queues, retransmit timers, RTO defaults, congestion-control
 * seed values, and the IPv4 af_specific ops table.
 */
1936 static int tcp_v4_init_sock(struct sock
*sk
)
1938 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1940 skb_queue_head_init(&tp
->out_of_order_queue
);
1941 tcp_init_xmit_timers(sk
);
1943 tp
->rto
= TCP_TIMEOUT_INIT
;
1944 tp
->mdev
= TCP_TIMEOUT_INIT
;
1946 /* So many TCP implementations out there (incorrectly) count the
1947 * initial SYN frame in their delayed-ACK and congestion control
1948 * algorithms that we must have the following bandaid to talk
1949 * efficiently to them. -DaveM
1953 /* See draft-stevens-tcpca-spec-01 for discussion of the
1954 * initialization of these values.
1956 tp
->snd_cwnd_cnt
= 0;
1957 tp
->snd_ssthresh
= 0x7fffffff; /* Infinity */
1958 tp
->snd_cwnd_clamp
= ~0;
/* Conservative pre-negotiation MSS (RFC 1122 default of 536). */
1959 tp
->mss_cache
= 536;
1961 sk
->state
= TCP_CLOSE
;
1962 sk
->max_ack_backlog
= SOMAXCONN
;
1964 sk
->write_space
= tcp_write_space
;
1966 /* Init SYN queue. */
1969 sk
->tp_pinfo
.af_tcp
.af_specific
= &ipv4_specific
;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_destroy_sock(): teardown counterpart of tcp_v4_init_sock —
 * stop timers, purge pending queues, and release any bind bucket still
 * referenced by a never-connected socket.
 */
1974 static int tcp_v4_destroy_sock(struct sock
*sk
)
1976 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1978 tcp_clear_xmit_timers(sk
);
1980 /* Cleanup up the write buffer. */
1981 __skb_queue_purge(&sk
->write_queue
);
1983 /* Cleans up our, hopefuly empty, out_of_order_queue. */
1984 __skb_queue_purge(&tp
->out_of_order_queue
);
1986 /* Clean up a referenced TCP bind bucket, this only happens if a
1987 * port is allocated for a socket, but it never fully connects.
1989 if(sk
->prev
!= NULL
)
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1995 /* Proc filesystem TCP sock list dumping. */
/*
 * get_openreq(): format one SYN_RECV open_request as a /proc/net/tcp
 * row into tmpbuf; i is the running row index.
 */
1996 static void get_openreq(struct sock
*sk
, struct open_request
*req
, char *tmpbuf
, int i
)
1998 sprintf(tmpbuf
, "%4d: %08lX:%04X %08lX:%04X"
1999 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2001 (long unsigned int)req
->af
.v4_req
.loc_addr
,
2003 (long unsigned int)req
->af
.v4_req
.rmt_addr
,
2004 ntohs(req
->rmt_port
),
2006 0,0, /* could print option size, but that is af dependent. */
2007 1, /* timers active (only the expire timer) */
2008 (unsigned long)(req
->expires
- jiffies
),
/* uid of the owning listener, 0 when detached from a struct socket. */
2010 sk
->socket
? sk
->socket
->inode
->i_uid
: 0,
2011 0, /* non standard timer */
2012 0, /* open_requests have no inode */
2013 atomic_read(&sk
->refcnt
),
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * get_tcp_sock(): format one established/listening TCP socket as a
 * /proc/net/tcp row into tmpbuf.  Reports whichever of the retransmit
 * or generic socket timer fires soonest.
 */
2018 static void get_tcp_sock(struct sock
*sp
, char *tmpbuf
, int i
)
2020 unsigned int dest
, src
;
2022 int timer_active
, timer_active1
, timer_active2
;
2023 unsigned long timer_expires
;
2024 struct tcp_opt
*tp
= &sp
->tp_pinfo
.af_tcp
;
2027 src
= sp
->rcv_saddr
;
2028 destp
= ntohs(sp
->dport
);
2029 srcp
= ntohs(sp
->sport
);
/* A timer is "active" when it is linked into the timer list. */
2030 timer_active1
= tp
->retransmit_timer
.prev
!= NULL
;
2031 timer_active2
= sp
->timer
.prev
!= NULL
;
2033 timer_expires
= (unsigned) -1;
/* Pick the earlier of the two pending expiries.
 * NOTE(review): the timer_active assignments live in lines lost in
 * extraction — verify against the full original. */
2034 if (timer_active1
&& tp
->retransmit_timer
.expires
< timer_expires
) {
2036 timer_expires
= tp
->retransmit_timer
.expires
;
2038 if (timer_active2
&& sp
->timer
.expires
< timer_expires
) {
2040 timer_expires
= sp
->timer
.expires
;
2042 if(timer_active
== 0)
2043 timer_expires
= jiffies
;
2045 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
2046 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
2047 i
, src
, srcp
, dest
, destp
, sp
->state
,
/* tx_queue and rx_queue columns: unsent bytes and unread bytes. */
2048 tp
->write_seq
-tp
->snd_una
, tp
->rcv_nxt
-tp
->copied_seq
,
2049 timer_active
, timer_expires
-jiffies
,
2051 sp
->socket
? sp
->socket
->inode
->i_uid
: 0,
2053 sp
->socket
? sp
->socket
->inode
->i_ino
: 0,
2054 atomic_read(&sp
->refcnt
), sp
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * get_timewait_sock(): format one TIME_WAIT bucket as a /proc/net/tcp
 * row.  The remaining lifetime is derived from the distance between the
 * bucket's death slot and the current kill-cycle slot, wrapping modulo
 * TCP_TWKILL_SLOTS.
 */
2057 static void get_timewait_sock(struct tcp_tw_bucket
*tw
, char *tmpbuf
, int i
)
2059 unsigned int dest
, src
;
2064 src
= tw
->rcv_saddr
;
2065 destp
= ntohs(tw
->dport
);
2066 srcp
= ntohs(tw
->sport
);
2068 slot_dist
= tw
->death_slot
;
/* Wrap-around: the death slot may be behind the current slot. */
2069 if(slot_dist
> tcp_tw_death_row_slot
)
2070 slot_dist
= (TCP_TWKILL_SLOTS
- slot_dist
) + tcp_tw_death_row_slot
;
2072 slot_dist
= tcp_tw_death_row_slot
- slot_dist
;
2074 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
2075 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2076 i
, src
, srcp
, dest
, destp
, TCP_TIME_WAIT
, 0, 0,
2077 3, slot_dist
* TCP_TWKILL_PERIOD
, 0, 0, 0, 0,
2078 atomic_read(&tw
->refcnt
), tw
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_get_info(): /proc read handler producing the /proc/net/tcp table.
 * Walks, in order: the listening hash (including each listener's SYN
 * queue), the established hash, and the TIME_WAIT half of the ehash.
 * Honours the offset/length windowing protocol of old-style proc reads
 * via pos/begin/*start.
 */
2081 int tcp_get_info(char *buffer
, char **start
, off_t offset
, int length
, int dummy
)
2083 int len
= 0, num
= 0, i
;
2084 off_t begin
, pos
= 0;
2088 len
+= sprintf(buffer
, "%-127s\n",
2089 " sl local_address rem_address st tx_queue "
2090 "rx_queue tr tm->when retrnsmt uid timeout inode");
2094 /* First, walk listening socket table. */
2096 for(i
= 0; i
< TCP_LHTABLE_SIZE
; i
++) {
2097 struct sock
*sk
= tcp_listening_hash
[i
];
2099 for (sk
= tcp_listening_hash
[i
]; sk
; sk
= sk
->next
, num
++) {
2100 struct open_request
*req
;
2101 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
/* Skip IPv6-mapped entries in a shared table. */
2103 if (!TCP_INET_FAMILY(sk
->family
))
2107 if (pos
>= offset
) {
2108 get_tcp_sock(sk
, tmpbuf
, num
);
2109 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
/* Output window full: unlock and bail out early. */
2110 if (len
>= length
) {
2111 tcp_listen_unlock();
/* Also dump this listener's pending open_requests. */
2118 for (req
= tp
->syn_wait_queue
; req
; req
= req
->dl_next
, num
++) {
2121 if (!TCP_INET_FAMILY(req
->class->family
))
2127 get_openreq(sk
, req
, tmpbuf
, num
);
2128 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
2130 tcp_listen_unlock();
2138 tcp_listen_unlock();
2142 /* Next, walk established hash chain. */
2143 for (i
= 0; i
< tcp_ehash_size
; i
++) {
2144 struct tcp_ehash_bucket
*head
= &tcp_ehash
[i
];
2146 struct tcp_tw_bucket
*tw
;
2148 read_lock(&head
->lock
);
2149 for(sk
= head
->chain
; sk
; sk
= sk
->next
, num
++) {
2150 if (!TCP_INET_FAMILY(sk
->family
))
2155 get_tcp_sock(sk
, tmpbuf
, num
);
2156 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
2158 read_unlock(&head
->lock
);
/* TIME_WAIT buckets live in the upper half of the ehash table. */
2162 for (tw
= (struct tcp_tw_bucket
*)tcp_ehash
[i
+tcp_ehash_size
].chain
;
2164 tw
= (struct tcp_tw_bucket
*)tw
->next
, num
++) {
2165 if (!TCP_INET_FAMILY(tw
->family
))
2170 get_timewait_sock(tw
, tmpbuf
, num
);
2171 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
2173 read_unlock(&head
->lock
);
2177 read_unlock(&head
->lock
);
/* Translate absolute position into the caller's buffer window. */
2184 begin
= len
- (pos
- offset
);
2185 *start
= buffer
+ begin
;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * The TCP proto ops table registered with the INET socket layer
 * (positional initializer; the trailing-comment labels name each slot).
 * NOTE(review): a few initializer lines (original 2211, 2218-2219,
 * 2221-2223) were lost in extraction.
 */
2194 struct proto tcp_prot
= {
2195 tcp_close
, /* close */
2196 tcp_v4_connect
, /* connect */
2197 tcp_disconnect
, /* disconnect */
2198 tcp_accept
, /* accept */
2199 NULL
, /* retransmit */
2200 tcp_write_wakeup
, /* write_wakeup */
2201 tcp_read_wakeup
, /* read_wakeup */
2202 tcp_poll
, /* poll */
2203 tcp_ioctl
, /* ioctl */
2204 tcp_v4_init_sock
, /* init */
2205 tcp_v4_destroy_sock
, /* destroy */
2206 tcp_shutdown
, /* shutdown */
2207 tcp_setsockopt
, /* setsockopt */
2208 tcp_getsockopt
, /* getsockopt */
2209 tcp_v4_sendmsg
, /* sendmsg */
2210 tcp_recvmsg
, /* recvmsg */
2212 tcp_v4_do_rcv
, /* backlog_rcv */
2213 tcp_v4_hash
, /* hash */
2214 tcp_unhash
, /* unhash */
2215 tcp_v4_get_port
, /* get_port */
2216 128, /* max_header */
2217 0, /* retransmits */
2220 0 /* highestinuse */
2225 void __init
tcp_v4_init(struct net_proto_family
*ops
)
2229 tcp_inode
.i_mode
= S_IFSOCK
;
2230 tcp_inode
.i_sock
= 1;
2231 tcp_inode
.i_uid
= 0;
2232 tcp_inode
.i_gid
= 0;
2233 init_waitqueue_head(&tcp_inode
.i_wait
);
2234 init_waitqueue_head(&tcp_inode
.u
.socket_i
.wait
);
2236 tcp_socket
->inode
= &tcp_inode
;
2237 tcp_socket
->state
= SS_UNCONNECTED
;
2238 tcp_socket
->type
=SOCK_RAW
;
2240 if ((err
=ops
->create(tcp_socket
, IPPROTO_TCP
))<0)
2241 panic("Failed to create the TCP control socket.\n");
2242 tcp_socket
->sk
->allocation
=GFP_ATOMIC
;
2243 tcp_socket
->sk
->protinfo
.af_inet
.ttl
= MAXTTL
;
2245 /* Unhash it so that IP input processing does not even
2246 * see it, we do not wish this socket to see incoming
2249 tcp_socket
->sk
->prot
->unhash(tcp_socket
->sk
);