/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.222 2000/12/08 17:15:53 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 */
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/cache.h>
55 #include <linux/init.h>
60 #include <net/inet_common.h>
62 #include <linux/inet.h>
63 #include <linux/stddef.h>
64 #include <linux/ipsec.h>
66 extern int sysctl_ip_dynaddr
;
68 /* Check TCP sequence numbers in ICMP packets. */
69 #define ICMP_MIN_LENGTH 8
71 /* Socket used for sending RSTs */
72 static struct inode tcp_inode
;
73 static struct socket
*tcp_socket
=&tcp_inode
.u
.socket_i
;
/* Forward declaration: defined below, needed by early callers. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
79 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
81 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo
= {
86 __tcp_listening_hash
: { NULL
, },
87 __tcp_lhash_lock
: RW_LOCK_UNLOCKED
,
88 __tcp_lhash_users
: ATOMIC_INIT(0),
90 __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo
.__tcp_lhash_wait
),
91 __tcp_portalloc_lock
: SPIN_LOCK_UNLOCKED
/* This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * a wider range (e.g. 32768-61000).
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
/* Rover starts one below the range so the first allocation uses low. */
int tcp_port_rover = (1024 - 1);
102 static __inline__
int tcp_hashfn(__u32 laddr
, __u16 lport
,
103 __u32 faddr
, __u16 fport
)
105 int h
= ((laddr
^ lport
) ^ (faddr
^ fport
));
108 return h
& (tcp_ehash_size
- 1);
111 static __inline__
int tcp_sk_hashfn(struct sock
*sk
)
113 __u32 laddr
= sk
->rcv_saddr
;
114 __u16 lport
= sk
->num
;
115 __u32 faddr
= sk
->daddr
;
116 __u16 fport
= sk
->dport
;
118 return tcp_hashfn(laddr
, lport
, faddr
, fport
);
121 /* Allocate and initialize a new TCP local port bind bucket.
122 * The bindhash mutex for snum's hash chain must be held here.
124 struct tcp_bind_bucket
*tcp_bucket_create(struct tcp_bind_hashbucket
*head
,
127 struct tcp_bind_bucket
*tb
;
129 tb
= kmem_cache_alloc(tcp_bucket_cachep
, SLAB_ATOMIC
);
134 if((tb
->next
= head
->chain
) != NULL
)
135 tb
->next
->pprev
= &tb
->next
;
137 tb
->pprev
= &head
->chain
;
142 /* Caller must disable local BH processing. */
143 static __inline__
void __tcp_inherit_port(struct sock
*sk
, struct sock
*child
)
145 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(child
->num
)];
146 struct tcp_bind_bucket
*tb
;
148 spin_lock(&head
->lock
);
149 tb
= (struct tcp_bind_bucket
*)sk
->prev
;
150 if ((child
->bind_next
= tb
->owners
) != NULL
)
151 tb
->owners
->bind_pprev
= &child
->bind_next
;
153 child
->bind_pprev
= &tb
->owners
;
154 child
->prev
= (struct sock
*) tb
;
155 spin_unlock(&head
->lock
);
/* BH-safe wrapper around __tcp_inherit_port(). */
__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}
165 /* Obtain a reference to a local port for the given sock,
166 * if snum is zero it means select any available local port.
168 static int tcp_v4_get_port(struct sock
*sk
, unsigned short snum
)
170 struct tcp_bind_hashbucket
*head
;
171 struct tcp_bind_bucket
*tb
;
176 int low
= sysctl_local_port_range
[0];
177 int high
= sysctl_local_port_range
[1];
178 int remaining
= (high
- low
) + 1;
181 spin_lock(&tcp_portalloc_lock
);
182 rover
= tcp_port_rover
;
184 if ((rover
< low
) || (rover
> high
))
186 head
= &tcp_bhash
[tcp_bhashfn(rover
)];
187 spin_lock(&head
->lock
);
188 for (tb
= head
->chain
; tb
; tb
= tb
->next
)
189 if (tb
->port
== rover
)
193 spin_unlock(&head
->lock
);
194 } while (--remaining
> 0);
195 tcp_port_rover
= rover
;
196 spin_unlock(&tcp_portalloc_lock
);
198 /* Exhausted local port range during search? */
203 /* OK, here is the one we will use. HEAD is
204 * non-NULL and we hold it's mutex.
209 head
= &tcp_bhash
[tcp_bhashfn(snum
)];
210 spin_lock(&head
->lock
);
211 for (tb
= head
->chain
; tb
!= NULL
; tb
= tb
->next
)
212 if (tb
->port
== snum
)
215 if (tb
!= NULL
&& tb
->owners
!= NULL
) {
216 if (tb
->fastreuse
!= 0 && sk
->reuse
!= 0 && sk
->state
!= TCP_LISTEN
) {
219 struct sock
*sk2
= tb
->owners
;
220 int sk_reuse
= sk
->reuse
;
222 for( ; sk2
!= NULL
; sk2
= sk2
->bind_next
) {
224 sk
->bound_dev_if
== sk2
->bound_dev_if
) {
227 sk2
->state
== TCP_LISTEN
) {
228 if (!sk2
->rcv_saddr
||
230 (sk2
->rcv_saddr
== sk
->rcv_saddr
))
235 /* If we found a conflict, fail. */
243 (tb
= tcp_bucket_create(head
, snum
)) == NULL
)
245 if (tb
->owners
== NULL
) {
246 if (sk
->reuse
&& sk
->state
!= TCP_LISTEN
)
250 } else if (tb
->fastreuse
&&
251 ((sk
->reuse
== 0) || (sk
->state
== TCP_LISTEN
)))
255 if (sk
->prev
== NULL
) {
256 if ((sk
->bind_next
= tb
->owners
) != NULL
)
257 tb
->owners
->bind_pprev
= &sk
->bind_next
;
259 sk
->bind_pprev
= &tb
->owners
;
260 sk
->prev
= (struct sock
*) tb
;
262 BUG_TRAP(sk
->prev
== (struct sock
*) tb
);
267 spin_unlock(&head
->lock
);
273 /* Get rid of any references to a local port held by the
276 __inline__
void __tcp_put_port(struct sock
*sk
)
278 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(sk
->num
)];
279 struct tcp_bind_bucket
*tb
;
281 spin_lock(&head
->lock
);
282 tb
= (struct tcp_bind_bucket
*) sk
->prev
;
284 sk
->bind_next
->bind_pprev
= sk
->bind_pprev
;
285 *(sk
->bind_pprev
) = sk
->bind_next
;
288 if (tb
->owners
== NULL
) {
290 tb
->next
->pprev
= tb
->pprev
;
291 *(tb
->pprev
) = tb
->next
;
292 kmem_cache_free(tcp_bucket_cachep
, tb
);
294 spin_unlock(&head
->lock
);
/* BH-safe wrapper around __tcp_put_port(). */
void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}
304 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
305 * Look, when several writers sleep and reader wakes them up, all but one
306 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
307 * this, _but_ remember, it adds useless work on UP machines (wake up each
308 * exclusive lock release). It should be ifdefed really.
311 void tcp_listen_wlock(void)
313 write_lock(&tcp_lhash_lock
);
315 if (atomic_read(&tcp_lhash_users
)) {
316 DECLARE_WAITQUEUE(wait
, current
);
318 add_wait_queue_exclusive(&tcp_lhash_wait
, &wait
);
320 set_current_state(TASK_UNINTERRUPTIBLE
);
321 if (atomic_read(&tcp_lhash_users
) == 0)
323 write_unlock_bh(&tcp_lhash_lock
);
325 write_lock_bh(&tcp_lhash_lock
);
328 __set_current_state(TASK_RUNNING
);
329 remove_wait_queue(&tcp_lhash_wait
, &wait
);
333 static __inline__
void __tcp_v4_hash(struct sock
*sk
)
338 BUG_TRAP(sk
->pprev
==NULL
);
339 if(sk
->state
== TCP_LISTEN
) {
340 skp
= &tcp_listening_hash
[tcp_sk_listen_hashfn(sk
)];
341 lock
= &tcp_lhash_lock
;
344 skp
= &tcp_ehash
[(sk
->hashent
= tcp_sk_hashfn(sk
))].chain
;
345 lock
= &tcp_ehash
[sk
->hashent
].lock
;
348 if((sk
->next
= *skp
) != NULL
)
349 (*skp
)->pprev
= &sk
->next
;
352 sock_prot_inc_use(sk
->prot
);
354 if (sk
->state
== TCP_LISTEN
)
355 wake_up(&tcp_lhash_wait
);
358 static void tcp_v4_hash(struct sock
*sk
)
360 if (sk
->state
!= TCP_CLOSE
) {
367 void tcp_unhash(struct sock
*sk
)
371 if (sk
->state
== TCP_LISTEN
) {
374 lock
= &tcp_lhash_lock
;
376 struct tcp_ehash_bucket
*head
= &tcp_ehash
[sk
->hashent
];
378 write_lock_bh(&head
->lock
);
383 sk
->next
->pprev
= sk
->pprev
;
384 *sk
->pprev
= sk
->next
;
386 sock_prot_dec_use(sk
->prot
);
388 write_unlock_bh(lock
);
389 if (sk
->state
== TCP_LISTEN
)
390 wake_up(&tcp_lhash_wait
);
393 /* Don't inline this cruft. Here are some nice properties to
394 * exploit here. The BSD API does not allow a listening TCP
395 * to specify the remote port nor the remote address for the
396 * connection. So always assume those are both wildcarded
397 * during the search since they can never be otherwise.
399 static struct sock
*__tcp_v4_lookup_listener(struct sock
*sk
, u32 daddr
, unsigned short hnum
, int dif
)
401 struct sock
*result
= NULL
;
405 for(; sk
; sk
= sk
->next
) {
406 if(sk
->num
== hnum
) {
407 __u32 rcv_saddr
= sk
->rcv_saddr
;
411 if (rcv_saddr
!= daddr
)
415 if (sk
->bound_dev_if
) {
416 if (sk
->bound_dev_if
!= dif
)
422 if (score
> hiscore
) {
431 /* Optimize the common listener case. */
432 __inline__
struct sock
*tcp_v4_lookup_listener(u32 daddr
, unsigned short hnum
, int dif
)
436 read_lock(&tcp_lhash_lock
);
437 sk
= tcp_listening_hash
[tcp_lhashfn(hnum
)];
439 if (sk
->num
== hnum
&&
441 (!sk
->rcv_saddr
|| sk
->rcv_saddr
== daddr
) &&
444 sk
= __tcp_v4_lookup_listener(sk
, daddr
, hnum
, dif
);
450 read_unlock(&tcp_lhash_lock
);
454 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
455 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
457 * Local BH must be disabled here.
460 static inline struct sock
*__tcp_v4_lookup_established(u32 saddr
, u16 sport
,
461 u32 daddr
, u16 hnum
, int dif
)
463 struct tcp_ehash_bucket
*head
;
464 TCP_V4_ADDR_COOKIE(acookie
, saddr
, daddr
)
465 __u32 ports
= TCP_COMBINED_PORTS(sport
, hnum
);
469 /* Optimize here for direct hit, only listening connections can
470 * have wildcards anyways.
472 hash
= tcp_hashfn(daddr
, hnum
, saddr
, sport
);
473 head
= &tcp_ehash
[hash
];
474 read_lock(&head
->lock
);
475 for(sk
= head
->chain
; sk
; sk
= sk
->next
) {
476 if(TCP_IPV4_MATCH(sk
, acookie
, saddr
, daddr
, ports
, dif
))
477 goto hit
; /* You sunk my battleship! */
480 /* Must check for a TIME_WAIT'er before going to listener hash. */
481 for(sk
= (head
+ tcp_ehash_size
)->chain
; sk
; sk
= sk
->next
)
482 if(TCP_IPV4_MATCH(sk
, acookie
, saddr
, daddr
, ports
, dif
))
484 read_unlock(&head
->lock
);
490 read_unlock(&head
->lock
);
494 static inline struct sock
*__tcp_v4_lookup(u32 saddr
, u16 sport
,
495 u32 daddr
, u16 hnum
, int dif
)
499 sk
= __tcp_v4_lookup_established(saddr
, sport
, daddr
, hnum
, dif
);
504 return tcp_v4_lookup_listener(daddr
, hnum
, dif
);
507 __inline__
struct sock
*tcp_v4_lookup(u32 saddr
, u16 sport
, u32 daddr
, u16 dport
, int dif
)
512 sk
= __tcp_v4_lookup(saddr
, sport
, daddr
, ntohs(dport
), dif
);
518 static inline __u32
tcp_v4_init_sequence(struct sock
*sk
, struct sk_buff
*skb
)
520 return secure_tcp_sequence_number(skb
->nh
.iph
->daddr
,
526 static int tcp_v4_check_established(struct sock
*sk
)
528 u32 daddr
= sk
->rcv_saddr
;
529 u32 saddr
= sk
->daddr
;
530 int dif
= sk
->bound_dev_if
;
531 TCP_V4_ADDR_COOKIE(acookie
, saddr
, daddr
)
532 __u32 ports
= TCP_COMBINED_PORTS(sk
->dport
, sk
->num
);
533 int hash
= tcp_hashfn(daddr
, sk
->num
, saddr
, sk
->dport
);
534 struct tcp_ehash_bucket
*head
= &tcp_ehash
[hash
];
535 struct sock
*sk2
, **skp
;
536 struct tcp_tw_bucket
*tw
;
538 write_lock_bh(&head
->lock
);
540 /* Check TIME-WAIT sockets first. */
541 for(skp
= &(head
+ tcp_ehash_size
)->chain
; (sk2
=*skp
) != NULL
;
543 tw
= (struct tcp_tw_bucket
*)sk2
;
545 if(TCP_IPV4_MATCH(sk2
, acookie
, saddr
, daddr
, ports
, dif
)) {
546 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
548 /* With PAWS, it is safe from the viewpoint
549 of data integrity. Even without PAWS it
550 is safe provided sequence spaces do not
551 overlap i.e. at data rates <= 80Mbit/sec.
553 Actually, the idea is close to VJ's one,
554 only timestamp cache is held not per host,
555 but per port pair and TW bucket is used
558 If TW bucket has been already destroyed we
559 fall back to VJ's scheme and use initial
560 timestamp retrieved from peer table.
562 if (tw
->ts_recent_stamp
) {
563 if ((tp
->write_seq
= tw
->snd_nxt
+65535+2) == 0)
565 tp
->ts_recent
= tw
->ts_recent
;
566 tp
->ts_recent_stamp
= tw
->ts_recent_stamp
;
576 /* And established part... */
577 for(skp
= &head
->chain
; (sk2
=*skp
)!=NULL
; skp
= &sk2
->next
) {
578 if(TCP_IPV4_MATCH(sk2
, acookie
, saddr
, daddr
, ports
, dif
))
583 BUG_TRAP(sk
->pprev
==NULL
);
584 if ((sk
->next
= *skp
) != NULL
)
585 (*skp
)->pprev
= &sk
->next
;
590 sock_prot_inc_use(sk
->prot
);
591 write_unlock_bh(&head
->lock
);
594 /* Silly. Should hash-dance instead... */
596 tcp_tw_deschedule(tw
);
597 tcp_timewait_kill(tw
);
598 NET_INC_STATS_BH(TimeWaitRecycled
);
607 write_unlock_bh(&head
->lock
);
608 return -EADDRNOTAVAIL
;
611 /* Hash SYN-SENT socket to established hash table after
612 * checking that it is unique. Note, that without kernel lock
613 * we MUST make these two operations atomically.
615 * Optimization: if it is bound and tcp_bind_bucket has the only
616 * owner (us), we need not to scan established bucket.
619 int tcp_v4_hash_connecting(struct sock
*sk
)
621 unsigned short snum
= sk
->num
;
622 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(snum
)];
623 struct tcp_bind_bucket
*tb
= (struct tcp_bind_bucket
*)sk
->prev
;
625 spin_lock_bh(&head
->lock
);
626 if (tb
->owners
== sk
&& sk
->bind_next
== NULL
) {
628 spin_unlock_bh(&head
->lock
);
631 spin_unlock_bh(&head
->lock
);
633 /* No definite answer... Walk to established hash table */
634 return tcp_v4_check_established(sk
);
638 /* This will initiate an outgoing connection. */
639 int tcp_v4_connect(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
641 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
642 struct sockaddr_in
*usin
= (struct sockaddr_in
*) uaddr
;
643 struct sk_buff
*buff
;
649 if (addr_len
< sizeof(struct sockaddr_in
))
652 if (usin
->sin_family
!= AF_INET
)
653 return(-EAFNOSUPPORT
);
655 nexthop
= daddr
= usin
->sin_addr
.s_addr
;
656 if (sk
->protinfo
.af_inet
.opt
&& sk
->protinfo
.af_inet
.opt
->srr
) {
659 nexthop
= sk
->protinfo
.af_inet
.opt
->faddr
;
662 tmp
= ip_route_connect(&rt
, nexthop
, sk
->saddr
,
663 RT_TOS(sk
->protinfo
.af_inet
.tos
)|RTO_CONN
|sk
->localroute
, sk
->bound_dev_if
);
667 if (rt
->rt_flags
&(RTCF_MULTICAST
|RTCF_BROADCAST
)) {
672 __sk_dst_set(sk
, &rt
->u
.dst
);
674 if (!sk
->protinfo
.af_inet
.opt
|| !sk
->protinfo
.af_inet
.opt
->srr
)
678 buff
= alloc_skb(MAX_TCP_HEADER
+ 15, GFP_KERNEL
);
684 sk
->saddr
= rt
->rt_src
;
685 sk
->rcv_saddr
= sk
->saddr
;
687 if (tp
->ts_recent_stamp
&& sk
->daddr
!= daddr
) {
688 /* Reset inherited state */
690 tp
->ts_recent_stamp
= 0;
694 if (sysctl_tcp_tw_recycle
&&
695 !tp
->ts_recent_stamp
&&
696 rt
->rt_dst
== daddr
) {
697 struct inet_peer
*peer
= rt_get_peer(rt
);
699 /* VJ's idea. We save last timestamp seen from
700 * the destination in peer table, when entering state TIME-WAIT
701 * and initialize ts_recent from it, when trying new connection.
704 if (peer
&& peer
->tcp_ts_stamp
+ TCP_PAWS_MSL
>= xtime
.tv_sec
) {
705 tp
->ts_recent_stamp
= peer
->tcp_ts_stamp
;
706 tp
->ts_recent
= peer
->tcp_ts
;
710 sk
->dport
= usin
->sin_port
;
714 tp
->write_seq
= secure_tcp_sequence_number(sk
->saddr
, sk
->daddr
,
715 sk
->sport
, usin
->sin_port
);
717 tp
->ext_header_len
= 0;
718 if (sk
->protinfo
.af_inet
.opt
)
719 tp
->ext_header_len
= sk
->protinfo
.af_inet
.opt
->optlen
;
723 err
= tcp_connect(sk
, buff
);
733 static __inline__
int tcp_v4_iif(struct sk_buff
*skb
)
735 return ((struct rtable
*)skb
->dst
)->rt_iif
;
738 static __inline__
unsigned tcp_v4_synq_hash(u32 raddr
, u16 rport
)
740 unsigned h
= raddr
^ rport
;
743 return h
&(TCP_SYNQ_HSIZE
-1);
746 static struct open_request
*tcp_v4_search_req(struct tcp_opt
*tp
,
749 struct open_request
***prevp
)
751 struct tcp_listen_opt
*lopt
= tp
->listen_opt
;
752 struct open_request
*req
, **prev
;
753 __u16 rport
= th
->source
;
754 __u32 raddr
= iph
->saddr
;
756 for (prev
= &lopt
->syn_table
[tcp_v4_synq_hash(raddr
, rport
)];
757 (req
= *prev
) != NULL
;
758 prev
= &req
->dl_next
) {
759 if (req
->rmt_port
== rport
&&
760 req
->af
.v4_req
.rmt_addr
== raddr
&&
761 req
->af
.v4_req
.loc_addr
== iph
->daddr
&&
762 TCP_INET_FAMILY(req
->class->family
)) {
763 BUG_TRAP(req
->sk
== NULL
);
772 static void tcp_v4_synq_add(struct sock
*sk
, struct open_request
*req
)
774 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
775 struct tcp_listen_opt
*lopt
= tp
->listen_opt
;
776 unsigned h
= tcp_v4_synq_hash(req
->af
.v4_req
.rmt_addr
, req
->rmt_port
);
778 req
->expires
= jiffies
+ TCP_TIMEOUT_INIT
;
782 req
->dl_next
= lopt
->syn_table
[h
];
784 write_lock(&tp
->syn_wait_lock
);
785 lopt
->syn_table
[h
] = req
;
786 write_unlock(&tp
->syn_wait_lock
);
793 * This routine does path mtu discovery as defined in RFC1191.
795 static inline void do_pmtu_discovery(struct sock
*sk
, struct iphdr
*ip
, unsigned mtu
)
797 struct dst_entry
*dst
;
798 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
800 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
801 * send out by Linux are always <576bytes so they should go through
804 if (sk
->state
== TCP_LISTEN
)
807 /* We don't check in the destentry if pmtu discovery is forbidden
808 * on this route. We just assume that no packet_to_big packets
809 * are send back when pmtu discovery is not active.
810 * There is a small race when the user changes this flag in the
811 * route, but I think that's acceptable.
813 if ((dst
= __sk_dst_check(sk
, 0)) == NULL
)
816 ip_rt_update_pmtu(dst
, mtu
);
818 /* Something is about to be wrong... Remember soft error
819 * for the case, if this connection will not able to recover.
821 if (mtu
< dst
->pmtu
&& ip_dont_fragment(sk
, dst
))
822 sk
->err_soft
= EMSGSIZE
;
824 if (sk
->protinfo
.af_inet
.pmtudisc
!= IP_PMTUDISC_DONT
&&
825 tp
->pmtu_cookie
> dst
->pmtu
) {
826 tcp_sync_mss(sk
, dst
->pmtu
);
828 /* Resend the TCP packet because it's
829 * clear that the old packet has been
830 * dropped. This is the new "fast" path mtu
833 tcp_simple_retransmit(sk
);
834 } /* else let the usual retransmit timer handle it */
838 * This routine is called by the ICMP module when it gets some
839 * sort of error condition. If err < 0 then the socket should
840 * be closed and the error returned to the user. If err > 0
841 * it's just the icmp type << 8 | icmp code. After adjustment
842 * header points to the first 8 bytes of the tcp header. We need
843 * to find the appropriate port.
845 * The locking strategy used here is very "optimistic". When
846 * someone else accesses the socket the ICMP is just dropped
847 * and for some paths there is no check at all.
848 * A more general error queue to queue errors for later handling
849 * is probably better.
853 void tcp_v4_err(struct sk_buff
*skb
, unsigned char *dp
, int len
)
855 struct iphdr
*iph
= (struct iphdr
*)dp
;
858 int type
= skb
->h
.icmph
->type
;
859 int code
= skb
->h
.icmph
->code
;
860 #if ICMP_MIN_LENGTH < 14
869 if (len
< (iph
->ihl
<< 2) + ICMP_MIN_LENGTH
) {
870 ICMP_INC_STATS_BH(IcmpInErrors
);
873 #if ICMP_MIN_LENGTH < 14
874 if (len
< (iph
->ihl
<< 2) + 14)
878 th
= (struct tcphdr
*)(dp
+(iph
->ihl
<<2));
880 sk
= tcp_v4_lookup(iph
->daddr
, th
->dest
, iph
->saddr
, th
->source
, tcp_v4_iif(skb
));
882 ICMP_INC_STATS_BH(IcmpInErrors
);
885 if (sk
->state
== TCP_TIME_WAIT
) {
886 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
891 /* If too many ICMPs get dropped on busy
892 * servers this needs to be solved differently.
894 if (sk
->lock
.users
!= 0)
895 NET_INC_STATS_BH(LockDroppedIcmps
);
897 if (sk
->state
== TCP_CLOSE
)
900 tp
= &sk
->tp_pinfo
.af_tcp
;
901 seq
= ntohl(th
->seq
);
902 if (sk
->state
!= TCP_LISTEN
&& !between(seq
, tp
->snd_una
, tp
->snd_nxt
)) {
903 NET_INC_STATS(OutOfWindowIcmps
);
908 case ICMP_SOURCE_QUENCH
:
909 /* This is deprecated, but if someone generated it,
910 * we have no reasons to ignore it.
912 if (sk
->lock
.users
== 0)
915 case ICMP_PARAMETERPROB
:
918 case ICMP_DEST_UNREACH
:
919 if (code
> NR_ICMP_UNREACH
)
922 if (code
== ICMP_FRAG_NEEDED
) { /* PMTU discovery (RFC1191) */
923 if (sk
->lock
.users
== 0)
924 do_pmtu_discovery(sk
, iph
, ntohs(skb
->h
.icmph
->un
.frag
.mtu
));
928 err
= icmp_err_convert
[code
].errno
;
930 case ICMP_TIME_EXCEEDED
:
938 struct open_request
*req
, **prev
;
940 if (sk
->lock
.users
!= 0)
943 /* The final ACK of the handshake should be already
944 * handled in the new socket context, not here.
945 * Strictly speaking - an ICMP error for the final
946 * ACK should set the opening flag, but that is too
947 * complicated right now.
949 if (!no_flags
&& !th
->syn
&& !th
->ack
)
952 req
= tcp_v4_search_req(tp
, iph
, th
, &prev
);
956 /* ICMPs are not backlogged, hence we cannot get
957 an established socket here.
959 BUG_TRAP(req
->sk
== NULL
);
961 if (seq
!= req
->snt_isn
) {
962 NET_INC_STATS_BH(OutOfWindowIcmps
);
967 * Still in SYN_RECV, just remove it silently.
968 * There is no good way to pass the error to the newly
969 * created socket, and POSIX does not want network
970 * errors returned from accept().
972 tcp_synq_drop(sk
, req
, prev
);
976 case TCP_SYN_RECV
: /* Cannot happen.
977 It can f.e. if SYNs crossed.
979 if (!no_flags
&& !th
->syn
)
981 if (sk
->lock
.users
== 0) {
982 TCP_INC_STATS_BH(TcpAttemptFails
);
985 sk
->error_report(sk
);
994 /* If we've already connected we will keep trying
995 * until we time out, or the user gives up.
997 * rfc1122 4.2.3.9 allows to consider as hard errors
998 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
999 * but it is obsoleted by pmtu discovery).
1001 * Note, that in modern internet, where routing is unreliable
1002 * and in each dark corner broken firewalls sit, sending random
1003 * errors ordered by their masters even this two messages finally lose
1004 * their original sense (even Linux sends invalid PORT_UNREACHs)
1006 * Now we are in compliance with RFCs.
1010 if (sk
->lock
.users
== 0 && sk
->protinfo
.af_inet
.recverr
) {
1012 sk
->error_report(sk
);
1013 } else { /* Only an error on timeout */
1022 /* This routine computes an IPv4 TCP checksum. */
1023 void tcp_v4_send_check(struct sock
*sk
, struct tcphdr
*th
, int len
,
1024 struct sk_buff
*skb
)
1026 th
->check
= tcp_v4_check(th
, len
, sk
->saddr
, sk
->daddr
,
1027 csum_partial((char *)th
, th
->doff
<<2, skb
->csum
));
1031 * This routine will send an RST to the other tcp.
1033 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1035 * Answer: if a packet caused RST, it is not for a socket
1036 * existing in our system, if it is matched to a socket,
1037 * it is just duplicate segment or bug in other side's TCP.
1038 * So that we build reply only basing on parameters
1039 * arrived with segment.
1040 * Exception: precedence violation. We do not implement it in any case.
1043 static void tcp_v4_send_reset(struct sk_buff
*skb
)
1045 struct tcphdr
*th
= skb
->h
.th
;
1047 struct ip_reply_arg arg
;
1049 /* Never send a reset in response to a reset. */
1053 if (((struct rtable
*)skb
->dst
)->rt_type
!= RTN_LOCAL
)
1056 /* Swap the send and the receive. */
1057 memset(&rth
, 0, sizeof(struct tcphdr
));
1058 rth
.dest
= th
->source
;
1059 rth
.source
= th
->dest
;
1060 rth
.doff
= sizeof(struct tcphdr
)/4;
1064 rth
.seq
= th
->ack_seq
;
1067 rth
.ack_seq
= htonl(ntohl(th
->seq
) + th
->syn
+ th
->fin
1068 + skb
->len
- (th
->doff
<<2));
1071 memset(&arg
, 0, sizeof arg
);
1072 arg
.iov
[0].iov_base
= (unsigned char *)&rth
;
1073 arg
.iov
[0].iov_len
= sizeof rth
;
1074 arg
.csum
= csum_tcpudp_nofold(skb
->nh
.iph
->daddr
,
1075 skb
->nh
.iph
->saddr
, /*XXX*/
1076 sizeof(struct tcphdr
),
1080 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
1082 ip_send_reply(tcp_socket
->sk
, skb
, &arg
, sizeof rth
);
1084 TCP_INC_STATS_BH(TcpOutSegs
);
1085 TCP_INC_STATS_BH(TcpOutRsts
);
1088 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1089 outside socket context is ugly, certainly. What can I do?
1092 static void tcp_v4_send_ack(struct sk_buff
*skb
, u32 seq
, u32 ack
, u32 win
, u32 ts
)
1094 struct tcphdr
*th
= skb
->h
.th
;
1099 struct ip_reply_arg arg
;
1101 memset(&rep
.th
, 0, sizeof(struct tcphdr
));
1102 memset(&arg
, 0, sizeof arg
);
1104 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
1105 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
1108 rep
.tsopt
[0] = __constant_htonl((TCPOPT_NOP
<< 24) |
1109 (TCPOPT_NOP
<< 16) |
1110 (TCPOPT_TIMESTAMP
<< 8) |
1112 rep
.tsopt
[1] = htonl(tcp_time_stamp
);
1113 rep
.tsopt
[2] = htonl(ts
);
1114 arg
.iov
[0].iov_len
= sizeof(rep
);
1117 /* Swap the send and the receive. */
1118 rep
.th
.dest
= th
->source
;
1119 rep
.th
.source
= th
->dest
;
1120 rep
.th
.doff
= arg
.iov
[0].iov_len
/4;
1121 rep
.th
.seq
= htonl(seq
);
1122 rep
.th
.ack_seq
= htonl(ack
);
1124 rep
.th
.window
= htons(win
);
1126 arg
.csum
= csum_tcpudp_nofold(skb
->nh
.iph
->daddr
,
1127 skb
->nh
.iph
->saddr
, /*XXX*/
1131 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
1133 ip_send_reply(tcp_socket
->sk
, skb
, &arg
, arg
.iov
[0].iov_len
);
1135 TCP_INC_STATS_BH(TcpOutSegs
);
1138 static void tcp_v4_timewait_ack(struct sock
*sk
, struct sk_buff
*skb
)
1140 struct tcp_tw_bucket
*tw
= (struct tcp_tw_bucket
*)sk
;
1142 tcp_v4_send_ack(skb
, tw
->snd_nxt
, tw
->rcv_nxt
,
1143 tw
->rcv_wnd
>>tw
->rcv_wscale
, tw
->ts_recent
);
1148 static void tcp_v4_or_send_ack(struct sk_buff
*skb
, struct open_request
*req
)
1150 tcp_v4_send_ack(skb
, req
->snt_isn
+1, req
->rcv_isn
+1, req
->rcv_wnd
,
1154 static struct dst_entry
* tcp_v4_route_req(struct sock
*sk
, struct open_request
*req
)
1157 struct ip_options
*opt
;
1159 opt
= req
->af
.v4_req
.opt
;
1160 if(ip_route_output(&rt
, ((opt
&& opt
->srr
) ?
1162 req
->af
.v4_req
.rmt_addr
),
1163 req
->af
.v4_req
.loc_addr
,
1164 RT_TOS(sk
->protinfo
.af_inet
.tos
) | RTO_CONN
| sk
->localroute
,
1165 sk
->bound_dev_if
)) {
1166 IP_INC_STATS_BH(IpOutNoRoutes
);
1169 if (opt
&& opt
->is_strictroute
&& rt
->rt_dst
!= rt
->rt_gateway
) {
1171 IP_INC_STATS_BH(IpOutNoRoutes
);
1178 * Send a SYN-ACK after having received an ACK.
1179 * This still operates on a open_request only, not on a big
1182 static int tcp_v4_send_synack(struct sock
*sk
, struct open_request
*req
,
1183 struct dst_entry
*dst
)
1186 struct sk_buff
* skb
;
1188 /* First, grab a route. */
1190 (dst
= tcp_v4_route_req(sk
, req
)) == NULL
)
1193 skb
= tcp_make_synack(sk
, dst
, req
);
1196 struct tcphdr
*th
= skb
->h
.th
;
1198 th
->check
= tcp_v4_check(th
, skb
->len
,
1199 req
->af
.v4_req
.loc_addr
, req
->af
.v4_req
.rmt_addr
,
1200 csum_partial((char *)th
, skb
->len
, skb
->csum
));
1202 err
= ip_build_and_send_pkt(skb
, sk
, req
->af
.v4_req
.loc_addr
,
1203 req
->af
.v4_req
.rmt_addr
, req
->af
.v4_req
.opt
);
1204 if (err
== NET_XMIT_CN
)
1214 * IPv4 open_request destructor.
1216 static void tcp_v4_or_free(struct open_request
*req
)
1218 if (req
->af
.v4_req
.opt
)
1219 kfree(req
->af
.v4_req
.opt
);
1222 static inline void syn_flood_warning(struct sk_buff
*skb
)
1224 static unsigned long warntime
;
1226 if (jiffies
- warntime
> HZ
*60) {
1229 "possible SYN flooding on port %d. Sending cookies.\n",
1230 ntohs(skb
->h
.th
->dest
));
1235 * Save and compile IPv4 options into the open_request if needed.
1237 static inline struct ip_options
*
1238 tcp_v4_save_options(struct sock
*sk
, struct sk_buff
*skb
)
1240 struct ip_options
*opt
= &(IPCB(skb
)->opt
);
1241 struct ip_options
*dopt
= NULL
;
1243 if (opt
&& opt
->optlen
) {
1244 int opt_size
= optlength(opt
);
1245 dopt
= kmalloc(opt_size
, GFP_ATOMIC
);
1247 if (ip_options_echo(dopt
, skb
)) {
/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
 * It would be better to replace it with a global counter for all sockets
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default. Experiments with real servers show, that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Further increasing requires to change hash table size.
 */
int sysctl_max_syn_backlog = 256;
1271 struct or_calltable or_ipv4
= {
1279 int tcp_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
1282 struct open_request
*req
;
1283 __u32 saddr
= skb
->nh
.iph
->saddr
;
1284 __u32 daddr
= skb
->nh
.iph
->daddr
;
1285 __u32 isn
= TCP_SKB_CB(skb
)->when
;
1286 struct dst_entry
*dst
= NULL
;
1287 #ifdef CONFIG_SYN_COOKIES
1288 int want_cookie
= 0;
1290 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1293 /* Never answer to SYNs send to broadcast or multicast */
1294 if (((struct rtable
*)skb
->dst
)->rt_flags
&
1295 (RTCF_BROADCAST
|RTCF_MULTICAST
))
1298 /* TW buckets are converted to open requests without
1299 * limitations, they conserve resources and peer is
1300 * evidently real one.
1302 if (tcp_synq_is_full(sk
) && !isn
) {
1303 #ifdef CONFIG_SYN_COOKIES
1304 if (sysctl_tcp_syncookies
) {
1311 /* Accept backlog is full. If we have already queued enough
1312 * of warm entries in syn queue, drop request. It is better than
1313 * clogging syn queue with openreqs with exponentially increasing
1316 if (tcp_acceptq_is_full(sk
) && tcp_synq_young(sk
) > 1)
1319 req
= tcp_openreq_alloc();
1323 tcp_clear_options(&tp
);
1325 tp
.user_mss
= sk
->tp_pinfo
.af_tcp
.user_mss
;
1327 tcp_parse_options(skb
, &tp
, 0);
1330 tcp_clear_options(&tp
);
1334 if (tp
.saw_tstamp
&& tp
.rcv_tsval
== 0) {
1335 /* Some OSes (unknown ones, but I see them on web server, which
1336 * contains information interesting only for windows'
1337 * users) do not send their stamp in SYN. It is easy case.
1338 * We simply do not advertise TS support.
1343 tp
.tstamp_ok
= tp
.saw_tstamp
;
1345 tcp_openreq_init(req
, &tp
, skb
);
1347 req
->af
.v4_req
.loc_addr
= daddr
;
1348 req
->af
.v4_req
.rmt_addr
= saddr
;
1349 req
->af
.v4_req
.opt
= tcp_v4_save_options(sk
, skb
);
1350 req
->class = &or_ipv4
;
1352 TCP_ECN_create_request(req
, skb
->h
.th
);
1355 #ifdef CONFIG_SYN_COOKIES
1356 syn_flood_warning(skb
);
1358 isn
= cookie_v4_init_sequence(sk
, skb
, &req
->mss
);
1359 } else if (isn
== 0) {
1360 struct inet_peer
*peer
= NULL
;
1362 /* VJ's idea. We save last timestamp seen
1363 * from the destination in peer table, when entering
1364 * state TIME-WAIT, and check against it before
1365 * accepting new connection request.
1367 * If "isn" is not zero, this request hit alive
1368 * timewait bucket, so that all the necessary checks
1369 * are made in the function processing timewait state.
1371 if (tp
.saw_tstamp
&&
1372 sysctl_tcp_tw_recycle
&&
1373 (dst
= tcp_v4_route_req(sk
, req
)) != NULL
&&
1374 (peer
= rt_get_peer((struct rtable
*)dst
)) != NULL
&&
1375 peer
->v4daddr
== saddr
) {
1376 if (xtime
.tv_sec
< peer
->tcp_ts_stamp
+ TCP_PAWS_MSL
&&
1377 (s32
)(peer
->tcp_ts
- req
->ts_recent
) > TCP_PAWS_WINDOW
) {
1378 NET_INC_STATS_BH(PAWSPassiveRejected
);
1383 /* Kill the following clause, if you dislike this way. */
1384 else if (!sysctl_tcp_syncookies
&&
1385 (sysctl_max_syn_backlog
- tcp_synq_len(sk
)
1386 < (sysctl_max_syn_backlog
>>2)) &&
1387 (!peer
|| !peer
->tcp_ts_stamp
) &&
1388 (!dst
|| !dst
->rtt
)) {
1389 /* Without syncookies last quarter of
1390 * backlog is filled with destinations, proven to be alive.
1391 * It means that we continue to communicate
1392 * to destinations, already remembered
1393 * to the moment of synflood.
1395 NETDEBUG(if (net_ratelimit()) \
1396 printk(KERN_DEBUG
"TCP: drop open request from %u.%u.%u.%u/%u\n", \
1397 NIPQUAD(saddr
), ntohs(skb
->h
.th
->source
)));
1398 TCP_INC_STATS_BH(TcpAttemptFails
);
1403 isn
= tcp_v4_init_sequence(sk
, skb
);
1407 if (tcp_v4_send_synack(sk
, req
, dst
))
1411 tcp_openreq_free(req
);
1413 tcp_v4_synq_add(sk
, req
);
1418 tcp_openreq_free(req
);
1420 TCP_INC_STATS_BH(TcpAttemptFails
);
/*
 * tcp_v4_syn_recv_sock(): called when the 3WHS completes; clones the listening
 * socket's state into a new child socket bound to the addresses saved in the
 * open_request, hashes it, and inherits the listener's port.
 *
 * NOTE(review): the extraction dropped several original lines here (the
 * numbering jumps 1437->1440, 1444->1447, 1467->1471): the accept-queue-full /
 * route-failure bailouts and the exit labels are missing. Do not treat this
 * fragment as compilable.
 */
1426 * The three way handshake has completed - we got a valid synack -
1427 * now create the new socket.
1429 struct sock
* tcp_v4_syn_recv_sock(struct sock
*sk
, struct sk_buff
*skb
,
1430 struct open_request
*req
,
1431 struct dst_entry
*dst
)
1433 struct tcp_opt
*newtp
;
/* Refuse if the listener's accept queue is already full (overflow path below). */
1436 if (tcp_acceptq_is_full(sk
))
1440 (dst
= tcp_v4_route_req(sk
, req
)) == NULL
)
1443 newsk
= tcp_create_openreq_child(sk
, req
, skb
);
1447 newsk
->dst_cache
= dst
;
/* Copy the IPv4 endpoint identity saved at SYN time into the child socket. */
1449 newtp
= &(newsk
->tp_pinfo
.af_tcp
);
1450 newsk
->daddr
= req
->af
.v4_req
.rmt_addr
;
1451 newsk
->saddr
= req
->af
.v4_req
.loc_addr
;
1452 newsk
->rcv_saddr
= req
->af
.v4_req
.loc_addr
;
/* Ownership of the saved IP options transfers to the child; clear the req side
 * so freeing the open_request does not free them too. */
1453 newsk
->protinfo
.af_inet
.opt
= req
->af
.v4_req
.opt
;
1454 req
->af
.v4_req
.opt
= NULL
;
1455 newsk
->protinfo
.af_inet
.mc_index
= tcp_v4_iif(skb
);
1456 newsk
->protinfo
.af_inet
.mc_ttl
= skb
->nh
.iph
->ttl
;
/* IP options consume header space, which shrinks the usable MSS below. */
1457 newtp
->ext_header_len
= 0;
1458 if (newsk
->protinfo
.af_inet
.opt
)
1459 newtp
->ext_header_len
= newsk
->protinfo
.af_inet
.opt
->optlen
;
1461 tcp_sync_mss(newsk
, dst
->pmtu
);
1462 newtp
->advmss
= dst
->advmss
;
1463 tcp_initialize_rcv_mss(newsk
);
/* Insert the child into the established hash and share the listener's port. */
1465 __tcp_v4_hash(newsk
);
1466 __tcp_inherit_port(sk
, newsk
);
/* Error-path statistics (labels themselves lost in extraction). */
1471 NET_INC_STATS_BH(ListenOverflows
);
1473 NET_INC_STATS_BH(ListenDrops
);
/*
 * tcp_v4_hnd_req(): for a segment arriving on a listening socket, decide which
 * socket should handle it: a pending open_request, an already-established
 * child found in the ehash, or (with syncookies) a cookie-validated ACK.
 *
 * NOTE(review): extraction dropped lines (numbering jumps 1488->1490,
 * 1491->1497, 1501->1505): part of the established-lookup arguments and the
 * branch structure is missing here.
 */
1478 static struct sock
*tcp_v4_hnd_req(struct sock
*sk
,struct sk_buff
*skb
)
1480 struct open_request
*req
, **prev
;
1481 struct tcphdr
*th
= skb
->h
.th
;
1482 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1485 /* Find possible connection requests. */
1486 req
= tcp_v4_search_req(tp
, skb
->nh
.iph
, th
, &prev
);
1488 return tcp_check_req(sk
, skb
, req
, prev
);
/* No matching open_request: maybe a fully established child already exists. */
1490 nsk
= __tcp_v4_lookup_established(skb
->nh
.iph
->saddr
,
1497 if (nsk
->state
!= TCP_TIME_WAIT
) {
/* Lookup returned a TIME_WAIT bucket; drop the reference it holds. */
1501 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
1505 #ifdef CONFIG_SYN_COOKIES
/* Bare ACK (no SYN/RST) may carry a syncookie; validate it. */
1506 if (!th
->rst
&& !th
->syn
&& th
->ack
)
1507 sk
= cookie_v4_check(sk
, skb
, &(IPCB(skb
)->opt
));
/*
 * tcp_v4_checksum_init(): verify or prepare the TCP checksum for an incoming
 * skb. Hardware-computed sums are verified against the pseudo-header; short
 * packets (<= 76 bytes) are fully verified in software; otherwise the
 * pseudo-header partial sum is stored in skb->csum for later completion.
 *
 * NOTE(review): the return statements and closing braces were lost in
 * extraction (numbering jumps 1517->1520, 1525->1527, 1530->end).
 */
1512 static int tcp_v4_checksum_init(struct sk_buff
*skb
)
1514 if (skb
->ip_summed
== CHECKSUM_HW
) {
/* Hardware already summed the payload; fold in the pseudo-header to verify. */
1515 if (tcp_v4_check(skb
->h
.th
,skb
->len
,skb
->nh
.iph
->saddr
,
1516 skb
->nh
.iph
->daddr
,skb
->csum
)) {
1517 NETDEBUG(printk(KERN_DEBUG
"hw tcp v4 csum failed\n"));
1520 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
/* Small packet: cheaper to verify the whole checksum right now. */
1522 if (skb
->len
<= 76) {
1523 if (tcp_v4_check(skb
->h
.th
,skb
->len
,skb
->nh
.iph
->saddr
,
1525 csum_partial((char *)skb
->h
.th
, skb
->len
, 0)))
1527 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
/* Larger packet: precompute pseudo-header sum, defer full check to copy time. */
1529 skb
->csum
= ~tcp_v4_check(skb
->h
.th
,skb
->len
,skb
->nh
.iph
->saddr
,
1530 skb
->nh
.iph
->daddr
,0);
/*
 * tcp_v4_do_rcv(): per-socket receive dispatch, called with the socket
 * spinlock held. ESTABLISHED sockets take the fast path; LISTEN sockets go
 * through tcp_v4_hnd_req(); everything else goes to the generic state machine.
 *
 * NOTE(review): extraction dropped the `reset:`/`discard:`/`csum_err:` labels
 * and several gotos (numbering jumps 1559->1563, 1567->1572, 1581->1585).
 */
1537 /* The socket must have it's spinlock held when we get
1540 * We have a potential double-lock case here, so even when
1541 * doing backlog processing we use the BH locking scheme.
1542 * This is because we cannot sleep with the original spinlock
1545 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1547 #ifdef CONFIG_FILTER
/* Socket filter (BPF) may reject the segment before any TCP processing. */
1548 struct sk_filter
*filter
= sk
->filter
;
1549 if (filter
&& sk_filter(skb
, filter
))
1551 #endif /* CONFIG_FILTER */
1553 IP_INC_STATS_BH(IpInDelivers
);
1555 if (sk
->state
== TCP_ESTABLISHED
) { /* Fast path */
1556 TCP_CHECK_TIMER(sk
);
1557 if (tcp_rcv_established(sk
, skb
, skb
->h
.th
, skb
->len
))
1559 TCP_CHECK_TIMER(sk
);
/* Slow path: validate length/checksum before the state machine sees it. */
1563 if (skb
->len
< (skb
->h
.th
->doff
<<2) || tcp_checksum_complete(skb
))
1566 if (sk
->state
== TCP_LISTEN
) {
1567 struct sock
*nsk
= tcp_v4_hnd_req(sk
, skb
);
/* Segment belongs to a child socket; let it process under the child's lock. */
1572 if (tcp_child_process(sk
, nsk
, skb
))
1578 TCP_CHECK_TIMER(sk
);
1579 if (tcp_rcv_state_process(sk
, skb
, skb
->h
.th
, skb
->len
))
1581 TCP_CHECK_TIMER(sk
);
/* Error path: answer an invalid segment with a RST. */
1585 tcp_v4_send_reset(skb
);
1588 /* Be careful here. If this function gets more complicated and
1589 * gcc suffers from register pressure on the x86, sk (in %ebx)
1590 * might be destroyed here. This current version compiles correctly,
1591 * but you have been warned.
1596 TCP_INC_STATS_BH(TcpInErrs
);
/*
 * tcp_v4_rcv(): main entry for TCP segments handed up from IP. Validates the
 * header/checksum, fills the skb control block, looks up the owning socket,
 * and either processes directly, prequeues, or backlogs the segment. Tail
 * sections handle no-socket (send RST) and TIME_WAIT demultiplexing.
 *
 * NOTE(review): many lines were lost in extraction (e.g. 1641-1645 the
 * no_socket check, 1650-1655 bh_lock_sock, the TW switch case labels);
 * structure below is a partial skeleton of the original.
 */
1604 int tcp_v4_rcv(struct sk_buff
*skb
, unsigned short len
)
/* Only segments addressed to this host are processed. */
1610 if (skb
->pkt_type
!=PACKET_HOST
)
1615 /* Pull up the IP header. */
1616 __skb_pull(skb
, skb
->h
.raw
- skb
->data
);
1618 /* Count it even if it's bad */
1619 TCP_INC_STATS_BH(TcpInSegs
);
1621 /* An explanation is required here, I think.
1622 * Packet length and doff are validated by header prediction,
1623 * provided case of th->doff==0 is elimineted.
1624 * So, we defer the checks. */
1625 if (th
->doff
< sizeof(struct tcphdr
)/4 ||
1626 (skb
->ip_summed
!= CHECKSUM_UNNECESSARY
&&
1627 tcp_v4_checksum_init(skb
) < 0))
/* Cache sequence numbers and flags in the skb control block for later stages. */
1630 TCP_SKB_CB(skb
)->seq
= ntohl(th
->seq
);
1631 TCP_SKB_CB(skb
)->end_seq
= (TCP_SKB_CB(skb
)->seq
+ th
->syn
+ th
->fin
+
1633 TCP_SKB_CB(skb
)->ack_seq
= ntohl(th
->ack_seq
);
1634 TCP_SKB_CB(skb
)->when
= 0;
1635 TCP_SKB_CB(skb
)->flags
= skb
->nh
.iph
->tos
;
1636 TCP_SKB_CB(skb
)->sacked
= 0;
/* Demultiplex to the owning socket by 4-tuple plus incoming interface. */
1639 sk
= __tcp_v4_lookup(skb
->nh
.iph
->saddr
, th
->source
,
1640 skb
->nh
.iph
->daddr
, ntohs(th
->dest
), tcp_v4_iif(skb
));
1646 if(!ipsec_sk_policy(sk
,skb
))
1647 goto discard_and_relse
;
1649 if (sk
->state
== TCP_TIME_WAIT
)
/* Socket owned by us (no user lock): try prequeue, else process directly;
 * otherwise defer to the socket backlog. */
1656 if (!sk
->lock
.users
) {
1657 if (!tcp_prequeue(sk
, skb
))
1658 ret
= tcp_v4_do_rcv(sk
, skb
);
1660 sk_add_backlog(sk
, skb
);
/* No matching socket: verify the segment then answer with RST. */
1668 if (len
< (th
->doff
<<2) || tcp_checksum_complete(skb
)) {
1670 TCP_INC_STATS_BH(TcpInErrs
);
1672 tcp_v4_send_reset(skb
);
1676 /* Discard frame. */
/* TIME_WAIT handling: re-validate, then dispatch on the timewait verdict. */
1685 if (len
< (th
->doff
<<2) || tcp_checksum_complete(skb
)) {
1686 TCP_INC_STATS_BH(TcpInErrs
);
1687 goto discard_and_relse
;
1689 switch(tcp_timewait_state_process((struct tcp_tw_bucket
*)sk
,
1690 skb
, th
, skb
->len
)) {
/* TCP_TW_SYN: a new SYN may revive the pair on a listener for this port. */
1695 sk2
= tcp_v4_lookup_listener(skb
->nh
.iph
->daddr
, ntohs(th
->dest
), tcp_v4_iif(skb
));
1697 tcp_tw_deschedule((struct tcp_tw_bucket
*)sk
);
1698 tcp_timewait_kill((struct tcp_tw_bucket
*)sk
);
1699 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
1703 /* Fall through to ACK */
1706 tcp_v4_timewait_ack(sk
, skb
);
1710 case TCP_TW_SUCCESS
:
/*
 * __tcp_v4_rehash(): remove the socket from its hash and (in the original,
 * whose second half was lost in extraction) re-insert it, used after the
 * socket's address identity changed in tcp_v4_reselect_saddr().
 */
1715 /* With per-bucket locks this operation is not-atomic, so that
1716 * this version is not worse.
1718 static void __tcp_v4_rehash(struct sock
*sk
)
1720 sk
->prot
->unhash(sk
);
/* NOTE(review): the matching re-hash call is missing from this fragment. */
/*
 * tcp_v4_reselect_saddr(): re-route the connection and, if the preferred
 * source address changed (e.g. ip_dynaddr after an interface address change),
 * rewrite sk->saddr/rcv_saddr and re-hash the socket under the new identity.
 * Only safe in SYN_SENT (enforced by the caller, tcp_v4_rebuild_header).
 */
1724 static int tcp_v4_reselect_saddr(struct sock
*sk
)
1728 __u32 old_saddr
= sk
->saddr
;
1730 __u32 daddr
= sk
->daddr
;
/* With strict/loose source routing, route towards the first hop, not daddr. */
1732 if(sk
->protinfo
.af_inet
.opt
&& sk
->protinfo
.af_inet
.opt
->srr
)
1733 daddr
= sk
->protinfo
.af_inet
.opt
->faddr
;
1735 /* Query new route. */
1736 err
= ip_route_connect(&rt
, daddr
, 0,
1737 RT_TOS(sk
->protinfo
.af_inet
.tos
)|sk
->localroute
,
1742 __sk_dst_set(sk
, &rt
->u
.dst
);
1743 /* sk->route_caps = rt->u.dst.dev->features; */
1745 new_saddr
= rt
->rt_src
;
/* Nothing to do if routing still picks the same source address. */
1747 if (new_saddr
== old_saddr
)
1750 if (sysctl_ip_dynaddr
> 1) {
1751 printk(KERN_INFO
"tcp_v4_rebuild_header(): shifting sk->saddr "
1752 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1754 NIPQUAD(new_saddr
));
1757 sk
->saddr
= new_saddr
;
1758 sk
->rcv_saddr
= new_saddr
;
1760 /* XXX The only one ugly spot where we need to
1761 * XXX really change the sockets identity after
1762 * XXX it has entered the hashes. -DaveM
1764 * Besides that, it does not check for connection
1765 * uniqueness. Wait for troubles.
1767 __tcp_v4_rehash(sk
);
/*
 * tcp_v4_rebuild_header(): ensure the socket has a valid cached route before
 * transmit. If the cached dst was invalidated, re-resolve it; if routing
 * fails and ip_dynaddr allows, fall back to reselecting the source address.
 *
 * NOTE(review): lines lost in extraction (numbering jumps 1777->1783,
 * 1791->1795, 1803->end): the early-success return and error reporting.
 */
1771 int tcp_v4_rebuild_header(struct sock
*sk
)
1773 struct rtable
*rt
= (struct rtable
*)__sk_dst_check(sk
, 0);
1777 /* Route is OK, nothing to do. */
/* Source routing overrides the routing destination, as in reselect_saddr. */
1783 if(sk
->protinfo
.af_inet
.opt
&& sk
->protinfo
.af_inet
.opt
->srr
)
1784 daddr
= sk
->protinfo
.af_inet
.opt
->faddr
;
1786 err
= ip_route_output(&rt
, daddr
, sk
->saddr
,
1787 RT_TOS(sk
->protinfo
.af_inet
.tos
) | RTO_CONN
| sk
->localroute
,
1790 __sk_dst_set(sk
, &rt
->u
.dst
);
1791 /* sk->route_caps = rt->u.dst.dev->features; */
1795 /* Routing failed... */
1796 /* sk->route_caps = 0; */
/* Reselecting the source address is only permitted for dynaddr sockets still
 * in SYN_SENT whose local address is not explicitly bound. */
1798 if (!sysctl_ip_dynaddr
||
1799 sk
->state
!= TCP_SYN_SENT
||
1800 (sk
->userlocks
& SOCK_BINDADDR_LOCK
) ||
1801 (err
= tcp_v4_reselect_saddr(sk
)) != 0) {
1803 /* sk->error_report(sk); */
1808 static void v4_addr2sockaddr(struct sock
*sk
, struct sockaddr
* uaddr
)
1810 struct sockaddr_in
*sin
= (struct sockaddr_in
*) uaddr
;
1812 sin
->sin_family
= AF_INET
;
1813 sin
->sin_addr
.s_addr
= sk
->daddr
;
1814 sin
->sin_port
= sk
->dport
;
/*
 * tcp_v4_remember_stamp(): record the last timestamp seen from the peer in
 * the inet_peer cache so later connections from the same address can run the
 * PAWS check (see the TW-recycle logic in tcp_v4_conn_request).
 *
 * NOTE(review): the release/return paths were lost in extraction (numbering
 * jumps 1835->1840, 1844->end).
 */
1817 /* VJ's idea. Save last timestamp seen from this destination
1818 * and hold it at least for normal timewait interval to use for duplicate
1819 * segment detection in subsequent connections, before they enter synchronized
1823 int tcp_v4_remember_stamp(struct sock
*sk
)
1825 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
1826 struct rtable
*rt
= (struct rtable
*)__sk_dst_get(sk
);
1827 struct inet_peer
*peer
= NULL
;
/* Without a matching cached route, fetch the peer entry directly; otherwise
 * lazily bind one to the route. */
1830 if (rt
== NULL
|| rt
->rt_dst
!= sk
->daddr
) {
1831 peer
= inet_getpeer(sk
->daddr
, 1);
1834 if (rt
->peer
== NULL
)
1835 rt_bind_peer(rt
, 1);
/* Update only if our timestamp is newer, or the stored one has aged past
 * TCP_PAWS_MSL and is not ahead of what we saw. */
1840 if ((s32
)(peer
->tcp_ts
- tp
->ts_recent
) <= 0 ||
1841 (peer
->tcp_ts_stamp
+ TCP_PAWS_MSL
< xtime
.tv_sec
&&
1842 peer
->tcp_ts_stamp
<= tp
->ts_recent_stamp
)) {
1843 peer
->tcp_ts_stamp
= tp
->ts_recent_stamp
;
1844 peer
->tcp_ts
= tp
->ts_recent
;
/*
 * tcp_v4_tw_remember_stamp(): TIME_WAIT-bucket variant of
 * tcp_v4_remember_stamp(); saves the bucket's last-seen peer timestamp into
 * the inet_peer cache using the same freshness test.
 *
 * NOTE(review): the put/return tail was lost in extraction (1858->1861,
 * 1865->end).
 */
1854 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket
*tw
)
1856 struct inet_peer
*peer
= NULL
;
1858 peer
= inet_getpeer(tw
->daddr
, 1);
/* Same acceptance condition as tcp_v4_remember_stamp(), on tw fields. */
1861 if ((s32
)(peer
->tcp_ts
- tw
->ts_recent
) <= 0 ||
1862 (peer
->tcp_ts_stamp
+ TCP_PAWS_MSL
< xtime
.tv_sec
&&
1863 peer
->tcp_ts_stamp
<= tw
->ts_recent_stamp
)) {
1864 peer
->tcp_ts_stamp
= tw
->ts_recent_stamp
;
1865 peer
->tcp_ts
= tw
->ts_recent
;
/*
 * ipv4_specific: the AF_INET implementation of the tcp_func operations table
 * (header rebuild, connection request, SYN-recv child creation, hashing,
 * timestamp caching, header sizes). Installed per-socket in
 * tcp_v4_init_sock() below.
 *
 * NOTE(review): several members are missing from this fragment (numbering
 * jumps 1874->1877, 1882->1887).
 */
1874 struct tcp_func ipv4_specific
= {
1877 tcp_v4_rebuild_header
,
1878 tcp_v4_conn_request
,
1879 tcp_v4_syn_recv_sock
,
1880 tcp_v4_hash_connecting
,
1881 tcp_v4_remember_stamp
,
1882 sizeof(struct iphdr
),
1887 sizeof(struct sockaddr_in
)
/*
 * tcp_v4_init_sock(): per-socket initialization for a new IPv4 TCP socket:
 * queues, timers, RTO/congestion defaults, buffer sizes, and the
 * ipv4_specific ops table.
 */
1890 /* NOTE: A lot of things set to zero explicitly by call to
1891 * sk_alloc() so need not be done here.
1893 static int tcp_v4_init_sock(struct sock
*sk
)
1895 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1897 skb_queue_head_init(&tp
->out_of_order_queue
);
1898 tcp_init_xmit_timers(sk
);
1899 tcp_prequeue_init(tp
);
/* Conservative initial RTO and deviation until real RTT samples arrive. */
1901 tp
->rto
= TCP_TIMEOUT_INIT
;
1902 tp
->mdev
= TCP_TIMEOUT_INIT
;
1904 /* So many TCP implementations out there (incorrectly) count the
1905 * initial SYN frame in their delayed-ACK and congestion control
1906 * algorithms that we must have the following bandaid to talk
1907 * efficiently to them. -DaveM
1911 /* See draft-stevens-tcpca-spec-01 for discussion of the
1912 * initialization of these values.
1914 tp
->snd_ssthresh
= 0x7fffffff; /* Infinity */
1915 tp
->snd_cwnd_clamp
= ~0;
/* 536 = RFC 1122 default MSS, used until path MSS is learned. */
1916 tp
->mss_cache
= 536;
1918 tp
->reordering
= sysctl_tcp_reordering
;
1920 sk
->state
= TCP_CLOSE
;
1922 sk
->write_space
= tcp_write_space
;
/* Hook in the IPv4-specific function table defined above. */
1924 sk
->tp_pinfo
.af_tcp
.af_specific
= &ipv4_specific
;
/* Default (index [1]) of the sysctl min/default/max triples. */
1926 sk
->sndbuf
= sysctl_tcp_wmem
[1];
1927 sk
->rcvbuf
= sysctl_tcp_rmem
[1];
1929 atomic_inc(&tcp_sockets_allocated
);
/*
 * tcp_v4_destroy_sock(): teardown counterpart of tcp_v4_init_sock(): stop
 * timers, purge the write/out-of-order/prequeue queues, release the bind
 * bucket, and drop the global socket count.
 *
 * NOTE(review): the bind-bucket release call and the return were lost in
 * extraction (numbering jumps 1950->1953, 1953->end).
 */
1934 static int tcp_v4_destroy_sock(struct sock
*sk
)
1936 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1938 tcp_clear_xmit_timers(sk
);
1940 /* Cleanup up the write buffer. */
1941 tcp_writequeue_purge(sk
);
1943 /* Cleans up our, hopefuly empty, out_of_order_queue. */
1944 __skb_queue_purge(&tp
->out_of_order_queue
);
1946 /* Clean prequeue, it must be empty really */
1947 __skb_queue_purge(&tp
->ucopy
.prequeue
);
1949 /* Clean up a referenced TCP bind bucket. */
1950 if(sk
->prev
!= NULL
)
1953 atomic_dec(&tcp_sockets_allocated
);
/*
 * get_openreq(): format one pending open_request as a /proc/net/tcp row into
 * tmpbuf (addresses/ports, remaining expire time, uid, refcount).
 *
 * NOTE(review): several sprintf arguments were lost in extraction (numbering
 * jumps 1964->1966, 1966->1968, 1972->1976, 1978->end).
 */
1958 /* Proc filesystem TCP sock list dumping. */
1959 static void get_openreq(struct sock
*sk
, struct open_request
*req
, char *tmpbuf
, int i
, int uid
)
/* Remaining time until the open_request expires, in jiffies. */
1961 int ttd
= req
->expires
- jiffies
;
1963 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
1964 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
1966 req
->af
.v4_req
.loc_addr
,
1968 req
->af
.v4_req
.rmt_addr
,
1969 ntohs(req
->rmt_port
),
1971 0,0, /* could print option size, but that is af dependent. */
1972 1, /* timers active (only the expire timer) */
1976 0, /* non standard timer */
1977 0, /* open_requests have no inode */
1978 atomic_read(&sk
->refcnt
),
/*
 * get_tcp_sock(): format one live TCP socket as a /proc/net/tcp row:
 * endpoints, state, queue depths, active timer and its expiry, plus extended
 * debug fields (rto, ato, cwnd, ssthresh).
 *
 * NOTE(review): some declarations and sprintf arguments were lost in
 * extraction (e.g. dest/src assignments at orig 1991, timer_active values).
 */
1983 static void get_tcp_sock(struct sock
*sp
, char *tmpbuf
, int i
)
1985 unsigned int dest
, src
;
1988 unsigned long timer_expires
;
1989 struct tcp_opt
*tp
= &sp
->tp_pinfo
.af_tcp
;
1992 src
= sp
->rcv_saddr
;
1993 destp
= ntohs(sp
->dport
);
1994 srcp
= ntohs(sp
->sport
);
/* Pick which timer (retransmit, zero-window probe, or generic) to report. */
1995 if (tp
->pending
== TCP_TIME_RETRANS
) {
1997 timer_expires
= tp
->timeout
;
1998 } else if (tp
->pending
== TCP_TIME_PROBE0
) {
2000 timer_expires
= tp
->timeout
;
2001 } else if (timer_pending(&sp
->timer
)) {
2003 timer_expires
= sp
->timer
.expires
;
2006 timer_expires
= jiffies
;
2009 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
2010 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2011 i
, src
, srcp
, dest
, destp
, sp
->state
,
2012 tp
->write_seq
-tp
->snd_una
, tp
->rcv_nxt
-tp
->copied_seq
,
2013 timer_active
, timer_expires
-jiffies
,
2018 atomic_read(&sp
->refcnt
), sp
,
2019 tp
->rto
, tp
->ack
.ato
, (tp
->ack
.quick
<<1)|tp
->ack
.pingpong
,
2020 tp
->snd_cwnd
, tp
->snd_ssthresh
>=0xFFFF?-1:tp
->snd_ssthresh
/*
 * get_timewait_sock(): format one TIME_WAIT bucket as a /proc/net/tcp row;
 * queue fields are reported as zero since a TW bucket carries no data.
 *
 * NOTE(review): lines lost in extraction (numbering jumps 2028->2034,
 * 2040->2042): the ttd clamp and dest/src assignments.
 */
2024 static void get_timewait_sock(struct tcp_tw_bucket
*tw
, char *tmpbuf
, int i
)
2026 unsigned int dest
, src
;
/* Remaining TIME_WAIT duration, in jiffies. */
2028 int ttd
= tw
->ttd
- jiffies
;
2034 src
= tw
->rcv_saddr
;
2035 destp
= ntohs(tw
->dport
);
2036 srcp
= ntohs(tw
->sport
);
2038 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
2039 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2040 i
, src
, srcp
, dest
, destp
, tw
->substate
, 0, 0,
2042 atomic_read(&tw
->refcnt
), tw
);
/*
 * tcp_get_info(): /proc read handler that dumps all TCP sockets into buffer,
 * honoring the (offset, length) window of the procfs read protocol. Walks
 * three populations in order: listening sockets (plus their pending
 * open_requests), established sockets, and TIME_WAIT buckets.
 *
 * NOTE(review): many control-flow lines (pos accounting, continue/goto out,
 * loop closes) were lost in extraction; numbering jumps throughout.
 */
2047 int tcp_get_info(char *buffer
, char **start
, off_t offset
, int length
)
2049 int len
= 0, num
= 0, i
;
2050 off_t begin
, pos
= 0;
2051 char tmpbuf
[TMPSZ
+1];
/* Fixed-width header row so every record is TMPSZ bytes. */
2054 len
+= sprintf(buffer
, "%-*s\n", TMPSZ
-1,
2055 " sl local_address rem_address st tx_queue "
2056 "rx_queue tr tm->when retrnsmt uid timeout inode");
2060 /* First, walk listening socket table. */
2062 for(i
= 0; i
< TCP_LHTABLE_SIZE
; i
++) {
2063 struct sock
*sk
= tcp_listening_hash
[i
];
2064 struct tcp_listen_opt
*lopt
;
2067 for (sk
= tcp_listening_hash
[i
]; sk
; sk
= sk
->next
, num
++) {
2068 struct open_request
*req
;
2070 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
/* Skip non-IPv4 (e.g. mapped IPv6) sockets in this dump. */
2072 if (!TCP_INET_FAMILY(sk
->family
))
2076 if (pos
>= offset
) {
2077 get_tcp_sock(sk
, tmpbuf
, num
);
2078 len
+= sprintf(buffer
+len
, "%-*s\n", TMPSZ
-1, tmpbuf
);
2079 if (len
>= length
) {
2080 tcp_listen_unlock();
/* Dump this listener's pending open_requests under the syn_wait lock. */
2086 uid
= sock_i_uid(sk
);
2087 read_lock_bh(&tp
->syn_wait_lock
);
2088 lopt
= tp
->listen_opt
;
2089 if (lopt
&& lopt
->qlen
!= 0) {
2090 for (k
=0; k
<TCP_SYNQ_HSIZE
; k
++) {
2091 for (req
= lopt
->syn_table
[k
]; req
; req
= req
->dl_next
, num
++) {
2092 if (!TCP_INET_FAMILY(req
->class->family
))
2098 get_openreq(sk
, req
, tmpbuf
, num
, uid
);
2099 len
+= sprintf(buffer
+len
, "%-*s\n", TMPSZ
-1, tmpbuf
);
2101 read_unlock_bh(&tp
->syn_wait_lock
);
2102 tcp_listen_unlock();
2108 read_unlock_bh(&tp
->syn_wait_lock
);
2110 /* Completed requests are in normal socket hash table */
2113 tcp_listen_unlock();
2117 /* Next, walk established hash chain. */
2118 for (i
= 0; i
< tcp_ehash_size
; i
++) {
2119 struct tcp_ehash_bucket
*head
= &tcp_ehash
[i
];
2121 struct tcp_tw_bucket
*tw
;
2123 read_lock(&head
->lock
);
2124 for(sk
= head
->chain
; sk
; sk
= sk
->next
, num
++) {
2125 if (!TCP_INET_FAMILY(sk
->family
))
2130 get_tcp_sock(sk
, tmpbuf
, num
);
2131 len
+= sprintf(buffer
+len
, "%-*s\n", TMPSZ
-1, tmpbuf
);
2133 read_unlock(&head
->lock
);
/* TIME_WAIT buckets live in the upper half of the ehash table. */
2137 for (tw
= (struct tcp_tw_bucket
*)tcp_ehash
[i
+tcp_ehash_size
].chain
;
2139 tw
= (struct tcp_tw_bucket
*)tw
->next
, num
++) {
2140 if (!TCP_INET_FAMILY(tw
->family
))
2145 get_timewait_sock(tw
, tmpbuf
, num
);
2146 len
+= sprintf(buffer
+len
, "%-*s\n", TMPSZ
-1, tmpbuf
);
2148 read_unlock(&head
->lock
);
2152 read_unlock(&head
->lock
);
/* Translate the absolute position into the procfs (start, len) return. */
2159 begin
= len
- (pos
- offset
);
2160 *start
= buffer
+ begin
;
/*
 * tcp_prot: the protocol operations table registered for IPPROTO_TCP,
 * wiring socket-layer calls to the TCP implementations (designated
 * initializers, gcc `field:` syntax of the era).
 *
 * NOTE(review): several members (e.g. name, close, hash/unhash) are missing
 * from this fragment (numbering jumps 2169->2172, 2183->2186, 2186->end).
 */
2169 struct proto tcp_prot
= {
2172 connect
: tcp_v4_connect
,
2173 disconnect
: tcp_disconnect
,
2176 init
: tcp_v4_init_sock
,
2177 destroy
: tcp_v4_destroy_sock
,
2178 shutdown
: tcp_shutdown
,
2179 setsockopt
: tcp_setsockopt
,
2180 getsockopt
: tcp_getsockopt
,
2181 sendmsg
: tcp_sendmsg
,
2182 recvmsg
: tcp_recvmsg
,
2183 backlog_rcv
: tcp_v4_do_rcv
,
2186 get_port
: tcp_v4_get_port
,
2191 void __init
tcp_v4_init(struct net_proto_family
*ops
)
2195 tcp_inode
.i_mode
= S_IFSOCK
;
2196 tcp_inode
.i_sock
= 1;
2197 tcp_inode
.i_uid
= 0;
2198 tcp_inode
.i_gid
= 0;
2199 init_waitqueue_head(&tcp_inode
.i_wait
);
2200 init_waitqueue_head(&tcp_inode
.u
.socket_i
.wait
);
2202 tcp_socket
->inode
= &tcp_inode
;
2203 tcp_socket
->state
= SS_UNCONNECTED
;
2204 tcp_socket
->type
=SOCK_RAW
;
2206 if ((err
=ops
->create(tcp_socket
, IPPROTO_TCP
))<0)
2207 panic("Failed to create the TCP control socket.\n");
2208 tcp_socket
->sk
->allocation
=GFP_ATOMIC
;
2209 tcp_socket
->sk
->protinfo
.af_inet
.ttl
= MAXTTL
;
2211 /* Unhash it so that IP input processing does not even
2212 * see it, we do not wish this socket to see incoming
2215 tcp_socket
->sk
->prot
->unhash(tcp_socket
->sk
);