net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
  42  *                                      Added new listen sematics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55 #include <linux/config.h>
  56
  57 #include <linux/types.h>
  58 #include <linux/fcntl.h>
  59 #include <linux/module.h>
  60 #include <linux/random.h>
  61 #include <linux/cache.h>
  62 #include <linux/jhash.h>
  63 #include <linux/init.h>
  64 #include <linux/times.h>
  65
  66 #include <net/icmp.h>
  67 #include <net/tcp.h>
  68 #include <net/ipv6.h>
  69 #include <net/inet_common.h>
  70 #include <net/xfrm.h>
  71
  72 #include <linux/inet.h>
  73 #include <linux/ipv6.h>
  74 #include <linux/stddef.h>
  75 #include <linux/proc_fs.h>
  76 #include <linux/seq_file.h>
  77
  78 extern int sysctl_ip_dynaddr;
  79 int sysctl_tcp_tw_reuse;
  80 int sysctl_tcp_low_latency;
  81
  82 /* Check TCP sequence numbers in ICMP packets. */
  83 #define ICMP_MIN_LENGTH 8
  84
  85 /* Socket used for sending RSTs */
  86 static struct socket *tcp_socket;
  87
  88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  89                        struct sk_buff *skb);
  90
  91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
  92         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
  93         .__tcp_lhash_users      =       ATOMIC_INIT(0),
  94         .__tcp_lhash_wait
  95           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
  96         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
  97 };
  98
  99 /*
 100  * This array holds the first and last local port number.
 101  * For high-usage systems, use sysctl to change this to
 102  * 32768-61000
 103  */
 104 int sysctl_local_port_range[2] = { 1024, 4999 };
 105 int tcp_port_rover = 1024 - 1;
 106
 107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 108                                  __u32 faddr, __u16 fport)
 109 {
 110         int h = (laddr ^ lport) ^ (faddr ^ fport);
 111         h ^= h >> 16;
 112         h ^= h >> 8;
 113         return h & (tcp_ehash_size - 1);
 114 }
 115
 116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
 117 {
 118         struct inet_sock *inet = inet_sk(sk);
 119         __u32 laddr = inet->rcv_saddr;
 120         __u16 lport = inet->num;
 121         __u32 faddr = inet->daddr;
 122         __u16 fport = inet->dport;
 123
 124         return tcp_hashfn(laddr, lport, faddr, fport);
 125 }
 126
 127 /* Allocate and initialize a new TCP local port bind bucket.
 128  * The bindhash mutex for snum's hash chain must be held here.
 129  */
 130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
 131                                           unsigned short snum)
 132 {
 133         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
 134                                                       SLAB_ATOMIC);
 135         if (tb) {
 136                 tb->port = snum;
 137                 tb->fastreuse = 0;
 138                 INIT_HLIST_HEAD(&tb->owners);
 139                 hlist_add_head(&tb->node, &head->chain);
 140         }
 141         return tb;
 142 }
 143
 144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
 145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
 146 {
 147         if (hlist_empty(&tb->owners)) {
 148                 __hlist_del(&tb->node);
 149                 kmem_cache_free(tcp_bucket_cachep, tb);
 150         }
 151 }
 152
 153 /* Caller must disable local BH processing. */
 154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
 155 {
 156         struct tcp_bind_hashbucket *head =
 157                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
 158         struct tcp_bind_bucket *tb;
 159
 160         spin_lock(&head->lock);
 161         tb = tcp_sk(sk)->bind_hash;
 162         sk_add_bind_node(child, &tb->owners);
 163         tcp_sk(child)->bind_hash = tb;
 164         spin_unlock(&head->lock);
 165 }
 166
 167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
 168 {
 169         local_bh_disable();
 170         __tcp_inherit_port(sk, child);
 171         local_bh_enable();
 172 }
 173
 174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
 175                    unsigned short snum)
 176 {
 177         inet_sk(sk)->num = snum;
 178         sk_add_bind_node(sk, &tb->owners);
 179         tcp_sk(sk)->bind_hash = tb;
 180 }
 181
 182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
 183 {
 184         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
 185         struct sock *sk2;
 186         struct hlist_node *node;
 187         int reuse = sk->sk_reuse;
 188
 189         sk_for_each_bound(sk2, node, &tb->owners) {
 190                 if (sk != sk2 &&
 191                     !tcp_v6_ipv6only(sk2) &&
 192                     (!sk->sk_bound_dev_if ||
 193                      !sk2->sk_bound_dev_if ||
 194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
 195                         if (!reuse || !sk2->sk_reuse ||
 196                             sk2->sk_state == TCP_LISTEN) {
 197                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
 198                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
 199                                     sk2_rcv_saddr == sk_rcv_saddr)
 200                                         break;
 201                         }
 202                 }
 203         }
 204         return node != NULL;
 205 }
 206
 207 /* Obtain a reference to a local port for the given sock,
 208  * if snum is zero it means select any available local port.
 209  */
 210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 211 {
 212         struct tcp_bind_hashbucket *head;
 213         struct hlist_node *node;
 214         struct tcp_bind_bucket *tb;
 215         int ret;
 216
 217         local_bh_disable();
 218         if (!snum) {
 219                 int low = sysctl_local_port_range[0];
 220                 int high = sysctl_local_port_range[1];
 221                 int remaining = (high - low) + 1;
 222                 int rover;
 223
 224                 spin_lock(&tcp_portalloc_lock);
 225                 if (tcp_port_rover < low)
 226                         rover = low;
 227                 else
 228                         rover = tcp_port_rover;
 229                 do {
 230                         rover++;
 231                         if (rover > high)
 232                                 rover = low;
 233                         head = &tcp_bhash[tcp_bhashfn(rover)];
 234                         spin_lock(&head->lock);
 235                         tb_for_each(tb, node, &head->chain)
 236                                 if (tb->port == rover)
 237                                         goto next;
 238                         break;
 239                 next:
 240                         spin_unlock(&head->lock);
 241                 } while (--remaining > 0);
 242                 tcp_port_rover = rover;
 243                 spin_unlock(&tcp_portalloc_lock);
 244
 245                 /* Exhausted local port range during search? */
 246                 ret = 1;
 247                 if (remaining <= 0)
 248                         goto fail;
 249
 250                 /* OK, here is the one we will use.  HEAD is
 251                  * non-NULL and we hold it's mutex.
 252                  */
 253                 snum = rover;
 254         } else {
 255                 head = &tcp_bhash[tcp_bhashfn(snum)];
 256                 spin_lock(&head->lock);
 257                 tb_for_each(tb, node, &head->chain)
 258                         if (tb->port == snum)
 259                                 goto tb_found;
 260         }
 261         tb = NULL;
 262         goto tb_not_found;
 263 tb_found:
 264         if (!hlist_empty(&tb->owners)) {
 265                 if (sk->sk_reuse > 1)
 266                         goto success;
 267                 if (tb->fastreuse > 0 &&
 268                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
 269                         goto success;
 270                 } else {
 271                         ret = 1;
 272                         if (tcp_bind_conflict(sk, tb))
 273                                 goto fail_unlock;
 274                 }
 275         }
 276 tb_not_found:
 277         ret = 1;
 278         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
 279                 goto fail_unlock;
 280         if (hlist_empty(&tb->owners)) {
 281                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 282                         tb->fastreuse = 1;
 283                 else
 284                         tb->fastreuse = 0;
 285         } else if (tb->fastreuse &&
 286                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 287                 tb->fastreuse = 0;
 288 success:
 289         if (!tcp_sk(sk)->bind_hash)
 290                 tcp_bind_hash(sk, tb, snum);
 291         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
 292         ret = 0;
 293
 294 fail_unlock:
 295         spin_unlock(&head->lock);
 296 fail:
 297         local_bh_enable();
 298         return ret;
 299 }
 300
 301 /* Get rid of any references to a local port held by the
 302  * given sock.
 303  */
 304 static void __tcp_put_port(struct sock *sk)
 305 {
 306         struct inet_sock *inet = inet_sk(sk);
 307         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
 308         struct tcp_bind_bucket *tb;
 309
 310         spin_lock(&head->lock);
 311         tb = tcp_sk(sk)->bind_hash;
 312         __sk_del_bind_node(sk);
 313         tcp_sk(sk)->bind_hash = NULL;
 314         inet->num = 0;
 315         tcp_bucket_destroy(tb);
 316         spin_unlock(&head->lock);
 317 }
 318
 319 void tcp_put_port(struct sock *sk)
 320 {
 321         local_bh_disable();
 322         __tcp_put_port(sk);
 323         local_bh_enable();
 324 }
 325
 326 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 327  * Look, when several writers sleep and reader wakes them up, all but one
 328  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 329  * this, _but_ remember, it adds useless work on UP machines (wake up each
 330  * exclusive lock release). It should be ifdefed really.
 331  */
 332
 333 void tcp_listen_wlock(void)
 334 {
 335         write_lock(&tcp_lhash_lock);
 336
 337         if (atomic_read(&tcp_lhash_users)) {
 338                 DEFINE_WAIT(wait);
 339
 340                 for (;;) {
 341                         prepare_to_wait_exclusive(&tcp_lhash_wait,
 342                                                 &wait, TASK_UNINTERRUPTIBLE);
 343                         if (!atomic_read(&tcp_lhash_users))
 344                                 break;
 345                         write_unlock_bh(&tcp_lhash_lock);
 346                         schedule();
 347                         write_lock_bh(&tcp_lhash_lock);
 348                 }
 349
 350                 finish_wait(&tcp_lhash_wait, &wait);
 351         }
 352 }
 353
 354 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
 355 {
 356         struct hlist_head *list;
 357         rwlock_t *lock;
 358
 359         BUG_TRAP(sk_unhashed(sk));
 360         if (listen_possible && sk->sk_state == TCP_LISTEN) {
 361                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 362                 lock = &tcp_lhash_lock;
 363                 tcp_listen_wlock();
 364         } else {
 365                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
 366                 lock = &tcp_ehash[sk->sk_hashent].lock;
 367                 write_lock(lock);
 368         }
 369         __sk_add_node(sk, list);
 370         sock_prot_inc_use(sk->sk_prot);
 371         write_unlock(lock);
 372         if (listen_possible && sk->sk_state == TCP_LISTEN)
 373                 wake_up(&tcp_lhash_wait);
 374 }
 375
 376 static void tcp_v4_hash(struct sock *sk)
 377 {
 378         if (sk->sk_state != TCP_CLOSE) {
 379                 local_bh_disable();
 380                 __tcp_v4_hash(sk, 1);
 381                 local_bh_enable();
 382         }
 383 }
 384
 385 void tcp_unhash(struct sock *sk)
 386 {
 387         rwlock_t *lock;
 388
 389         if (sk_unhashed(sk))
 390                 goto ende;
 391
 392         if (sk->sk_state == TCP_LISTEN) {
 393                 local_bh_disable();
 394                 tcp_listen_wlock();
 395                 lock = &tcp_lhash_lock;
 396         } else {
 397                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
 398                 lock = &head->lock;
 399                 write_lock_bh(&head->lock);
 400         }
 401
 402         if (__sk_del_node_init(sk))
 403                 sock_prot_dec_use(sk->sk_prot);
 404         write_unlock_bh(lock);
 405
 406  ende:
 407         if (sk->sk_state == TCP_LISTEN)
 408                 wake_up(&tcp_lhash_wait);
 409 }
 410
 411 /* Don't inline this cruft.  Here are some nice properties to
 412  * exploit here.  The BSD API does not allow a listening TCP
 413  * to specify the remote port nor the remote address for the
 414  * connection.  So always assume those are both wildcarded
 415  * during the search since they can never be otherwise.
 416  */
 417 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
 418                                              unsigned short hnum, int dif)
 419 {
 420         struct sock *result = NULL, *sk;
 421         struct hlist_node *node;
 422         int score, hiscore;
 423
 424         hiscore=-1;
 425         sk_for_each(sk, node, head) {
 426                 struct inet_sock *inet = inet_sk(sk);
 427
 428                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
 429                         __u32 rcv_saddr = inet->rcv_saddr;
 430
 431                         score = (sk->sk_family == PF_INET ? 1 : 0);
 432                         if (rcv_saddr) {
 433                                 if (rcv_saddr != daddr)
 434                                         continue;
 435                                 score+=2;
 436                         }
 437                         if (sk->sk_bound_dev_if) {
 438                                 if (sk->sk_bound_dev_if != dif)
 439                                         continue;
 440                                 score+=2;
 441                         }
 442                         if (score == 5)
 443                                 return sk;
 444                         if (score > hiscore) {
 445                                 hiscore = score;
 446                                 result = sk;
 447                         }
 448                 }
 449         }
 450         return result;
 451 }
 452
 453 /* Optimize the common listener case. */
 454 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
 455                 unsigned short hnum, int dif)
 456 {
 457         struct sock *sk = NULL;
 458         struct hlist_head *head;
 459
 460         read_lock(&tcp_lhash_lock);
 461         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
 462         if (!hlist_empty(head)) {
 463                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
 464
 465                 if (inet->num == hnum && !sk->sk_node.next &&
 466                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 467                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 468                     !sk->sk_bound_dev_if)
 469                         goto sherry_cache;
 470                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
 471         }
 472         if (sk) {
 473 sherry_cache:
 474                 sock_hold(sk);
 475         }
 476         read_unlock(&tcp_lhash_lock);
 477         return sk;
 478 }
 479
 480 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 481  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 482  *
 483  * Local BH must be disabled here.
 484  */
 485
 486 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
 487                                                        u32 daddr, u16 hnum,
 488                                                        int dif)
 489 {
 490         struct tcp_ehash_bucket *head;
 491         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 492         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
 493         struct sock *sk;
 494         struct hlist_node *node;
 495         /* Optimize here for direct hit, only listening connections can
 496          * have wildcards anyways.
 497          */
 498         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
 499         head = &tcp_ehash[hash];
 500         read_lock(&head->lock);
 501         sk_for_each(sk, node, &head->chain) {
 502                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 503                         goto hit; /* You sunk my battleship! */
 504         }
 505
 506         /* Must check for a TIME_WAIT'er before going to listener hash. */
 507         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
 508                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
 509                         goto hit;
 510         }
 511         sk = NULL;
 512 out:
 513         read_unlock(&head->lock);
 514         return sk;
 515 hit:
 516         sock_hold(sk);
 517         goto out;
 518 }
 519
 520 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 521                                            u32 daddr, u16 hnum, int dif)
 522 {
 523         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
 524                                                       daddr, hnum, dif);
 525
 526         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
 527 }
 528
 529 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
 530                                   u16 dport, int dif)
 531 {
 532         struct sock *sk;
 533
 534         local_bh_disable();
 535         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
 536         local_bh_enable();
 537
 538         return sk;
 539 }
 540
 541 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
 542
 543 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 544 {
 545         return secure_tcp_sequence_number(skb->nh.iph->daddr,
 546                                           skb->nh.iph->saddr,
 547                                           skb->h.th->dest,
 548                                           skb->h.th->source);
 549 }
 550
 551 /* called with local bh disabled */
 552 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 553                                       struct tcp_tw_bucket **twp)
 554 {
 555         struct inet_sock *inet = inet_sk(sk);
 556         u32 daddr = inet->rcv_saddr;
 557         u32 saddr = inet->daddr;
 558         int dif = sk->sk_bound_dev_if;
 559         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 560         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
 561         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
 562         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
 563         struct sock *sk2;
 564         struct hlist_node *node;
 565         struct tcp_tw_bucket *tw;
 566
 567         write_lock(&head->lock);
 568
 569         /* Check TIME-WAIT sockets first. */
 570         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
 571                 tw = (struct tcp_tw_bucket *)sk2;
 572
 573                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
 574                         struct tcp_sock *tp = tcp_sk(sk);
 575
 576                         /* With PAWS, it is safe from the viewpoint
 577                            of data integrity. Even without PAWS it
 578                            is safe provided sequence spaces do not
 579                            overlap i.e. at data rates <= 80Mbit/sec.
 580
 581                            Actually, the idea is close to VJ's one,
 582                            only timestamp cache is held not per host,
 583                            but per port pair and TW bucket is used
 584                            as state holder.
 585
 586                            If TW bucket has been already destroyed we
 587                            fall back to VJ's scheme and use initial
 588                            timestamp retrieved from peer table.
 589                          */
 590                         if (tw->tw_ts_recent_stamp &&
 591                             (!twp || (sysctl_tcp_tw_reuse &&
 592                                       xtime.tv_sec -
 593                                       tw->tw_ts_recent_stamp > 1))) {
 594                                 if ((tp->write_seq =
 595                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
 596                                         tp->write_seq = 1;
 597                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
 598                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
 599                                 sock_hold(sk2);
 600                                 goto unique;
 601                         } else
 602                                 goto not_unique;
 603                 }
 604         }
 605         tw = NULL;
 606
 607         /* And established part... */
 608         sk_for_each(sk2, node, &head->chain) {
 609                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 610                         goto not_unique;
 611         }
 612
 613 unique:
 614         /* Must record num and sport now. Otherwise we will see
 615          * in hash table socket with a funny identity. */
 616         inet->num = lport;
 617         inet->sport = htons(lport);
 618         sk->sk_hashent = hash;
 619         BUG_TRAP(sk_unhashed(sk));
 620         __sk_add_node(sk, &head->chain);
 621         sock_prot_inc_use(sk->sk_prot);
 622         write_unlock(&head->lock);
 623
 624         if (twp) {
 625                 *twp = tw;
 626                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 627         } else if (tw) {
 628                 /* Silly. Should hash-dance instead... */
 629                 tcp_tw_deschedule(tw);
 630                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 631
 632                 tcp_tw_put(tw);
 633         }
 634
 635         return 0;
 636
 637 not_unique:
 638         write_unlock(&head->lock);
 639         return -EADDRNOTAVAIL;
 640 }
 641
 642 static inline u32 connect_port_offset(const struct sock *sk)
 643 {
 644         const struct inet_sock *inet = inet_sk(sk);
 645
 646         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
 647                                          inet->dport);
 648 }
 649
 650 /*
 651  * Bind a port for a connect operation and hash it.
 652  */
 653 static inline int tcp_v4_hash_connect(struct sock *sk)
 654 {
 655         unsigned short snum = inet_sk(sk)->num;
 656         struct tcp_bind_hashbucket *head;
 657         struct tcp_bind_bucket *tb;
 658         int ret;
 659
 660         if (!snum) {
 661                 int low = sysctl_local_port_range[0];
 662                 int high = sysctl_local_port_range[1];
 663                 int range = high - low;
 664                 int i;
 665                 int port;
 666                 static u32 hint;
 667                 u32 offset = hint + connect_port_offset(sk);
 668                 struct hlist_node *node;
 669                 struct tcp_tw_bucket *tw = NULL;
 670
 671                 local_bh_disable();
 672                 for (i = 1; i <= range; i++) {
 673                         port = low + (i + offset) % range;
 674                         head = &tcp_bhash[tcp_bhashfn(port)];
 675                         spin_lock(&head->lock);
 676
 677                         /* Does not bother with rcv_saddr checks,
 678                          * because the established check is already
 679                          * unique enough.
 680                          */
 681                         tb_for_each(tb, node, &head->chain) {
 682                                 if (tb->port == port) {
 683                                         BUG_TRAP(!hlist_empty(&tb->owners));
 684                                         if (tb->fastreuse >= 0)
 685                                                 goto next_port;
 686                                         if (!__tcp_v4_check_established(sk,
 687                                                                         port,
 688                                                                         &tw))
 689                                                 goto ok;
 690                                         goto next_port;
 691                                 }
 692                         }
 693
 694                         tb = tcp_bucket_create(head, port);
 695                         if (!tb) {
 696                                 spin_unlock(&head->lock);
 697                                 break;
 698                         }
 699                         tb->fastreuse = -1;
 700                         goto ok;
 701
 702                 next_port:
 703                         spin_unlock(&head->lock);
 704                 }
 705                 local_bh_enable();
 706
 707                 return -EADDRNOTAVAIL;
 708
 709 ok:
 710                 hint += i;
 711
 712                 /* Head lock still held and bh's disabled */
 713                 tcp_bind_hash(sk, tb, port);
 714                 if (sk_unhashed(sk)) {
 715                         inet_sk(sk)->sport = htons(port);
 716                         __tcp_v4_hash(sk, 0);
 717                 }
 718                 spin_unlock(&head->lock);
 719
 720                 if (tw) {
 721                         tcp_tw_deschedule(tw);
 722                         tcp_tw_put(tw);
 723                 }
 724
 725                 ret = 0;
 726                 goto out;
 727         }
 728
 729         head  = &tcp_bhash[tcp_bhashfn(snum)];
 730         tb  = tcp_sk(sk)->bind_hash;
 731         spin_lock_bh(&head->lock);
 732         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 733                 __tcp_v4_hash(sk, 0);
 734                 spin_unlock_bh(&head->lock);
 735                 return 0;
 736         } else {
 737                 spin_unlock(&head->lock);
 738                 /* No definite answer... Walk to established hash table */
 739                 ret = __tcp_v4_check_established(sk, snum, NULL);
 740 out:
 741                 local_bh_enable();
 742                 return ret;
 743         }
 744 }
 745
 746 /* This will initiate an outgoing connection. */
 747 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 748 {
 749         struct inet_sock *inet = inet_sk(sk);
 750         struct tcp_sock *tp = tcp_sk(sk);
 751         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 752         struct rtable *rt;
 753         u32 daddr, nexthop;
 754         int tmp;
 755         int err;
 756
 757         if (addr_len < sizeof(struct sockaddr_in))
 758                 return -EINVAL;
 759
 760         if (usin->sin_family != AF_INET)
 761                 return -EAFNOSUPPORT;
 762
 763         nexthop = daddr = usin->sin_addr.s_addr;
 764         if (inet->opt && inet->opt->srr) {
 765                 if (!daddr)
 766                         return -EINVAL;
 767                 nexthop = inet->opt->faddr;
 768         }
 769
 770         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 771                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 772                                IPPROTO_TCP,
 773                                inet->sport, usin->sin_port, sk);
 774         if (tmp < 0)
 775                 return tmp;
 776
 777         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 778                 ip_rt_put(rt);
 779                 return -ENETUNREACH;
 780         }
 781
 782         if (!inet->opt || !inet->opt->srr)
 783                 daddr = rt->rt_dst;
 784
 785         if (!inet->saddr)
 786                 inet->saddr = rt->rt_src;
 787         inet->rcv_saddr = inet->saddr;
 788
 789         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 790                 /* Reset inherited state */
 791                 tp->rx_opt.ts_recent       = 0;
 792                 tp->rx_opt.ts_recent_stamp = 0;
 793                 tp->write_seq              = 0;
 794         }
 795
 796         if (sysctl_tcp_tw_recycle &&
 797             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 798                 struct inet_peer *peer = rt_get_peer(rt);
 799
 800                 /* VJ's idea. We save last timestamp seen from
 801                  * the destination in peer table, when entering state TIME-WAIT
 802                  * and initialize rx_opt.ts_recent from it, when trying new connection.
 803                  */
 804
 805                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 806                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 807                         tp->rx_opt.ts_recent = peer->tcp_ts;
 808                 }
 809         }
 810
 811         inet->dport = usin->sin_port;
 812         inet->daddr = daddr;
 813
 814         tp->ext_header_len = 0;
 815         if (inet->opt)
 816                 tp->ext_header_len = inet->opt->optlen;
 817
 818         tp->rx_opt.mss_clamp = 536;
 819
 820         /* Socket identity is still unknown (sport may be zero).
 821          * However we set state to SYN-SENT and not releasing socket
 822          * lock select source port, enter ourselves into the hash tables and
 823          * complete initialization after this.
 824          */
 825         tcp_set_state(sk, TCP_SYN_SENT);
 826         err = tcp_v4_hash_connect(sk);
 827         if (err)
 828                 goto failure;
 829
 830         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 831         if (err)
 832                 goto failure;
 833
 834         /* OK, now commit destination to socket.  */
 835         __sk_dst_set(sk, &rt->u.dst);
 836         tcp_v4_setup_caps(sk, &rt->u.dst);
 837
 838         if (!tp->write_seq)
 839                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 840                                                            inet->daddr,
 841                                                            inet->sport,
 842                                                            usin->sin_port);
 843
 844         inet->id = tp->write_seq ^ jiffies;
 845
 846         err = tcp_connect(sk);
 847         rt = NULL;
 848         if (err)
 849                 goto failure;
 850
 851         return 0;
 852
 853 failure:
 854         /* This unhashes the socket and releases the local port, if necessary. */
 855         tcp_set_state(sk, TCP_CLOSE);
 856         ip_rt_put(rt);
 857         sk->sk_route_caps = 0;
 858         inet->dport = 0;
 859         return err;
 860 }
 861
 862 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
 863 {
 864         return ((struct rtable *)skb->dst)->rt_iif;
 865 }
 866
 867 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
 868 {
 869         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
 870 }
 871
 872 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
 873                                               struct request_sock ***prevp,
 874                                               __u16 rport,
 875                                               __u32 raddr, __u32 laddr)
 876 {
 877         struct listen_sock *lopt = tp->accept_queue.listen_opt;
 878         struct request_sock *req, **prev;
 879
 880         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 881              (req = *prev) != NULL;
 882              prev = &req->dl_next) {
 883                 const struct inet_request_sock *ireq = inet_rsk(req);
 884
 885                 if (ireq->rmt_port == rport &&
 886                     ireq->rmt_addr == raddr &&
 887                     ireq->loc_addr == laddr &&
 888                     TCP_INET_FAMILY(req->rsk_ops->family)) {
 889                         BUG_TRAP(!req->sk);
 890                         *prevp = prev;
 891                         break;
 892                 }
 893         }
 894
 895         return req;
 896 }
 897
 898 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
 899 {
 900         struct tcp_sock *tp = tcp_sk(sk);
 901         struct listen_sock *lopt = tp->accept_queue.listen_opt;
 902         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 903
 904         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
 905         tcp_synq_added(sk);
 906 }
 907
 908
 909 /*
 910  * This routine does path mtu discovery as defined in RFC1191.
 911  */
 912 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 913                                      u32 mtu)
 914 {
 915         struct dst_entry *dst;
 916         struct inet_sock *inet = inet_sk(sk);
 917         struct tcp_sock *tp = tcp_sk(sk);
 918
 919         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 920          * send out by Linux are always <576bytes so they should go through
 921          * unfragmented).
 922          */
 923         if (sk->sk_state == TCP_LISTEN)
 924                 return;
 925
 926         /* We don't check in the destentry if pmtu discovery is forbidden
 927          * on this route. We just assume that no packet_to_big packets
 928          * are send back when pmtu discovery is not active.
 929          * There is a small race when the user changes this flag in the
 930          * route, but I think that's acceptable.
 931          */
 932         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 933                 return;
 934
 935         dst->ops->update_pmtu(dst, mtu);
 936
 937         /* Something is about to be wrong... Remember soft error
 938          * for the case, if this connection will not able to recover.
 939          */
 940         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 941                 sk->sk_err_soft = EMSGSIZE;
 942
 943         mtu = dst_mtu(dst);
 944
 945         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 946             tp->pmtu_cookie > mtu) {
 947                 tcp_sync_mss(sk, mtu);
 948
 949                 /* Resend the TCP packet because it's
 950                  * clear that the old packet has been
 951                  * dropped. This is the new "fast" path mtu
 952                  * discovery.
 953                  */
 954                 tcp_simple_retransmit(sk);
 955         } /* else let the usual retransmit timer handle it */
 956 }
 957
 958 /*
 959  * This routine is called by the ICMP module when it gets some
 960  * sort of error condition.  If err < 0 then the socket should
 961  * be closed and the error returned to the user.  If err > 0
 962  * it's just the icmp type << 8 | icmp code.  After adjustment
 963  * header points to the first 8 bytes of the tcp header.  We need
 964  * to find the appropriate port.
 965  *
 966  * The locking strategy used here is very "optimistic". When
 967  * someone else accesses the socket the ICMP is just dropped
 968  * and for some paths there is no check at all.
 969  * A more general error queue to queue errors for later handling
 970  * is probably better.
 971  *
 972  */
 973
 974 void tcp_v4_err(struct sk_buff *skb, u32 info)
 975 {
 976         struct iphdr *iph = (struct iphdr *)skb->data;
 977         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 978         struct tcp_sock *tp;
 979         struct inet_sock *inet;
 980         int type = skb->h.icmph->type;
 981         int code = skb->h.icmph->code;
 982         struct sock *sk;
 983         __u32 seq;
 984         int err;
 985
 986         if (skb->len < (iph->ihl << 2) + 8) {
 987                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 988                 return;
 989         }
 990
 991         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
 992                            th->source, tcp_v4_iif(skb));
 993         if (!sk) {
 994                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 995                 return;
 996         }
 997         if (sk->sk_state == TCP_TIME_WAIT) {
 998                 tcp_tw_put((struct tcp_tw_bucket *)sk);
 999                 return;
1000         }
1001
1002         bh_lock_sock(sk);
1003         /* If too many ICMPs get dropped on busy
1004          * servers this needs to be solved differently.
1005          */
1006         if (sock_owned_by_user(sk))
1007                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1008
1009         if (sk->sk_state == TCP_CLOSE)
1010                 goto out;
1011
1012         tp = tcp_sk(sk);
1013         seq = ntohl(th->seq);
1014         if (sk->sk_state != TCP_LISTEN &&
1015             !between(seq, tp->snd_una, tp->snd_nxt)) {
1016                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1017                 goto out;
1018         }
1019
1020         switch (type) {
1021         case ICMP_SOURCE_QUENCH:
1022                 /* Just silently ignore these. */
1023                 goto out;
1024         case ICMP_PARAMETERPROB:
1025                 err = EPROTO;
1026                 break;
1027         case ICMP_DEST_UNREACH:
1028                 if (code > NR_ICMP_UNREACH)
1029                         goto out;
1030
1031                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1032                         if (!sock_owned_by_user(sk))
1033                                 do_pmtu_discovery(sk, iph, info);
1034                         goto out;
1035                 }
1036
1037                 err = icmp_err_convert[code].errno;
1038                 break;
1039         case ICMP_TIME_EXCEEDED:
1040                 err = EHOSTUNREACH;
1041                 break;
1042         default:
1043                 goto out;
1044         }
1045
1046         switch (sk->sk_state) {
1047                 struct request_sock *req, **prev;
1048         case TCP_LISTEN:
1049                 if (sock_owned_by_user(sk))
1050                         goto out;
1051
1052                 req = tcp_v4_search_req(tp, &prev, th->dest,
1053                                         iph->daddr, iph->saddr);
1054                 if (!req)
1055                         goto out;
1056
1057                 /* ICMPs are not backlogged, hence we cannot get
1058                    an established socket here.
1059                  */
1060                 BUG_TRAP(!req->sk);
1061
1062                 if (seq != tcp_rsk(req)->snt_isn) {
1063                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1064                         goto out;
1065                 }
1066
1067                 /*
1068                  * Still in SYN_RECV, just remove it silently.
1069                  * There is no good way to pass the error to the newly
1070                  * created socket, and POSIX does not want network
1071                  * errors returned from accept().
1072                  */
1073                 tcp_synq_drop(sk, req, prev);
1074                 goto out;
1075
1076         case TCP_SYN_SENT:
1077         case TCP_SYN_RECV:  /* Cannot happen.
1078                                It can f.e. if SYNs crossed.
1079                              */
1080                 if (!sock_owned_by_user(sk)) {
1081                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1082                         sk->sk_err = err;
1083
1084                         sk->sk_error_report(sk);
1085
1086                         tcp_done(sk);
1087                 } else {
1088                         sk->sk_err_soft = err;
1089                 }
1090                 goto out;
1091         }
1092
1093         /* If we've already connected we will keep trying
1094          * until we time out, or the user gives up.
1095          *
1096          * rfc1122 4.2.3.9 allows to consider as hard errors
1097          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1098          * but it is obsoleted by pmtu discovery).
1099          *
1100          * Note, that in modern internet, where routing is unreliable
1101          * and in each dark corner broken firewalls sit, sending random
1102          * errors ordered by their masters even this two messages finally lose
1103          * their original sense (even Linux sends invalid PORT_UNREACHs)
1104          *
1105          * Now we are in compliance with RFCs.
1106          *                                                      --ANK (980905)
1107          */
1108
1109         inet = inet_sk(sk);
1110         if (!sock_owned_by_user(sk) && inet->recverr) {
1111                 sk->sk_err = err;
1112                 sk->sk_error_report(sk);
1113         } else  { /* Only an error on timeout */
1114                 sk->sk_err_soft = err;
1115         }
1116
1117 out:
1118         bh_unlock_sock(sk);
1119         sock_put(sk);
1120 }
1121
1122 /* This routine computes an IPv4 TCP checksum. */
1123 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1124                        struct sk_buff *skb)
1125 {
1126         struct inet_sock *inet = inet_sk(sk);
1127
1128         if (skb->ip_summed == CHECKSUM_HW) {
1129                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1130                 skb->csum = offsetof(struct tcphdr, check);
1131         } else {
1132                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1133                                          csum_partial((char *)th,
1134                                                       th->doff << 2,
1135                                                       skb->csum));
1136         }
1137 }
1138
1139 /*
1140  *      This routine will send an RST to the other tcp.
1141  *
1142  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1143  *                    for reset.
1144  *      Answer: if a packet caused RST, it is not for a socket
1145  *              existing in our system, if it is matched to a socket,
1146  *              it is just duplicate segment or bug in other side's TCP.
1147  *              So that we build reply only basing on parameters
1148  *              arrived with segment.
1149  *      Exception: precedence violation. We do not implement it in any case.
1150  */
1151
1152 static void tcp_v4_send_reset(struct sk_buff *skb)
1153 {
1154         struct tcphdr *th = skb->h.th;
1155         struct tcphdr rth;
1156         struct ip_reply_arg arg;
1157
1158         /* Never send a reset in response to a reset. */
1159         if (th->rst)
1160                 return;
1161
1162         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1163                 return;
1164
1165         /* Swap the send and the receive. */
1166         memset(&rth, 0, sizeof(struct tcphdr));
1167         rth.dest   = th->source;
1168         rth.source = th->dest;
1169         rth.doff   = sizeof(struct tcphdr) / 4;
1170         rth.rst    = 1;
1171
1172         if (th->ack) {
1173                 rth.seq = th->ack_seq;
1174         } else {
1175                 rth.ack = 1;
1176                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1177                                     skb->len - (th->doff << 2));
1178         }
1179
1180         memset(&arg, 0, sizeof arg);
1181         arg.iov[0].iov_base = (unsigned char *)&rth;
1182         arg.iov[0].iov_len  = sizeof rth;
1183         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1184                                       skb->nh.iph->saddr, /*XXX*/
1185                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1186         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1187
1188         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1189
1190         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1191         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1192 }
1193
1194 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1195    outside socket context is ugly, certainly. What can I do?
1196  */
1197
1198 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1199                             u32 win, u32 ts)
1200 {
1201         struct tcphdr *th = skb->h.th;
1202         struct {
1203                 struct tcphdr th;
1204                 u32 tsopt[3];
1205         } rep;
1206         struct ip_reply_arg arg;
1207
1208         memset(&rep.th, 0, sizeof(struct tcphdr));
1209         memset(&arg, 0, sizeof arg);
1210
1211         arg.iov[0].iov_base = (unsigned char *)&rep;
1212         arg.iov[0].iov_len  = sizeof(rep.th);
1213         if (ts) {
1214                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1215                                      (TCPOPT_TIMESTAMP << 8) |
1216                                      TCPOLEN_TIMESTAMP);
1217                 rep.tsopt[1] = htonl(tcp_time_stamp);
1218                 rep.tsopt[2] = htonl(ts);
1219                 arg.iov[0].iov_len = sizeof(rep);
1220         }
1221
1222         /* Swap the send and the receive. */
1223         rep.th.dest    = th->source;
1224         rep.th.source  = th->dest;
1225         rep.th.doff    = arg.iov[0].iov_len / 4;
1226         rep.th.seq     = htonl(seq);
1227         rep.th.ack_seq = htonl(ack);
1228         rep.th.ack     = 1;
1229         rep.th.window  = htons(win);
1230
1231         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1232                                       skb->nh.iph->saddr, /*XXX*/
1233                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1234         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1235
1236         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1237
1238         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1239 }
1240
1241 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1242 {
1243         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1244
1245         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1246                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1247
1248         tcp_tw_put(tw);
1249 }
1250
1251 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1252 {
1253         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1254                         req->ts_recent);
1255 }
1256
1257 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1258                                           struct request_sock *req)
1259 {
1260         struct rtable *rt;
1261         const struct inet_request_sock *ireq = inet_rsk(req);
1262         struct ip_options *opt = inet_rsk(req)->opt;
1263         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1264                             .nl_u = { .ip4_u =
1265                                       { .daddr = ((opt && opt->srr) ?
1266                                                   opt->faddr :
1267                                                   ireq->rmt_addr),
1268                                         .saddr = ireq->loc_addr,
1269                                         .tos = RT_CONN_FLAGS(sk) } },
1270                             .proto = IPPROTO_TCP,
1271                             .uli_u = { .ports =
1272                                        { .sport = inet_sk(sk)->sport,
1273                                          .dport = ireq->rmt_port } } };
1274
1275         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1276                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1277                 return NULL;
1278         }
1279         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1280                 ip_rt_put(rt);
1281                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1282                 return NULL;
1283         }
1284         return &rt->u.dst;
1285 }
1286
1287 /*
1288  *      Send a SYN-ACK after having received an ACK.
1289  *      This still operates on a request_sock only, not on a big
1290  *      socket.
1291  */
1292 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1293                               struct dst_entry *dst)
1294 {
1295         const struct inet_request_sock *ireq = inet_rsk(req);
1296         int err = -1;
1297         struct sk_buff * skb;
1298
1299         /* First, grab a route. */
1300         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1301                 goto out;
1302
1303         skb = tcp_make_synack(sk, dst, req);
1304
1305         if (skb) {
1306                 struct tcphdr *th = skb->h.th;
1307
1308                 th->check = tcp_v4_check(th, skb->len,
1309                                          ireq->loc_addr,
1310                                          ireq->rmt_addr,
1311                                          csum_partial((char *)th, skb->len,
1312                                                       skb->csum));
1313
1314                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1315                                             ireq->rmt_addr,
1316                                             ireq->opt);
1317                 if (err == NET_XMIT_CN)
1318                         err = 0;
1319         }
1320
1321 out:
1322         dst_release(dst);
1323         return err;
1324 }
1325
1326 /*
1327  *      IPv4 request_sock destructor.
1328  */
1329 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1330 {
1331         if (inet_rsk(req)->opt)
1332                 kfree(inet_rsk(req)->opt);
1333 }
1334
1335 static inline void syn_flood_warning(struct sk_buff *skb)
1336 {
1337         static unsigned long warntime;
1338
1339         if (time_after(jiffies, (warntime + HZ * 60))) {
1340                 warntime = jiffies;
1341                 printk(KERN_INFO
1342                        "possible SYN flooding on port %d. Sending cookies.\n",
1343                        ntohs(skb->h.th->dest));
1344         }
1345 }
1346
1347 /*
1348  * Save and compile IPv4 options into the request_sock if needed.
1349  */
1350 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1351                                                      struct sk_buff *skb)
1352 {
1353         struct ip_options *opt = &(IPCB(skb)->opt);
1354         struct ip_options *dopt = NULL;
1355
1356         if (opt && opt->optlen) {
1357                 int opt_size = optlength(opt);
1358                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1359                 if (dopt) {
1360                         if (ip_options_echo(dopt, skb)) {
1361                                 kfree(dopt);
1362                                 dopt = NULL;
1363                         }
1364                 }
1365         }
1366         return dopt;
1367 }
1368
1369 struct request_sock_ops tcp_request_sock_ops = {
1370         .family         =       PF_INET,
1371         .obj_size       =       sizeof(struct tcp_request_sock),
1372         .rtx_syn_ack    =       tcp_v4_send_synack,
1373         .send_ack       =       tcp_v4_reqsk_send_ack,
1374         .destructor     =       tcp_v4_reqsk_destructor,
1375         .send_reset     =       tcp_v4_send_reset,
1376 };
1377
1378 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1379 {
1380         struct inet_request_sock *ireq;
1381         struct tcp_options_received tmp_opt;
1382         struct request_sock *req;
1383         __u32 saddr = skb->nh.iph->saddr;
1384         __u32 daddr = skb->nh.iph->daddr;
1385         __u32 isn = TCP_SKB_CB(skb)->when;
1386         struct dst_entry *dst = NULL;
1387 #ifdef CONFIG_SYN_COOKIES
1388         int want_cookie = 0;
1389 #else
1390 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1391 #endif
1392
1393         /* Never answer to SYNs send to broadcast or multicast */
1394         if (((struct rtable *)skb->dst)->rt_flags &
1395             (RTCF_BROADCAST | RTCF_MULTICAST))
1396                 goto drop;
1397
1398         /* TW buckets are converted to open requests without
1399          * limitations, they conserve resources and peer is
1400          * evidently real one.
1401          */
1402         if (tcp_synq_is_full(sk) && !isn) {
1403 #ifdef CONFIG_SYN_COOKIES
1404                 if (sysctl_tcp_syncookies) {
1405                         want_cookie = 1;
1406                 } else
1407 #endif
1408                 goto drop;
1409         }
1410
1411         /* Accept backlog is full. If we have already queued enough
1412          * of warm entries in syn queue, drop request. It is better than
1413          * clogging syn queue with openreqs with exponentially increasing
1414          * timeout.
1415          */
1416         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1417                 goto drop;
1418
1419         req = reqsk_alloc(&tcp_request_sock_ops);
1420         if (!req)
1421                 goto drop;
1422
1423         tcp_clear_options(&tmp_opt);
1424         tmp_opt.mss_clamp = 536;
1425         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1426
1427         tcp_parse_options(skb, &tmp_opt, 0);
1428
1429         if (want_cookie) {
1430                 tcp_clear_options(&tmp_opt);
1431                 tmp_opt.saw_tstamp = 0;
1432         }
1433
1434         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1435                 /* Some OSes (unknown ones, but I see them on web server, which
1436                  * contains information interesting only for windows'
1437                  * users) do not send their stamp in SYN. It is easy case.
1438                  * We simply do not advertise TS support.
1439                  */
1440                 tmp_opt.saw_tstamp = 0;
1441                 tmp_opt.tstamp_ok  = 0;
1442         }
1443         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1444
1445         tcp_openreq_init(req, &tmp_opt, skb);
1446
1447         ireq = inet_rsk(req);
1448         ireq->loc_addr = daddr;
1449         ireq->rmt_addr = saddr;
1450         ireq->opt = tcp_v4_save_options(sk, skb);
1451         if (!want_cookie)
1452                 TCP_ECN_create_request(req, skb->h.th);
1453
1454         if (want_cookie) {
1455 #ifdef CONFIG_SYN_COOKIES
1456                 syn_flood_warning(skb);
1457 #endif
1458                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1459         } else if (!isn) {
1460                 struct inet_peer *peer = NULL;
1461
1462                 /* VJ's idea. We save last timestamp seen
1463                  * from the destination in peer table, when entering
1464                  * state TIME-WAIT, and check against it before
1465                  * accepting new connection request.
1466                  *
1467                  * If "isn" is not zero, this request hit alive
1468                  * timewait bucket, so that all the necessary checks
1469                  * are made in the function processing timewait state.
1470                  */
1471                 if (tmp_opt.saw_tstamp &&
1472                     sysctl_tcp_tw_recycle &&
1473                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1474                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1475                     peer->v4daddr == saddr) {
1476                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1477                             (s32)(peer->tcp_ts - req->ts_recent) >
1478                                                         TCP_PAWS_WINDOW) {
1479                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1480                                 dst_release(dst);
1481                                 goto drop_and_free;
1482                         }
1483                 }
1484                 /* Kill the following clause, if you dislike this way. */
1485                 else if (!sysctl_tcp_syncookies &&
1486                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1487                           (sysctl_max_syn_backlog >> 2)) &&
1488                          (!peer || !peer->tcp_ts_stamp) &&
1489                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1490                         /* Without syncookies last quarter of
1491                          * backlog is filled with destinations,
1492                          * proven to be alive.
1493                          * It means that we continue to communicate
1494                          * to destinations, already remembered
1495                          * to the moment of synflood.
1496                          */
1497                         NETDEBUG(if (net_ratelimit()) \
1498                                         printk(KERN_DEBUG "TCP: drop open "
1499                                                           "request from %u.%u."
1500                                                           "%u.%u/%u\n", \
1501                                                NIPQUAD(saddr),
1502                                                ntohs(skb->h.th->source)));
1503                         dst_release(dst);
1504                         goto drop_and_free;
1505                 }
1506
1507                 isn = tcp_v4_init_sequence(sk, skb);
1508         }
1509         tcp_rsk(req)->snt_isn = isn;
1510
1511         if (tcp_v4_send_synack(sk, req, dst))
1512                 goto drop_and_free;
1513
1514         if (want_cookie) {
1515                 reqsk_free(req);
1516         } else {
1517                 tcp_v4_synq_add(sk, req);
1518         }
1519         return 0;
1520
1521 drop_and_free:
1522         reqsk_free(req);
1523 drop:
1524         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1525         return 0;
1526 }
1527
1528
1529 /*
1530  * The three way handshake has completed - we got a valid synack -
1531  * now create the new socket.
1532  */
1533 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1534                                   struct request_sock *req,
1535                                   struct dst_entry *dst)
1536 {
1537         struct inet_request_sock *ireq;
1538         struct inet_sock *newinet;
1539         struct tcp_sock *newtp;
1540         struct sock *newsk;
1541
1542         if (sk_acceptq_is_full(sk))
1543                 goto exit_overflow;
1544
1545         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1546                 goto exit;
1547
1548         newsk = tcp_create_openreq_child(sk, req, skb);
1549         if (!newsk)
1550                 goto exit;
1551
1552         newsk->sk_dst_cache = dst;
1553         tcp_v4_setup_caps(newsk, dst);
1554
1555         newtp                 = tcp_sk(newsk);
1556         newinet               = inet_sk(newsk);
1557         ireq                  = inet_rsk(req);
1558         newinet->daddr        = ireq->rmt_addr;
1559         newinet->rcv_saddr    = ireq->loc_addr;
1560         newinet->saddr        = ireq->loc_addr;
1561         newinet->opt          = ireq->opt;
1562         ireq->opt             = NULL;
1563         newinet->mc_index     = tcp_v4_iif(skb);
1564         newinet->mc_ttl       = skb->nh.iph->ttl;
1565         newtp->ext_header_len = 0;
1566         if (newinet->opt)
1567                 newtp->ext_header_len = newinet->opt->optlen;
1568         newinet->id = newtp->write_seq ^ jiffies;
1569
1570         tcp_sync_mss(newsk, dst_mtu(dst));
1571         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1572         tcp_initialize_rcv_mss(newsk);
1573
1574         __tcp_v4_hash(newsk, 0);
1575         __tcp_inherit_port(sk, newsk);
1576
1577         return newsk;
1578
1579 exit_overflow:
1580         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1581 exit:
1582         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1583         dst_release(dst);
1584         return NULL;
1585 }
1586
1587 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1588 {
1589         struct tcphdr *th = skb->h.th;
1590         struct iphdr *iph = skb->nh.iph;
1591         struct tcp_sock *tp = tcp_sk(sk);
1592         struct sock *nsk;
1593         struct request_sock **prev;
1594         /* Find possible connection requests. */
1595         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1596                                                      iph->saddr, iph->daddr);
1597         if (req)
1598                 return tcp_check_req(sk, skb, req, prev);
1599
1600         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1601                                           th->source,
1602                                           skb->nh.iph->daddr,
1603                                           ntohs(th->dest),
1604                                           tcp_v4_iif(skb));
1605
1606         if (nsk) {
1607                 if (nsk->sk_state != TCP_TIME_WAIT) {
1608                         bh_lock_sock(nsk);
1609                         return nsk;
1610                 }
1611                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1612                 return NULL;
1613         }
1614
1615 #ifdef CONFIG_SYN_COOKIES
1616         if (!th->rst && !th->syn && th->ack)
1617                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1618 #endif
1619         return sk;
1620 }
1621
1622 static int tcp_v4_checksum_init(struct sk_buff *skb)
1623 {
1624         if (skb->ip_summed == CHECKSUM_HW) {
1625                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1626                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1627                                   skb->nh.iph->daddr, skb->csum))
1628                         return 0;
1629
1630                 NETDEBUG(if (net_ratelimit())
1631                                 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1632                 skb->ip_summed = CHECKSUM_NONE;
1633         }
1634         if (skb->len <= 76) {
1635                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1636                                  skb->nh.iph->daddr,
1637                                  skb_checksum(skb, 0, skb->len, 0)))
1638                         return -1;
1639                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1640         } else {
1641                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1642                                           skb->nh.iph->saddr,
1643                                           skb->nh.iph->daddr, 0);
1644         }
1645         return 0;
1646 }
1647
1648
1649 /* The socket must have it's spinlock held when we get
1650  * here.
1651  *
1652  * We have a potential double-lock case here, so even when
1653  * doing backlog processing we use the BH locking scheme.
1654  * This is because we cannot sleep with the original spinlock
1655  * held.
1656  */
1657 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1658 {
1659         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1660                 TCP_CHECK_TIMER(sk);
1661                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1662                         goto reset;
1663                 TCP_CHECK_TIMER(sk);
1664                 return 0;
1665         }
1666
1667         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1668                 goto csum_err;
1669
1670         if (sk->sk_state == TCP_LISTEN) {
1671                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1672                 if (!nsk)
1673                         goto discard;
1674
1675                 if (nsk != sk) {
1676                         if (tcp_child_process(sk, nsk, skb))
1677                                 goto reset;
1678                         return 0;
1679                 }
1680         }
1681
1682         TCP_CHECK_TIMER(sk);
1683         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1684                 goto reset;
1685         TCP_CHECK_TIMER(sk);
1686         return 0;
1687
1688 reset:
1689         tcp_v4_send_reset(skb);
1690 discard:
1691         kfree_skb(skb);
1692         /* Be careful here. If this function gets more complicated and
1693          * gcc suffers from register pressure on the x86, sk (in %ebx)
1694          * might be destroyed here. This current version compiles correctly,
1695          * but you have been warned.
1696          */
1697         return 0;
1698
1699 csum_err:
1700         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1701         goto discard;
1702 }
1703
1704 /*
1705  *      From tcp_input.c
1706  */
1707
1708 int tcp_v4_rcv(struct sk_buff *skb)
1709 {
1710         struct tcphdr *th;
1711         struct sock *sk;
1712         int ret;
1713
1714         if (skb->pkt_type != PACKET_HOST)
1715                 goto discard_it;
1716
1717         /* Count it even if it's bad */
1718         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1719
1720         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1721                 goto discard_it;
1722
1723         th = skb->h.th;
1724
1725         if (th->doff < sizeof(struct tcphdr) / 4)
1726                 goto bad_packet;
1727         if (!pskb_may_pull(skb, th->doff * 4))
1728                 goto discard_it;
1729
1730         /* An explanation is required here, I think.
1731          * Packet length and doff are validated by header prediction,
1732          * provided case of th->doff==0 is elimineted.
1733          * So, we defer the checks. */
1734         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1735              tcp_v4_checksum_init(skb) < 0))
1736                 goto bad_packet;
1737
1738         th = skb->h.th;
1739         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1740         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1741                                     skb->len - th->doff * 4);
1742         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1743         TCP_SKB_CB(skb)->when    = 0;
1744         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1745         TCP_SKB_CB(skb)->sacked  = 0;
1746
1747         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1748                              skb->nh.iph->daddr, ntohs(th->dest),
1749                              tcp_v4_iif(skb));
1750
1751         if (!sk)
1752                 goto no_tcp_socket;
1753
1754 process:
1755         if (sk->sk_state == TCP_TIME_WAIT)
1756                 goto do_time_wait;
1757
1758         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1759                 goto discard_and_relse;
1760
1761         if (sk_filter(sk, skb, 0))
1762                 goto discard_and_relse;
1763
1764         skb->dev = NULL;
1765
1766         bh_lock_sock(sk);
1767         ret = 0;
1768         if (!sock_owned_by_user(sk)) {
1769                 if (!tcp_prequeue(sk, skb))
1770                         ret = tcp_v4_do_rcv(sk, skb);
1771         } else
1772                 sk_add_backlog(sk, skb);
1773         bh_unlock_sock(sk);
1774
1775         sock_put(sk);
1776
1777         return ret;
1778
1779 no_tcp_socket:
1780         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1781                 goto discard_it;
1782
1783         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1784 bad_packet:
1785                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1786         } else {
1787                 tcp_v4_send_reset(skb);
1788         }
1789
1790 discard_it:
1791         /* Discard frame. */
1792         kfree_skb(skb);
1793         return 0;
1794
1795 discard_and_relse:
1796         sock_put(sk);
1797         goto discard_it;
1798
1799 do_time_wait:
1800         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1801                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1802                 goto discard_it;
1803         }
1804
1805         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1806                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1807                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1808                 goto discard_it;
1809         }
1810         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1811                                            skb, th, skb->len)) {
1812         case TCP_TW_SYN: {
1813                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1814                                                           ntohs(th->dest),
1815                                                           tcp_v4_iif(skb));
1816                 if (sk2) {
1817                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1818                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1819                         sk = sk2;
1820                         goto process;
1821                 }
1822                 /* Fall through to ACK */
1823         }
1824         case TCP_TW_ACK:
1825                 tcp_v4_timewait_ack(sk, skb);
1826                 break;
1827         case TCP_TW_RST:
1828                 goto no_tcp_socket;
1829         case TCP_TW_SUCCESS:;
1830         }
1831         goto discard_it;
1832 }
1833
1834 /* With per-bucket locks this operation is not-atomic, so that
1835  * this version is not worse.
1836  */
1837 static void __tcp_v4_rehash(struct sock *sk)
1838 {
1839         sk->sk_prot->unhash(sk);
1840         sk->sk_prot->hash(sk);
1841 }
1842
1843 static int tcp_v4_reselect_saddr(struct sock *sk)
1844 {
1845         struct inet_sock *inet = inet_sk(sk);
1846         int err;
1847         struct rtable *rt;
1848         __u32 old_saddr = inet->saddr;
1849         __u32 new_saddr;
1850         __u32 daddr = inet->daddr;
1851
1852         if (inet->opt && inet->opt->srr)
1853                 daddr = inet->opt->faddr;
1854
1855         /* Query new route. */
1856         err = ip_route_connect(&rt, daddr, 0,
1857                                RT_CONN_FLAGS(sk),
1858                                sk->sk_bound_dev_if,
1859                                IPPROTO_TCP,
1860                                inet->sport, inet->dport, sk);
1861         if (err)
1862                 return err;
1863
1864         __sk_dst_set(sk, &rt->u.dst);
1865         tcp_v4_setup_caps(sk, &rt->u.dst);
1866
1867         new_saddr = rt->rt_src;
1868
1869         if (new_saddr == old_saddr)
1870                 return 0;
1871
1872         if (sysctl_ip_dynaddr > 1) {
1873                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1874                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1875                        NIPQUAD(old_saddr),
1876                        NIPQUAD(new_saddr));
1877         }
1878
1879         inet->saddr = new_saddr;
1880         inet->rcv_saddr = new_saddr;
1881
1882         /* XXX The only one ugly spot where we need to
1883          * XXX really change the sockets identity after
1884          * XXX it has entered the hashes. -DaveM
1885          *
1886          * Besides that, it does not check for connection
1887          * uniqueness. Wait for troubles.
1888          */
1889         __tcp_v4_rehash(sk);
1890         return 0;
1891 }
1892
1893 int tcp_v4_rebuild_header(struct sock *sk)
1894 {
1895         struct inet_sock *inet = inet_sk(sk);
1896         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1897         u32 daddr;
1898         int err;
1899
1900         /* Route is OK, nothing to do. */
1901         if (rt)
1902                 return 0;
1903
1904         /* Reroute. */
1905         daddr = inet->daddr;
1906         if (inet->opt && inet->opt->srr)
1907                 daddr = inet->opt->faddr;
1908
1909         {
1910                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1911                                     .nl_u = { .ip4_u =
1912                                               { .daddr = daddr,
1913                                                 .saddr = inet->saddr,
1914                                                 .tos = RT_CONN_FLAGS(sk) } },
1915                                     .proto = IPPROTO_TCP,
1916                                     .uli_u = { .ports =
1917                                                { .sport = inet->sport,
1918                                                  .dport = inet->dport } } };
1919
1920                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1921         }
1922         if (!err) {
1923                 __sk_dst_set(sk, &rt->u.dst);
1924                 tcp_v4_setup_caps(sk, &rt->u.dst);
1925                 return 0;
1926         }
1927
1928         /* Routing failed... */
1929         sk->sk_route_caps = 0;
1930
1931         if (!sysctl_ip_dynaddr ||
1932             sk->sk_state != TCP_SYN_SENT ||
1933             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1934             (err = tcp_v4_reselect_saddr(sk)) != 0)
1935                 sk->sk_err_soft = -err;
1936
1937         return err;
1938 }
1939
1940 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1941 {
1942         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1943         struct inet_sock *inet = inet_sk(sk);
1944
1945         sin->sin_family         = AF_INET;
1946         sin->sin_addr.s_addr    = inet->daddr;
1947         sin->sin_port           = inet->dport;
1948 }
1949
1950 /* VJ's idea. Save last timestamp seen from this destination
1951  * and hold it at least for normal timewait interval to use for duplicate
1952  * segment detection in subsequent connections, before they enter synchronized
1953  * state.
1954  */
1955
1956 int tcp_v4_remember_stamp(struct sock *sk)
1957 {
1958         struct inet_sock *inet = inet_sk(sk);
1959         struct tcp_sock *tp = tcp_sk(sk);
1960         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1961         struct inet_peer *peer = NULL;
1962         int release_it = 0;
1963
1964         if (!rt || rt->rt_dst != inet->daddr) {
1965                 peer = inet_getpeer(inet->daddr, 1);
1966                 release_it = 1;
1967         } else {
1968                 if (!rt->peer)
1969                         rt_bind_peer(rt, 1);
1970                 peer = rt->peer;
1971         }
1972
1973         if (peer) {
1974                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1975                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1976                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1977                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1978                         peer->tcp_ts = tp->rx_opt.ts_recent;
1979                 }
1980                 if (release_it)
1981                         inet_putpeer(peer);
1982                 return 1;
1983         }
1984
1985         return 0;
1986 }
1987
1988 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1989 {
1990         struct inet_peer *peer = NULL;
1991
1992         peer = inet_getpeer(tw->tw_daddr, 1);
1993
1994         if (peer) {
1995                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1996                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1997                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1998                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1999                         peer->tcp_ts = tw->tw_ts_recent;
2000                 }
2001                 inet_putpeer(peer);
2002                 return 1;
2003         }
2004
2005         return 0;
2006 }
2007
2008 struct tcp_func ipv4_specific = {
2009         .queue_xmit     =       ip_queue_xmit,
2010         .send_check     =       tcp_v4_send_check,
2011         .rebuild_header =       tcp_v4_rebuild_header,
2012         .conn_request   =       tcp_v4_conn_request,
2013         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2014         .remember_stamp =       tcp_v4_remember_stamp,
2015         .net_header_len =       sizeof(struct iphdr),
2016         .setsockopt     =       ip_setsockopt,
2017         .getsockopt     =       ip_getsockopt,
2018         .addr2sockaddr  =       v4_addr2sockaddr,
2019         .sockaddr_len   =       sizeof(struct sockaddr_in),
2020 };
2021
2022 /* NOTE: A lot of things set to zero explicitly by call to
2023  *       sk_alloc() so need not be done here.
2024  */
2025 static int tcp_v4_init_sock(struct sock *sk)
2026 {
2027         struct tcp_sock *tp = tcp_sk(sk);
2028
2029         skb_queue_head_init(&tp->out_of_order_queue);
2030         tcp_init_xmit_timers(sk);
2031         tcp_prequeue_init(tp);
2032
2033         tp->rto  = TCP_TIMEOUT_INIT;
2034         tp->mdev = TCP_TIMEOUT_INIT;
2035
2036         /* So many TCP implementations out there (incorrectly) count the
2037          * initial SYN frame in their delayed-ACK and congestion control
2038          * algorithms that we must have the following bandaid to talk
2039          * efficiently to them.  -DaveM
2040          */
2041         tp->snd_cwnd = 2;
2042
2043         /* See draft-stevens-tcpca-spec-01 for discussion of the
2044          * initialization of these values.
2045          */
2046         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2047         tp->snd_cwnd_clamp = ~0;
2048         tp->mss_cache = 536;
2049
2050         tp->reordering = sysctl_tcp_reordering;
2051         tp->ca_ops = &tcp_init_congestion_ops;
2052
2053         sk->sk_state = TCP_CLOSE;
2054
2055         sk->sk_write_space = sk_stream_write_space;
2056         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2057
2058         tp->af_specific = &ipv4_specific;
2059
2060         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2061         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2062
2063         atomic_inc(&tcp_sockets_allocated);
2064
2065         return 0;
2066 }
2067
2068 int tcp_v4_destroy_sock(struct sock *sk)
2069 {
2070         struct tcp_sock *tp = tcp_sk(sk);
2071
2072         tcp_clear_xmit_timers(sk);
2073
2074         tcp_cleanup_congestion_control(tp);
2075
2076         /* Cleanup up the write buffer. */
2077         sk_stream_writequeue_purge(sk);
2078
2079         /* Cleans up our, hopefully empty, out_of_order_queue. */
2080         __skb_queue_purge(&tp->out_of_order_queue);
2081
2082         /* Clean prequeue, it must be empty really */
2083         __skb_queue_purge(&tp->ucopy.prequeue);
2084
2085         /* Clean up a referenced TCP bind bucket. */
2086         if (tp->bind_hash)
2087                 tcp_put_port(sk);
2088
2089         /*
2090          * If sendmsg cached page exists, toss it.
2091          */
2092         if (sk->sk_sndmsg_page) {
2093                 __free_page(sk->sk_sndmsg_page);
2094                 sk->sk_sndmsg_page = NULL;
2095         }
2096
2097         atomic_dec(&tcp_sockets_allocated);
2098
2099         return 0;
2100 }
2101
2102 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2103
2104 #ifdef CONFIG_PROC_FS
2105 /* Proc filesystem TCP sock list dumping. */
2106
2107 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2108 {
2109         return hlist_empty(head) ? NULL :
2110                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2111 }
2112
2113 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2114 {
2115         return tw->tw_node.next ?
2116                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2117 }
2118
2119 static void *listening_get_next(struct seq_file *seq, void *cur)
2120 {
2121         struct tcp_sock *tp;
2122         struct hlist_node *node;
2123         struct sock *sk = cur;
2124         struct tcp_iter_state* st = seq->private;
2125
2126         if (!sk) {
2127                 st->bucket = 0;
2128                 sk = sk_head(&tcp_listening_hash[0]);
2129                 goto get_sk;
2130         }
2131
2132         ++st->num;
2133
2134         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2135                 struct request_sock *req = cur;
2136
2137                 tp = tcp_sk(st->syn_wait_sk);
2138                 req = req->dl_next;
2139                 while (1) {
2140                         while (req) {
2141                                 if (req->rsk_ops->family == st->family) {
2142                                         cur = req;
2143                                         goto out;
2144                                 }
2145                                 req = req->dl_next;
2146                         }
2147                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2148                                 break;
2149 get_req:
2150                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2151                 }
2152                 sk        = sk_next(st->syn_wait_sk);
2153                 st->state = TCP_SEQ_STATE_LISTENING;
2154                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2155         } else {
2156                 tp = tcp_sk(sk);
2157                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2158                 if (reqsk_queue_len(&tp->accept_queue))
2159                         goto start_req;
2160                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2161                 sk = sk_next(sk);
2162         }
2163 get_sk:
2164         sk_for_each_from(sk, node) {
2165                 if (sk->sk_family == st->family) {
2166                         cur = sk;
2167                         goto out;
2168                 }
2169                 tp = tcp_sk(sk);
2170                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2171                 if (reqsk_queue_len(&tp->accept_queue)) {
2172 start_req:
2173                         st->uid         = sock_i_uid(sk);
2174                         st->syn_wait_sk = sk;
2175                         st->state       = TCP_SEQ_STATE_OPENREQ;
2176                         st->sbucket     = 0;
2177                         goto get_req;
2178                 }
2179                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2180         }
2181         if (++st->bucket < TCP_LHTABLE_SIZE) {
2182                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2183                 goto get_sk;
2184         }
2185         cur = NULL;
2186 out:
2187         return cur;
2188 }
2189
2190 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2191 {
2192         void *rc = listening_get_next(seq, NULL);
2193
2194         while (rc && *pos) {
2195                 rc = listening_get_next(seq, rc);
2196                 --*pos;
2197         }
2198         return rc;
2199 }
2200
2201 static void *established_get_first(struct seq_file *seq)
2202 {
2203         struct tcp_iter_state* st = seq->private;
2204         void *rc = NULL;
2205
2206         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2207                 struct sock *sk;
2208                 struct hlist_node *node;
2209                 struct tcp_tw_bucket *tw;
2210
2211                 /* We can reschedule _before_ having picked the target: */
2212                 cond_resched_softirq();
2213
2214                 read_lock(&tcp_ehash[st->bucket].lock);
2215                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2216                         if (sk->sk_family != st->family) {
2217                                 continue;
2218                         }
2219                         rc = sk;
2220                         goto out;
2221                 }
2222                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2223                 tw_for_each(tw, node,
2224                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2225                         if (tw->tw_family != st->family) {
2226                                 continue;
2227                         }
2228                         rc = tw;
2229                         goto out;
2230                 }
2231                 read_unlock(&tcp_ehash[st->bucket].lock);
2232                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2233         }
2234 out:
2235         return rc;
2236 }
2237
2238 static void *established_get_next(struct seq_file *seq, void *cur)
2239 {
2240         struct sock *sk = cur;
2241         struct tcp_tw_bucket *tw;
2242         struct hlist_node *node;
2243         struct tcp_iter_state* st = seq->private;
2244
2245         ++st->num;
2246
2247         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2248                 tw = cur;
2249                 tw = tw_next(tw);
2250 get_tw:
2251                 while (tw && tw->tw_family != st->family) {
2252                         tw = tw_next(tw);
2253                 }
2254                 if (tw) {
2255                         cur = tw;
2256                         goto out;
2257                 }
2258                 read_unlock(&tcp_ehash[st->bucket].lock);
2259                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2260
2261                 /* We can reschedule between buckets: */
2262                 cond_resched_softirq();
2263
2264                 if (++st->bucket < tcp_ehash_size) {
2265                         read_lock(&tcp_ehash[st->bucket].lock);
2266                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2267                 } else {
2268                         cur = NULL;
2269                         goto out;
2270                 }
2271         } else
2272                 sk = sk_next(sk);
2273
2274         sk_for_each_from(sk, node) {
2275                 if (sk->sk_family == st->family)
2276                         goto found;
2277         }
2278
2279         st->state = TCP_SEQ_STATE_TIME_WAIT;
2280         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2281         goto get_tw;
2282 found:
2283         cur = sk;
2284 out:
2285         return cur;
2286 }
2287
2288 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2289 {
2290         void *rc = established_get_first(seq);
2291
2292         while (rc && pos) {
2293                 rc = established_get_next(seq, rc);
2294                 --pos;
2295         }
2296         return rc;
2297 }
2298
2299 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2300 {
2301         void *rc;
2302         struct tcp_iter_state* st = seq->private;
2303
2304         tcp_listen_lock();
2305         st->state = TCP_SEQ_STATE_LISTENING;
2306         rc        = listening_get_idx(seq, &pos);
2307
2308         if (!rc) {
2309                 tcp_listen_unlock();
2310                 local_bh_disable();
2311                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2312                 rc        = established_get_idx(seq, pos);
2313         }
2314
2315         return rc;
2316 }
2317
2318 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2319 {
2320         struct tcp_iter_state* st = seq->private;
2321         st->state = TCP_SEQ_STATE_LISTENING;
2322         st->num = 0;
2323         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2324 }
2325
2326 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2327 {
2328         void *rc = NULL;
2329         struct tcp_iter_state* st;
2330
2331         if (v == SEQ_START_TOKEN) {
2332                 rc = tcp_get_idx(seq, 0);
2333                 goto out;
2334         }
2335         st = seq->private;
2336
2337         switch (st->state) {
2338         case TCP_SEQ_STATE_OPENREQ:
2339         case TCP_SEQ_STATE_LISTENING:
2340                 rc = listening_get_next(seq, v);
2341                 if (!rc) {
2342                         tcp_listen_unlock();
2343                         local_bh_disable();
2344                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2345                         rc        = established_get_first(seq);
2346                 }
2347                 break;
2348         case TCP_SEQ_STATE_ESTABLISHED:
2349         case TCP_SEQ_STATE_TIME_WAIT:
2350                 rc = established_get_next(seq, v);
2351                 break;
2352         }
2353 out:
2354         ++*pos;
2355         return rc;
2356 }
2357
2358 static void tcp_seq_stop(struct seq_file *seq, void *v)
2359 {
2360         struct tcp_iter_state* st = seq->private;
2361
2362         switch (st->state) {
2363         case TCP_SEQ_STATE_OPENREQ:
2364                 if (v) {
2365                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2366                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2367                 }
2368         case TCP_SEQ_STATE_LISTENING:
2369                 if (v != SEQ_START_TOKEN)
2370                         tcp_listen_unlock();
2371                 break;
2372         case TCP_SEQ_STATE_TIME_WAIT:
2373         case TCP_SEQ_STATE_ESTABLISHED:
2374                 if (v)
2375                         read_unlock(&tcp_ehash[st->bucket].lock);
2376                 local_bh_enable();
2377                 break;
2378         }
2379 }
2380
2381 static int tcp_seq_open(struct inode *inode, struct file *file)
2382 {
2383         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2384         struct seq_file *seq;
2385         struct tcp_iter_state *s;
2386         int rc;
2387
2388         if (unlikely(afinfo == NULL))
2389                 return -EINVAL;
2390
2391         s = kmalloc(sizeof(*s), GFP_KERNEL);
2392         if (!s)
2393                 return -ENOMEM;
2394         memset(s, 0, sizeof(*s));
2395         s->family               = afinfo->family;
2396         s->seq_ops.start        = tcp_seq_start;
2397         s->seq_ops.next         = tcp_seq_next;
2398         s->seq_ops.show         = afinfo->seq_show;
2399         s->seq_ops.stop         = tcp_seq_stop;
2400
2401         rc = seq_open(file, &s->seq_ops);
2402         if (rc)
2403                 goto out_kfree;
2404         seq          = file->private_data;
2405         seq->private = s;
2406 out:
2407         return rc;
2408 out_kfree:
2409         kfree(s);
2410         goto out;
2411 }
2412
2413 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2414 {
2415         int rc = 0;
2416         struct proc_dir_entry *p;
2417
2418         if (!afinfo)
2419                 return -EINVAL;
2420         afinfo->seq_fops->owner         = afinfo->owner;
2421         afinfo->seq_fops->open          = tcp_seq_open;
2422         afinfo->seq_fops->read          = seq_read;
2423         afinfo->seq_fops->llseek        = seq_lseek;
2424         afinfo->seq_fops->release       = seq_release_private;
2425
2426         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2427         if (p)
2428                 p->data = afinfo;
2429         else
2430                 rc = -ENOMEM;
2431         return rc;
2432 }
2433
2434 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2435 {
2436         if (!afinfo)
2437                 return;
2438         proc_net_remove(afinfo->name);
2439         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2440 }
2441
2442 static void get_openreq4(struct sock *sk, struct request_sock *req,
2443                          char *tmpbuf, int i, int uid)
2444 {
2445         const struct inet_request_sock *ireq = inet_rsk(req);
2446         int ttd = req->expires - jiffies;
2447
2448         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2449                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2450                 i,
2451                 ireq->loc_addr,
2452                 ntohs(inet_sk(sk)->sport),
2453                 ireq->rmt_addr,
2454                 ntohs(ireq->rmt_port),
2455                 TCP_SYN_RECV,
2456                 0, 0, /* could print option size, but that is af dependent. */
2457                 1,    /* timers active (only the expire timer) */
2458                 jiffies_to_clock_t(ttd),
2459                 req->retrans,
2460                 uid,
2461                 0,  /* non standard timer */
2462                 0, /* open_requests have no inode */
2463                 atomic_read(&sk->sk_refcnt),
2464                 req);
2465 }
2466
2467 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2468 {
2469         int timer_active;
2470         unsigned long timer_expires;
2471         struct tcp_sock *tp = tcp_sk(sp);
2472         struct inet_sock *inet = inet_sk(sp);
2473         unsigned int dest = inet->daddr;
2474         unsigned int src = inet->rcv_saddr;
2475         __u16 destp = ntohs(inet->dport);
2476         __u16 srcp = ntohs(inet->sport);
2477
2478         if (tp->pending == TCP_TIME_RETRANS) {
2479                 timer_active    = 1;
2480                 timer_expires   = tp->timeout;
2481         } else if (tp->pending == TCP_TIME_PROBE0) {
2482                 timer_active    = 4;
2483                 timer_expires   = tp->timeout;
2484         } else if (timer_pending(&sp->sk_timer)) {
2485                 timer_active    = 2;
2486                 timer_expires   = sp->sk_timer.expires;
2487         } else {
2488                 timer_active    = 0;
2489                 timer_expires = jiffies;
2490         }
2491
2492         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2493                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2494                 i, src, srcp, dest, destp, sp->sk_state,
2495                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2496                 timer_active,
2497                 jiffies_to_clock_t(timer_expires - jiffies),
2498                 tp->retransmits,
2499                 sock_i_uid(sp),
2500                 tp->probes_out,
2501                 sock_i_ino(sp),
2502                 atomic_read(&sp->sk_refcnt), sp,
2503                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2504                 tp->snd_cwnd,
2505                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2506 }
2507
2508 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2509 {
2510         unsigned int dest, src;
2511         __u16 destp, srcp;
2512         int ttd = tw->tw_ttd - jiffies;
2513
2514         if (ttd < 0)
2515                 ttd = 0;
2516
2517         dest  = tw->tw_daddr;
2518         src   = tw->tw_rcv_saddr;
2519         destp = ntohs(tw->tw_dport);
2520         srcp  = ntohs(tw->tw_sport);
2521
2522         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2523                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2524                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2525                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2526                 atomic_read(&tw->tw_refcnt), tw);
2527 }
2528
2529 #define TMPSZ 150
2530
2531 static int tcp4_seq_show(struct seq_file *seq, void *v)
2532 {
2533         struct tcp_iter_state* st;
2534         char tmpbuf[TMPSZ + 1];
2535
2536         if (v == SEQ_START_TOKEN) {
2537                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2538                            "  sl  local_address rem_address   st tx_queue "
2539                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2540                            "inode");
2541                 goto out;
2542         }
2543         st = seq->private;
2544
2545         switch (st->state) {
2546         case TCP_SEQ_STATE_LISTENING:
2547         case TCP_SEQ_STATE_ESTABLISHED:
2548                 get_tcp4_sock(v, tmpbuf, st->num);
2549                 break;
2550         case TCP_SEQ_STATE_OPENREQ:
2551                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2552                 break;
2553         case TCP_SEQ_STATE_TIME_WAIT:
2554                 get_timewait4_sock(v, tmpbuf, st->num);
2555                 break;
2556         }
2557         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2558 out:
2559         return 0;
2560 }
2561
2562 static struct file_operations tcp4_seq_fops;
2563 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2564         .owner          = THIS_MODULE,
2565         .name           = "tcp",
2566         .family         = AF_INET,
2567         .seq_show       = tcp4_seq_show,
2568         .seq_fops       = &tcp4_seq_fops,
2569 };
2570
2571 int __init tcp4_proc_init(void)
2572 {
2573         return tcp_proc_register(&tcp4_seq_afinfo);
2574 }
2575
2576 void tcp4_proc_exit(void)
2577 {
2578         tcp_proc_unregister(&tcp4_seq_afinfo);
2579 }
2580 #endif /* CONFIG_PROC_FS */
2581
2582 struct proto tcp_prot = {
2583         .name                   = "TCP",
2584         .owner                  = THIS_MODULE,
2585         .close                  = tcp_close,
2586         .connect                = tcp_v4_connect,
2587         .disconnect             = tcp_disconnect,
2588         .accept                 = tcp_accept,
2589         .ioctl                  = tcp_ioctl,
2590         .init                   = tcp_v4_init_sock,
2591         .destroy                = tcp_v4_destroy_sock,
2592         .shutdown               = tcp_shutdown,
2593         .setsockopt             = tcp_setsockopt,
2594         .getsockopt             = tcp_getsockopt,
2595         .sendmsg                = tcp_sendmsg,
2596         .recvmsg                = tcp_recvmsg,
2597         .backlog_rcv            = tcp_v4_do_rcv,
2598         .hash                   = tcp_v4_hash,
2599         .unhash                 = tcp_unhash,
2600         .get_port               = tcp_v4_get_port,
2601         .enter_memory_pressure  = tcp_enter_memory_pressure,
2602         .sockets_allocated      = &tcp_sockets_allocated,
2603         .memory_allocated       = &tcp_memory_allocated,
2604         .memory_pressure        = &tcp_memory_pressure,
2605         .sysctl_mem             = sysctl_tcp_mem,
2606         .sysctl_wmem            = sysctl_tcp_wmem,
2607         .sysctl_rmem            = sysctl_tcp_rmem,
2608         .max_header             = MAX_TCP_HEADER,
2609         .obj_size               = sizeof(struct tcp_sock),
2610         .rsk_prot               = &tcp_request_sock_ops,
2611 };
2612
2613
2614
2615 void __init tcp_v4_init(struct net_proto_family *ops)
2616 {
2617         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2618         if (err < 0)
2619                 panic("Failed to create the TCP control socket.\n");
2620         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2621         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2622
2623         /* Unhash it so that IP input processing does not even
2624          * see it, we do not wish this socket to see incoming
2625          * packets.
2626          */
2627         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2628 }
2629
2630 EXPORT_SYMBOL(ipv4_specific);
2631 EXPORT_SYMBOL(tcp_bind_hash);
2632 EXPORT_SYMBOL(tcp_bucket_create);
2633 EXPORT_SYMBOL(tcp_hashinfo);
2634 EXPORT_SYMBOL(tcp_inherit_port);
2635 EXPORT_SYMBOL(tcp_listen_wlock);
2636 EXPORT_SYMBOL(tcp_port_rover);
2637 EXPORT_SYMBOL(tcp_prot);
2638 EXPORT_SYMBOL(tcp_put_port);
2639 EXPORT_SYMBOL(tcp_unhash);
2640 EXPORT_SYMBOL(tcp_v4_conn_request);
2641 EXPORT_SYMBOL(tcp_v4_connect);
2642 EXPORT_SYMBOL(tcp_v4_do_rcv);
2643 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2644 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2645 EXPORT_SYMBOL(tcp_v4_send_check);
2646 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2647
2648 #ifdef CONFIG_PROC_FS
2649 EXPORT_SYMBOL(tcp_proc_register);
2650 EXPORT_SYMBOL(tcp_proc_unregister);
2651 #endif
2652 EXPORT_SYMBOL(sysctl_local_port_range);
2653 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2654 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2655