net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      open_request handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
   42  *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55 #include <linux/config.h>
  56
  57 #include <linux/types.h>
  58 #include <linux/fcntl.h>
  59 #include <linux/module.h>
  60 #include <linux/random.h>
  61 #include <linux/cache.h>
  62 #include <linux/jhash.h>
  63 #include <linux/init.h>
  64 #include <linux/times.h>
  65
  66 #include <net/icmp.h>
  67 #include <net/tcp.h>
  68 #include <net/ipv6.h>
  69 #include <net/inet_common.h>
  70 #include <net/xfrm.h>
  71
  72 #include <linux/inet.h>
  73 #include <linux/ipv6.h>
  74 #include <linux/stddef.h>
  75 #include <linux/proc_fs.h>
  76 #include <linux/seq_file.h>
  77
  78 extern int sysctl_ip_dynaddr;
  79 int sysctl_tcp_tw_reuse;
  80 int sysctl_tcp_low_latency;
  81
  82 /* Check TCP sequence numbers in ICMP packets. */
  83 #define ICMP_MIN_LENGTH 8
  84
  85 /* Socket used for sending RSTs */
  86 static struct socket *tcp_socket;
  87
  88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  89                        struct sk_buff *skb);
  90
  91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
  92         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
  93         .__tcp_lhash_users      =       ATOMIC_INIT(0),
  94         .__tcp_lhash_wait
  95           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
  96         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
  97 };
  98
  99 /*
 100  * This array holds the first and last local port number.
 101  * For high-usage systems, use sysctl to change this to
 102  * 32768-61000
 103  */
 104 int sysctl_local_port_range[2] = { 1024, 4999 };
 105 int tcp_port_rover = 1024 - 1;
 106
 107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 108                                  __u32 faddr, __u16 fport)
 109 {
 110         int h = (laddr ^ lport) ^ (faddr ^ fport);
 111         h ^= h >> 16;
 112         h ^= h >> 8;
 113         return h & (tcp_ehash_size - 1);
 114 }
 115
 116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
 117 {
 118         struct inet_sock *inet = inet_sk(sk);
 119         __u32 laddr = inet->rcv_saddr;
 120         __u16 lport = inet->num;
 121         __u32 faddr = inet->daddr;
 122         __u16 fport = inet->dport;
 123
 124         return tcp_hashfn(laddr, lport, faddr, fport);
 125 }
 126
 127 /* Allocate and initialize a new TCP local port bind bucket.
 128  * The bindhash mutex for snum's hash chain must be held here.
 129  */
 130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
 131                                           unsigned short snum)
 132 {
 133         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
 134                                                       SLAB_ATOMIC);
 135         if (tb) {
 136                 tb->port = snum;
 137                 tb->fastreuse = 0;
 138                 INIT_HLIST_HEAD(&tb->owners);
 139                 hlist_add_head(&tb->node, &head->chain);
 140         }
 141         return tb;
 142 }
 143
 144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
 145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
 146 {
 147         if (hlist_empty(&tb->owners)) {
 148                 __hlist_del(&tb->node);
 149                 kmem_cache_free(tcp_bucket_cachep, tb);
 150         }
 151 }
 152
 153 /* Caller must disable local BH processing. */
 154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
 155 {
 156         struct tcp_bind_hashbucket *head =
 157                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
 158         struct tcp_bind_bucket *tb;
 159
 160         spin_lock(&head->lock);
 161         tb = tcp_sk(sk)->bind_hash;
 162         sk_add_bind_node(child, &tb->owners);
 163         tcp_sk(child)->bind_hash = tb;
 164         spin_unlock(&head->lock);
 165 }
 166
 167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
 168 {
 169         local_bh_disable();
 170         __tcp_inherit_port(sk, child);
 171         local_bh_enable();
 172 }
 173
 174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
 175                    unsigned short snum)
 176 {
 177         inet_sk(sk)->num = snum;
 178         sk_add_bind_node(sk, &tb->owners);
 179         tcp_sk(sk)->bind_hash = tb;
 180 }
 181
 182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
 183 {
 184         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
 185         struct sock *sk2;
 186         struct hlist_node *node;
 187         int reuse = sk->sk_reuse;
 188
 189         sk_for_each_bound(sk2, node, &tb->owners) {
 190                 if (sk != sk2 &&
 191                     !tcp_v6_ipv6only(sk2) &&
 192                     (!sk->sk_bound_dev_if ||
 193                      !sk2->sk_bound_dev_if ||
 194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
 195                         if (!reuse || !sk2->sk_reuse ||
 196                             sk2->sk_state == TCP_LISTEN) {
 197                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
 198                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
 199                                     sk2_rcv_saddr == sk_rcv_saddr)
 200                                         break;
 201                         }
 202                 }
 203         }
 204         return node != NULL;
 205 }
 206
 207 /* Obtain a reference to a local port for the given sock,
 208  * if snum is zero it means select any available local port.
 209  */
 210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 211 {
 212         struct tcp_bind_hashbucket *head;
 213         struct hlist_node *node;
 214         struct tcp_bind_bucket *tb;
 215         int ret;
 216
 217         local_bh_disable();
 218         if (!snum) {
 219                 int low = sysctl_local_port_range[0];
 220                 int high = sysctl_local_port_range[1];
 221                 int remaining = (high - low) + 1;
 222                 int rover;
 223
 224                 spin_lock(&tcp_portalloc_lock);
 225                 if (tcp_port_rover < low)
 226                         rover = low;
 227                 else
 228                         rover = tcp_port_rover;
 229                 do {
 230                         rover++;
 231                         if (rover > high)
 232                                 rover = low;
 233                         head = &tcp_bhash[tcp_bhashfn(rover)];
 234                         spin_lock(&head->lock);
 235                         tb_for_each(tb, node, &head->chain)
 236                                 if (tb->port == rover)
 237                                         goto next;
 238                         break;
 239                 next:
 240                         spin_unlock(&head->lock);
 241                 } while (--remaining > 0);
 242                 tcp_port_rover = rover;
 243                 spin_unlock(&tcp_portalloc_lock);
 244
 245                 /* Exhausted local port range during search? */
 246                 ret = 1;
 247                 if (remaining <= 0)
 248                         goto fail;
 249
 250                 /* OK, here is the one we will use.  HEAD is
 251                  * non-NULL and we hold it's mutex.
 252                  */
 253                 snum = rover;
 254         } else {
 255                 head = &tcp_bhash[tcp_bhashfn(snum)];
 256                 spin_lock(&head->lock);
 257                 tb_for_each(tb, node, &head->chain)
 258                         if (tb->port == snum)
 259                                 goto tb_found;
 260         }
 261         tb = NULL;
 262         goto tb_not_found;
 263 tb_found:
 264         if (!hlist_empty(&tb->owners)) {
 265                 if (sk->sk_reuse > 1)
 266                         goto success;
 267                 if (tb->fastreuse > 0 &&
 268                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
 269                         goto success;
 270                 } else {
 271                         ret = 1;
 272                         if (tcp_bind_conflict(sk, tb))
 273                                 goto fail_unlock;
 274                 }
 275         }
 276 tb_not_found:
 277         ret = 1;
 278         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
 279                 goto fail_unlock;
 280         if (hlist_empty(&tb->owners)) {
 281                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 282                         tb->fastreuse = 1;
 283                 else
 284                         tb->fastreuse = 0;
 285         } else if (tb->fastreuse &&
 286                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 287                 tb->fastreuse = 0;
 288 success:
 289         if (!tcp_sk(sk)->bind_hash)
 290                 tcp_bind_hash(sk, tb, snum);
 291         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
 292         ret = 0;
 293
 294 fail_unlock:
 295         spin_unlock(&head->lock);
 296 fail:
 297         local_bh_enable();
 298         return ret;
 299 }
 300
 301 /* Get rid of any references to a local port held by the
 302  * given sock.
 303  */
 304 static void __tcp_put_port(struct sock *sk)
 305 {
 306         struct inet_sock *inet = inet_sk(sk);
 307         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
 308         struct tcp_bind_bucket *tb;
 309
 310         spin_lock(&head->lock);
 311         tb = tcp_sk(sk)->bind_hash;
 312         __sk_del_bind_node(sk);
 313         tcp_sk(sk)->bind_hash = NULL;
 314         inet->num = 0;
 315         tcp_bucket_destroy(tb);
 316         spin_unlock(&head->lock);
 317 }
 318
 319 void tcp_put_port(struct sock *sk)
 320 {
 321         local_bh_disable();
 322         __tcp_put_port(sk);
 323         local_bh_enable();
 324 }
 325
 326 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 327  * Look, when several writers sleep and reader wakes them up, all but one
 328  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 329  * this, _but_ remember, it adds useless work on UP machines (wake up each
 330  * exclusive lock release). It should be ifdefed really.
 331  */
 332
 333 void tcp_listen_wlock(void)
 334 {
 335         write_lock(&tcp_lhash_lock);
 336
 337         if (atomic_read(&tcp_lhash_users)) {
 338                 DEFINE_WAIT(wait);
 339
 340                 for (;;) {
 341                         prepare_to_wait_exclusive(&tcp_lhash_wait,
 342                                                 &wait, TASK_UNINTERRUPTIBLE);
 343                         if (!atomic_read(&tcp_lhash_users))
 344                                 break;
 345                         write_unlock_bh(&tcp_lhash_lock);
 346                         schedule();
 347                         write_lock_bh(&tcp_lhash_lock);
 348                 }
 349
 350                 finish_wait(&tcp_lhash_wait, &wait);
 351         }
 352 }
 353
 354 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
 355 {
 356         struct hlist_head *list;
 357         rwlock_t *lock;
 358
 359         BUG_TRAP(sk_unhashed(sk));
 360         if (listen_possible && sk->sk_state == TCP_LISTEN) {
 361                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 362                 lock = &tcp_lhash_lock;
 363                 tcp_listen_wlock();
 364         } else {
 365                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
 366                 lock = &tcp_ehash[sk->sk_hashent].lock;
 367                 write_lock(lock);
 368         }
 369         __sk_add_node(sk, list);
 370         sock_prot_inc_use(sk->sk_prot);
 371         write_unlock(lock);
 372         if (listen_possible && sk->sk_state == TCP_LISTEN)
 373                 wake_up(&tcp_lhash_wait);
 374 }
 375
 376 static void tcp_v4_hash(struct sock *sk)
 377 {
 378         if (sk->sk_state != TCP_CLOSE) {
 379                 local_bh_disable();
 380                 __tcp_v4_hash(sk, 1);
 381                 local_bh_enable();
 382         }
 383 }
 384
 385 void tcp_unhash(struct sock *sk)
 386 {
 387         rwlock_t *lock;
 388
 389         if (sk_unhashed(sk))
 390                 goto ende;
 391
 392         if (sk->sk_state == TCP_LISTEN) {
 393                 local_bh_disable();
 394                 tcp_listen_wlock();
 395                 lock = &tcp_lhash_lock;
 396         } else {
 397                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
 398                 lock = &head->lock;
 399                 write_lock_bh(&head->lock);
 400         }
 401
 402         if (__sk_del_node_init(sk))
 403                 sock_prot_dec_use(sk->sk_prot);
 404         write_unlock_bh(lock);
 405
 406  ende:
 407         if (sk->sk_state == TCP_LISTEN)
 408                 wake_up(&tcp_lhash_wait);
 409 }
 410
 411 /* Don't inline this cruft.  Here are some nice properties to
 412  * exploit here.  The BSD API does not allow a listening TCP
 413  * to specify the remote port nor the remote address for the
 414  * connection.  So always assume those are both wildcarded
 415  * during the search since they can never be otherwise.
 416  */
 417 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
 418                                              unsigned short hnum, int dif)
 419 {
 420         struct sock *result = NULL, *sk;
 421         struct hlist_node *node;
 422         int score, hiscore;
 423
 424         hiscore=-1;
 425         sk_for_each(sk, node, head) {
 426                 struct inet_sock *inet = inet_sk(sk);
 427
 428                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
 429                         __u32 rcv_saddr = inet->rcv_saddr;
 430
 431                         score = (sk->sk_family == PF_INET ? 1 : 0);
 432                         if (rcv_saddr) {
 433                                 if (rcv_saddr != daddr)
 434                                         continue;
 435                                 score+=2;
 436                         }
 437                         if (sk->sk_bound_dev_if) {
 438                                 if (sk->sk_bound_dev_if != dif)
 439                                         continue;
 440                                 score+=2;
 441                         }
 442                         if (score == 5)
 443                                 return sk;
 444                         if (score > hiscore) {
 445                                 hiscore = score;
 446                                 result = sk;
 447                         }
 448                 }
 449         }
 450         return result;
 451 }
 452
 453 /* Optimize the common listener case. */
 454 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
 455                 unsigned short hnum, int dif)
 456 {
 457         struct sock *sk = NULL;
 458         struct hlist_head *head;
 459
 460         read_lock(&tcp_lhash_lock);
 461         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
 462         if (!hlist_empty(head)) {
 463                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
 464
 465                 if (inet->num == hnum && !sk->sk_node.next &&
 466                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 467                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 468                     !sk->sk_bound_dev_if)
 469                         goto sherry_cache;
 470                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
 471         }
 472         if (sk) {
 473 sherry_cache:
 474                 sock_hold(sk);
 475         }
 476         read_unlock(&tcp_lhash_lock);
 477         return sk;
 478 }
 479
 480 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 481  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 482  *
 483  * Local BH must be disabled here.
 484  */
 485
 486 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
 487                                                        u32 daddr, u16 hnum,
 488                                                        int dif)
 489 {
 490         struct tcp_ehash_bucket *head;
 491         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 492         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
 493         struct sock *sk;
 494         struct hlist_node *node;
 495         /* Optimize here for direct hit, only listening connections can
 496          * have wildcards anyways.
 497          */
 498         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
 499         head = &tcp_ehash[hash];
 500         read_lock(&head->lock);
 501         sk_for_each(sk, node, &head->chain) {
 502                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 503                         goto hit; /* You sunk my battleship! */
 504         }
 505
 506         /* Must check for a TIME_WAIT'er before going to listener hash. */
 507         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
 508                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
 509                         goto hit;
 510         }
 511         sk = NULL;
 512 out:
 513         read_unlock(&head->lock);
 514         return sk;
 515 hit:
 516         sock_hold(sk);
 517         goto out;
 518 }
 519
 520 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 521                                            u32 daddr, u16 hnum, int dif)
 522 {
 523         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
 524                                                       daddr, hnum, dif);
 525
 526         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
 527 }
 528
 529 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
 530                                   u16 dport, int dif)
 531 {
 532         struct sock *sk;
 533
 534         local_bh_disable();
 535         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
 536         local_bh_enable();
 537
 538         return sk;
 539 }
 540
 541 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
 542
 543 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 544 {
 545         return secure_tcp_sequence_number(skb->nh.iph->daddr,
 546                                           skb->nh.iph->saddr,
 547                                           skb->h.th->dest,
 548                                           skb->h.th->source);
 549 }
 550
 551 /* called with local bh disabled */
 552 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 553                                       struct tcp_tw_bucket **twp)
 554 {
 555         struct inet_sock *inet = inet_sk(sk);
 556         u32 daddr = inet->rcv_saddr;
 557         u32 saddr = inet->daddr;
 558         int dif = sk->sk_bound_dev_if;
 559         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 560         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
 561         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
 562         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
 563         struct sock *sk2;
 564         struct hlist_node *node;
 565         struct tcp_tw_bucket *tw;
 566
 567         write_lock(&head->lock);
 568
 569         /* Check TIME-WAIT sockets first. */
 570         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
 571                 tw = (struct tcp_tw_bucket *)sk2;
 572
 573                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
 574                         struct tcp_sock *tp = tcp_sk(sk);
 575
 576                         /* With PAWS, it is safe from the viewpoint
 577                            of data integrity. Even without PAWS it
 578                            is safe provided sequence spaces do not
 579                            overlap i.e. at data rates <= 80Mbit/sec.
 580
 581                            Actually, the idea is close to VJ's one,
 582                            only timestamp cache is held not per host,
 583                            but per port pair and TW bucket is used
 584                            as state holder.
 585
 586                            If TW bucket has been already destroyed we
 587                            fall back to VJ's scheme and use initial
 588                            timestamp retrieved from peer table.
 589                          */
 590                         if (tw->tw_ts_recent_stamp &&
 591                             (!twp || (sysctl_tcp_tw_reuse &&
 592                                       xtime.tv_sec -
 593                                       tw->tw_ts_recent_stamp > 1))) {
 594                                 if ((tp->write_seq =
 595                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
 596                                         tp->write_seq = 1;
 597                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
 598                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
 599                                 sock_hold(sk2);
 600                                 goto unique;
 601                         } else
 602                                 goto not_unique;
 603                 }
 604         }
 605         tw = NULL;
 606
 607         /* And established part... */
 608         sk_for_each(sk2, node, &head->chain) {
 609                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 610                         goto not_unique;
 611         }
 612
 613 unique:
 614         /* Must record num and sport now. Otherwise we will see
 615          * in hash table socket with a funny identity. */
 616         inet->num = lport;
 617         inet->sport = htons(lport);
 618         sk->sk_hashent = hash;
 619         BUG_TRAP(sk_unhashed(sk));
 620         __sk_add_node(sk, &head->chain);
 621         sock_prot_inc_use(sk->sk_prot);
 622         write_unlock(&head->lock);
 623
 624         if (twp) {
 625                 *twp = tw;
 626                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 627         } else if (tw) {
 628                 /* Silly. Should hash-dance instead... */
 629                 tcp_tw_deschedule(tw);
 630                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 631
 632                 tcp_tw_put(tw);
 633         }
 634
 635         return 0;
 636
 637 not_unique:
 638         write_unlock(&head->lock);
 639         return -EADDRNOTAVAIL;
 640 }
 641
 642 static inline u32 connect_port_offset(const struct sock *sk)
 643 {
 644         const struct inet_sock *inet = inet_sk(sk);
 645
 646         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
 647                                          inet->dport);
 648 }
 649
 650 /*
 651  * Bind a port for a connect operation and hash it.
 652  */
 653 static inline int tcp_v4_hash_connect(struct sock *sk)
 654 {
 655         unsigned short snum = inet_sk(sk)->num;
 656         struct tcp_bind_hashbucket *head;
 657         struct tcp_bind_bucket *tb;
 658         int ret;
 659
 660         if (!snum) {
 661                 int low = sysctl_local_port_range[0];
 662                 int high = sysctl_local_port_range[1];
 663                 int range = high - low;
 664                 int i;
 665                 int port;
 666                 static u32 hint;
 667                 u32 offset = hint + connect_port_offset(sk);
 668                 struct hlist_node *node;
 669                 struct tcp_tw_bucket *tw = NULL;
 670
 671                 local_bh_disable();
 672                 for (i = 1; i <= range; i++) {
 673                         port = low + (i + offset) % range;
 674                         head = &tcp_bhash[tcp_bhashfn(port)];
 675                         spin_lock(&head->lock);
 676
 677                         /* Does not bother with rcv_saddr checks,
 678                          * because the established check is already
 679                          * unique enough.
 680                          */
 681                         tb_for_each(tb, node, &head->chain) {
 682                                 if (tb->port == port) {
 683                                         BUG_TRAP(!hlist_empty(&tb->owners));
 684                                         if (tb->fastreuse >= 0)
 685                                                 goto next_port;
 686                                         if (!__tcp_v4_check_established(sk,
 687                                                                         port,
 688                                                                         &tw))
 689                                                 goto ok;
 690                                         goto next_port;
 691                                 }
 692                         }
 693
 694                         tb = tcp_bucket_create(head, port);
 695                         if (!tb) {
 696                                 spin_unlock(&head->lock);
 697                                 break;
 698                         }
 699                         tb->fastreuse = -1;
 700                         goto ok;
 701
 702                 next_port:
 703                         spin_unlock(&head->lock);
 704                 }
 705                 local_bh_enable();
 706
 707                 return -EADDRNOTAVAIL;
 708
 709 ok:
 710                 hint += i;
 711
 712                 /* Head lock still held and bh's disabled */
 713                 tcp_bind_hash(sk, tb, port);
 714                 if (sk_unhashed(sk)) {
 715                         inet_sk(sk)->sport = htons(port);
 716                         __tcp_v4_hash(sk, 0);
 717                 }
 718                 spin_unlock(&head->lock);
 719
 720                 if (tw) {
 721                         tcp_tw_deschedule(tw);
 722                         tcp_tw_put(tw);
 723                 }
 724
 725                 ret = 0;
 726                 goto out;
 727         }
 728
 729         head  = &tcp_bhash[tcp_bhashfn(snum)];
 730         tb  = tcp_sk(sk)->bind_hash;
 731         spin_lock_bh(&head->lock);
 732         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 733                 __tcp_v4_hash(sk, 0);
 734                 spin_unlock_bh(&head->lock);
 735                 return 0;
 736         } else {
 737                 spin_unlock(&head->lock);
 738                 /* No definite answer... Walk to established hash table */
 739                 ret = __tcp_v4_check_established(sk, snum, NULL);
 740 out:
 741                 local_bh_enable();
 742                 return ret;
 743         }
 744 }
 745
 746 /* This will initiate an outgoing connection. */
 747 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 748 {
 749         struct inet_sock *inet = inet_sk(sk);
 750         struct tcp_sock *tp = tcp_sk(sk);
 751         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 752         struct rtable *rt;
 753         u32 daddr, nexthop;
 754         int tmp;
 755         int err;
 756
 757         if (addr_len < sizeof(struct sockaddr_in))
 758                 return -EINVAL;
 759
 760         if (usin->sin_family != AF_INET)
 761                 return -EAFNOSUPPORT;
 762
 763         nexthop = daddr = usin->sin_addr.s_addr;
 764         if (inet->opt && inet->opt->srr) {
 765                 if (!daddr)
 766                         return -EINVAL;
 767                 nexthop = inet->opt->faddr;
 768         }
 769
 770         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 771                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 772                                IPPROTO_TCP,
 773                                inet->sport, usin->sin_port, sk);
 774         if (tmp < 0)
 775                 return tmp;
 776
 777         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 778                 ip_rt_put(rt);
 779                 return -ENETUNREACH;
 780         }
 781
 782         if (!inet->opt || !inet->opt->srr)
 783                 daddr = rt->rt_dst;
 784
 785         if (!inet->saddr)
 786                 inet->saddr = rt->rt_src;
 787         inet->rcv_saddr = inet->saddr;
 788
 789         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 790                 /* Reset inherited state */
 791                 tp->rx_opt.ts_recent       = 0;
 792                 tp->rx_opt.ts_recent_stamp = 0;
 793                 tp->write_seq              = 0;
 794         }
 795
 796         if (sysctl_tcp_tw_recycle &&
 797             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 798                 struct inet_peer *peer = rt_get_peer(rt);
 799
 800                 /* VJ's idea. We save last timestamp seen from
 801                  * the destination in peer table, when entering state TIME-WAIT
 802                  * and initialize rx_opt.ts_recent from it, when trying new connection.
 803                  */
 804
 805                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 806                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 807                         tp->rx_opt.ts_recent = peer->tcp_ts;
 808                 }
 809         }
 810
 811         inet->dport = usin->sin_port;
 812         inet->daddr = daddr;
 813
 814         tp->ext_header_len = 0;
 815         if (inet->opt)
 816                 tp->ext_header_len = inet->opt->optlen;
 817
 818         tp->rx_opt.mss_clamp = 536;
 819
 820         /* Socket identity is still unknown (sport may be zero).
 821          * However we set state to SYN-SENT and not releasing socket
 822          * lock select source port, enter ourselves into the hash tables and
 823          * complete initialization after this.
 824          */
 825         tcp_set_state(sk, TCP_SYN_SENT);
 826         err = tcp_v4_hash_connect(sk);
 827         if (err)
 828                 goto failure;
 829
 830         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 831         if (err)
 832                 goto failure;
 833
 834         /* OK, now commit destination to socket.  */
 835         __sk_dst_set(sk, &rt->u.dst);
 836         tcp_v4_setup_caps(sk, &rt->u.dst);
 837
 838         if (!tp->write_seq)
 839                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 840                                                            inet->daddr,
 841                                                            inet->sport,
 842                                                            usin->sin_port);
 843
 844         inet->id = tp->write_seq ^ jiffies;
 845
 846         err = tcp_connect(sk);
 847         rt = NULL;
 848         if (err)
 849                 goto failure;
 850
 851         return 0;
 852
 853 failure:
 854         /* This unhashes the socket and releases the local port, if necessary. */
 855         tcp_set_state(sk, TCP_CLOSE);
 856         ip_rt_put(rt);
 857         sk->sk_route_caps = 0;
 858         inet->dport = 0;
 859         return err;
 860 }
 861
 862 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
 863 {
 864         return ((struct rtable *)skb->dst)->rt_iif;
 865 }
 866
 867 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
 868 {
 869         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
 870 }
 871
 872 static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
 873                                               struct open_request ***prevp,
 874                                               __u16 rport,
 875                                               __u32 raddr, __u32 laddr)
 876 {
 877         struct tcp_listen_opt *lopt = tp->listen_opt;
 878         struct open_request *req, **prev;
 879
 880         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 881              (req = *prev) != NULL;
 882              prev = &req->dl_next) {
 883                 if (req->rmt_port == rport &&
 884                     req->af.v4_req.rmt_addr == raddr &&
 885                     req->af.v4_req.loc_addr == laddr &&
 886                     TCP_INET_FAMILY(req->class->family)) {
 887                         BUG_TRAP(!req->sk);
 888                         *prevp = prev;
 889                         break;
 890                 }
 891         }
 892
 893         return req;
 894 }
 895
 896 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
 897 {
 898         struct tcp_sock *tp = tcp_sk(sk);
 899         struct tcp_listen_opt *lopt = tp->listen_opt;
 900         u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
 901
 902         req->expires = jiffies + TCP_TIMEOUT_INIT;
 903         req->retrans = 0;
 904         req->sk = NULL;
 905         req->dl_next = lopt->syn_table[h];
 906
 907         write_lock(&tp->syn_wait_lock);
 908         lopt->syn_table[h] = req;
 909         write_unlock(&tp->syn_wait_lock);
 910
 911         tcp_synq_added(sk);
 912 }
 913
 914
 915 /*
 916  * This routine does path mtu discovery as defined in RFC1191.
 917  */
 918 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 919                                      u32 mtu)
 920 {
 921         struct dst_entry *dst;
 922         struct inet_sock *inet = inet_sk(sk);
 923         struct tcp_sock *tp = tcp_sk(sk);
 924
 925         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 926          * send out by Linux are always <576bytes so they should go through
 927          * unfragmented).
 928          */
 929         if (sk->sk_state == TCP_LISTEN)
 930                 return;
 931
 932         /* We don't check in the destentry if pmtu discovery is forbidden
 933          * on this route. We just assume that no packet_to_big packets
 934          * are send back when pmtu discovery is not active.
 935          * There is a small race when the user changes this flag in the
 936          * route, but I think that's acceptable.
 937          */
 938         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 939                 return;
 940
 941         dst->ops->update_pmtu(dst, mtu);
 942
 943         /* Something is about to be wrong... Remember soft error
 944          * for the case, if this connection will not able to recover.
 945          */
 946         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 947                 sk->sk_err_soft = EMSGSIZE;
 948
 949         mtu = dst_mtu(dst);
 950
 951         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 952             tp->pmtu_cookie > mtu) {
 953                 tcp_sync_mss(sk, mtu);
 954
 955                 /* Resend the TCP packet because it's
 956                  * clear that the old packet has been
 957                  * dropped. This is the new "fast" path mtu
 958                  * discovery.
 959                  */
 960                 tcp_simple_retransmit(sk);
 961         } /* else let the usual retransmit timer handle it */
 962 }
 963
 964 /*
 965  * This routine is called by the ICMP module when it gets some
 966  * sort of error condition.  If err < 0 then the socket should
 967  * be closed and the error returned to the user.  If err > 0
 968  * it's just the icmp type << 8 | icmp code.  After adjustment
 969  * header points to the first 8 bytes of the tcp header.  We need
 970  * to find the appropriate port.
 971  *
 972  * The locking strategy used here is very "optimistic". When
 973  * someone else accesses the socket the ICMP is just dropped
 974  * and for some paths there is no check at all.
 975  * A more general error queue to queue errors for later handling
 976  * is probably better.
 977  *
 978  */
 979
 980 void tcp_v4_err(struct sk_buff *skb, u32 info)
 981 {
 982         struct iphdr *iph = (struct iphdr *)skb->data;
 983         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 984         struct tcp_sock *tp;
 985         struct inet_sock *inet;
 986         int type = skb->h.icmph->type;
 987         int code = skb->h.icmph->code;
 988         struct sock *sk;
 989         __u32 seq;
 990         int err;
 991
 992         if (skb->len < (iph->ihl << 2) + 8) {
 993                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 994                 return;
 995         }
 996
 997         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
 998                            th->source, tcp_v4_iif(skb));
 999         if (!sk) {
1000                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1001                 return;
1002         }
1003         if (sk->sk_state == TCP_TIME_WAIT) {
1004                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1005                 return;
1006         }
1007
1008         bh_lock_sock(sk);
1009         /* If too many ICMPs get dropped on busy
1010          * servers this needs to be solved differently.
1011          */
1012         if (sock_owned_by_user(sk))
1013                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1014
1015         if (sk->sk_state == TCP_CLOSE)
1016                 goto out;
1017
1018         tp = tcp_sk(sk);
1019         seq = ntohl(th->seq);
1020         if (sk->sk_state != TCP_LISTEN &&
1021             !between(seq, tp->snd_una, tp->snd_nxt)) {
1022                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1023                 goto out;
1024         }
1025
1026         switch (type) {
1027         case ICMP_SOURCE_QUENCH:
1028                 /* Just silently ignore these. */
1029                 goto out;
1030         case ICMP_PARAMETERPROB:
1031                 err = EPROTO;
1032                 break;
1033         case ICMP_DEST_UNREACH:
1034                 if (code > NR_ICMP_UNREACH)
1035                         goto out;
1036
1037                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1038                         if (!sock_owned_by_user(sk))
1039                                 do_pmtu_discovery(sk, iph, info);
1040                         goto out;
1041                 }
1042
1043                 err = icmp_err_convert[code].errno;
1044                 break;
1045         case ICMP_TIME_EXCEEDED:
1046                 err = EHOSTUNREACH;
1047                 break;
1048         default:
1049                 goto out;
1050         }
1051
1052         switch (sk->sk_state) {
1053                 struct open_request *req, **prev;
1054         case TCP_LISTEN:
1055                 if (sock_owned_by_user(sk))
1056                         goto out;
1057
1058                 req = tcp_v4_search_req(tp, &prev, th->dest,
1059                                         iph->daddr, iph->saddr);
1060                 if (!req)
1061                         goto out;
1062
1063                 /* ICMPs are not backlogged, hence we cannot get
1064                    an established socket here.
1065                  */
1066                 BUG_TRAP(!req->sk);
1067
1068                 if (seq != req->snt_isn) {
1069                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1070                         goto out;
1071                 }
1072
1073                 /*
1074                  * Still in SYN_RECV, just remove it silently.
1075                  * There is no good way to pass the error to the newly
1076                  * created socket, and POSIX does not want network
1077                  * errors returned from accept().
1078                  */
1079                 tcp_synq_drop(sk, req, prev);
1080                 goto out;
1081
1082         case TCP_SYN_SENT:
1083         case TCP_SYN_RECV:  /* Cannot happen.
1084                                It can f.e. if SYNs crossed.
1085                              */
1086                 if (!sock_owned_by_user(sk)) {
1087                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1088                         sk->sk_err = err;
1089
1090                         sk->sk_error_report(sk);
1091
1092                         tcp_done(sk);
1093                 } else {
1094                         sk->sk_err_soft = err;
1095                 }
1096                 goto out;
1097         }
1098
1099         /* If we've already connected we will keep trying
1100          * until we time out, or the user gives up.
1101          *
1102          * rfc1122 4.2.3.9 allows to consider as hard errors
1103          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1104          * but it is obsoleted by pmtu discovery).
1105          *
1106          * Note, that in modern internet, where routing is unreliable
1107          * and in each dark corner broken firewalls sit, sending random
1108          * errors ordered by their masters even this two messages finally lose
1109          * their original sense (even Linux sends invalid PORT_UNREACHs)
1110          *
1111          * Now we are in compliance with RFCs.
1112          *                                                      --ANK (980905)
1113          */
1114
1115         inet = inet_sk(sk);
1116         if (!sock_owned_by_user(sk) && inet->recverr) {
1117                 sk->sk_err = err;
1118                 sk->sk_error_report(sk);
1119         } else  { /* Only an error on timeout */
1120                 sk->sk_err_soft = err;
1121         }
1122
1123 out:
1124         bh_unlock_sock(sk);
1125         sock_put(sk);
1126 }
1127
1128 /* This routine computes an IPv4 TCP checksum. */
1129 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1130                        struct sk_buff *skb)
1131 {
1132         struct inet_sock *inet = inet_sk(sk);
1133
1134         if (skb->ip_summed == CHECKSUM_HW) {
1135                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1136                 skb->csum = offsetof(struct tcphdr, check);
1137         } else {
1138                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1139                                          csum_partial((char *)th,
1140                                                       th->doff << 2,
1141                                                       skb->csum));
1142         }
1143 }
1144
1145 /*
1146  *      This routine will send an RST to the other tcp.
1147  *
1148  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1149  *                    for reset.
1150  *      Answer: if a packet caused RST, it is not for a socket
1151  *              existing in our system, if it is matched to a socket,
1152  *              it is just duplicate segment or bug in other side's TCP.
1153  *              So that we build reply only basing on parameters
1154  *              arrived with segment.
1155  *      Exception: precedence violation. We do not implement it in any case.
1156  */
1157
1158 static void tcp_v4_send_reset(struct sk_buff *skb)
1159 {
1160         struct tcphdr *th = skb->h.th;
1161         struct tcphdr rth;
1162         struct ip_reply_arg arg;
1163
1164         /* Never send a reset in response to a reset. */
1165         if (th->rst)
1166                 return;
1167
1168         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1169                 return;
1170
1171         /* Swap the send and the receive. */
1172         memset(&rth, 0, sizeof(struct tcphdr));
1173         rth.dest   = th->source;
1174         rth.source = th->dest;
1175         rth.doff   = sizeof(struct tcphdr) / 4;
1176         rth.rst    = 1;
1177
1178         if (th->ack) {
1179                 rth.seq = th->ack_seq;
1180         } else {
1181                 rth.ack = 1;
1182                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1183                                     skb->len - (th->doff << 2));
1184         }
1185
1186         memset(&arg, 0, sizeof arg);
1187         arg.iov[0].iov_base = (unsigned char *)&rth;
1188         arg.iov[0].iov_len  = sizeof rth;
1189         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1190                                       skb->nh.iph->saddr, /*XXX*/
1191                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1192         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1193
1194         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1195
1196         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1197         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1198 }
1199
1200 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1201    outside socket context is ugly, certainly. What can I do?
1202  */
1203
1204 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1205                             u32 win, u32 ts)
1206 {
1207         struct tcphdr *th = skb->h.th;
1208         struct {
1209                 struct tcphdr th;
1210                 u32 tsopt[3];
1211         } rep;
1212         struct ip_reply_arg arg;
1213
1214         memset(&rep.th, 0, sizeof(struct tcphdr));
1215         memset(&arg, 0, sizeof arg);
1216
1217         arg.iov[0].iov_base = (unsigned char *)&rep;
1218         arg.iov[0].iov_len  = sizeof(rep.th);
1219         if (ts) {
1220                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1221                                      (TCPOPT_TIMESTAMP << 8) |
1222                                      TCPOLEN_TIMESTAMP);
1223                 rep.tsopt[1] = htonl(tcp_time_stamp);
1224                 rep.tsopt[2] = htonl(ts);
1225                 arg.iov[0].iov_len = sizeof(rep);
1226         }
1227
1228         /* Swap the send and the receive. */
1229         rep.th.dest    = th->source;
1230         rep.th.source  = th->dest;
1231         rep.th.doff    = arg.iov[0].iov_len / 4;
1232         rep.th.seq     = htonl(seq);
1233         rep.th.ack_seq = htonl(ack);
1234         rep.th.ack     = 1;
1235         rep.th.window  = htons(win);
1236
1237         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1238                                       skb->nh.iph->saddr, /*XXX*/
1239                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1240         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1241
1242         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1243
1244         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1245 }
1246
1247 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1248 {
1249         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1250
1251         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1252                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1253
1254         tcp_tw_put(tw);
1255 }
1256
1257 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1258 {
1259         tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1260                         req->ts_recent);
1261 }
1262
1263 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1264                                           struct open_request *req)
1265 {
1266         struct rtable *rt;
1267         struct ip_options *opt = req->af.v4_req.opt;
1268         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1269                             .nl_u = { .ip4_u =
1270                                       { .daddr = ((opt && opt->srr) ?
1271                                                   opt->faddr :
1272                                                   req->af.v4_req.rmt_addr),
1273                                         .saddr = req->af.v4_req.loc_addr,
1274                                         .tos = RT_CONN_FLAGS(sk) } },
1275                             .proto = IPPROTO_TCP,
1276                             .uli_u = { .ports =
1277                                        { .sport = inet_sk(sk)->sport,
1278                                          .dport = req->rmt_port } } };
1279
1280         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1281                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1282                 return NULL;
1283         }
1284         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1285                 ip_rt_put(rt);
1286                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1287                 return NULL;
1288         }
1289         return &rt->u.dst;
1290 }
1291
1292 /*
1293  *      Send a SYN-ACK after having received an ACK.
1294  *      This still operates on a open_request only, not on a big
1295  *      socket.
1296  */
1297 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1298                               struct dst_entry *dst)
1299 {
1300         int err = -1;
1301         struct sk_buff * skb;
1302
1303         /* First, grab a route. */
1304         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1305                 goto out;
1306
1307         skb = tcp_make_synack(sk, dst, req);
1308
1309         if (skb) {
1310                 struct tcphdr *th = skb->h.th;
1311
1312                 th->check = tcp_v4_check(th, skb->len,
1313                                          req->af.v4_req.loc_addr,
1314                                          req->af.v4_req.rmt_addr,
1315                                          csum_partial((char *)th, skb->len,
1316                                                       skb->csum));
1317
1318                 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1319                                             req->af.v4_req.rmt_addr,
1320                                             req->af.v4_req.opt);
1321                 if (err == NET_XMIT_CN)
1322                         err = 0;
1323         }
1324
1325 out:
1326         dst_release(dst);
1327         return err;
1328 }
1329
1330 /*
1331  *      IPv4 open_request destructor.
1332  */
1333 static void tcp_v4_or_free(struct open_request *req)
1334 {
1335         if (req->af.v4_req.opt)
1336                 kfree(req->af.v4_req.opt);
1337 }
1338
1339 static inline void syn_flood_warning(struct sk_buff *skb)
1340 {
1341         static unsigned long warntime;
1342
1343         if (time_after(jiffies, (warntime + HZ * 60))) {
1344                 warntime = jiffies;
1345                 printk(KERN_INFO
1346                        "possible SYN flooding on port %d. Sending cookies.\n",
1347                        ntohs(skb->h.th->dest));
1348         }
1349 }
1350
1351 /*
1352  * Save and compile IPv4 options into the open_request if needed.
1353  */
1354 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1355                                                      struct sk_buff *skb)
1356 {
1357         struct ip_options *opt = &(IPCB(skb)->opt);
1358         struct ip_options *dopt = NULL;
1359
1360         if (opt && opt->optlen) {
1361                 int opt_size = optlength(opt);
1362                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1363                 if (dopt) {
1364                         if (ip_options_echo(dopt, skb)) {
1365                                 kfree(dopt);
1366                                 dopt = NULL;
1367                         }
1368                 }
1369         }
1370         return dopt;
1371 }
1372
1373 /*
1374  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1375  * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1376  * It would be better to replace it with a global counter for all sockets
1377  * but then some measure against one socket starving all other sockets
1378  * would be needed.
1379  *
1380  * It was 128 by default. Experiments with real servers show, that
1381  * it is absolutely not enough even at 100conn/sec. 256 cures most
1382  * of problems. This value is adjusted to 128 for very small machines
1383  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1384  * Further increasing requires to change hash table size.
1385  */
1386 int sysctl_max_syn_backlog = 256;
1387
1388 struct or_calltable or_ipv4 = {
1389         .family         =       PF_INET,
1390         .rtx_syn_ack    =       tcp_v4_send_synack,
1391         .send_ack       =       tcp_v4_or_send_ack,
1392         .destructor     =       tcp_v4_or_free,
1393         .send_reset     =       tcp_v4_send_reset,
1394 };
1395
1396 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1397 {
1398         struct tcp_options_received tmp_opt;
1399         struct open_request *req;
1400         __u32 saddr = skb->nh.iph->saddr;
1401         __u32 daddr = skb->nh.iph->daddr;
1402         __u32 isn = TCP_SKB_CB(skb)->when;
1403         struct dst_entry *dst = NULL;
1404 #ifdef CONFIG_SYN_COOKIES
1405         int want_cookie = 0;
1406 #else
1407 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1408 #endif
1409
1410         /* Never answer to SYNs send to broadcast or multicast */
1411         if (((struct rtable *)skb->dst)->rt_flags &
1412             (RTCF_BROADCAST | RTCF_MULTICAST))
1413                 goto drop;
1414
1415         /* TW buckets are converted to open requests without
1416          * limitations, they conserve resources and peer is
1417          * evidently real one.
1418          */
1419         if (tcp_synq_is_full(sk) && !isn) {
1420 #ifdef CONFIG_SYN_COOKIES
1421                 if (sysctl_tcp_syncookies) {
1422                         want_cookie = 1;
1423                 } else
1424 #endif
1425                 goto drop;
1426         }
1427
1428         /* Accept backlog is full. If we have already queued enough
1429          * of warm entries in syn queue, drop request. It is better than
1430          * clogging syn queue with openreqs with exponentially increasing
1431          * timeout.
1432          */
1433         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1434                 goto drop;
1435
1436         req = tcp_openreq_alloc();
1437         if (!req)
1438                 goto drop;
1439
1440         tcp_clear_options(&tmp_opt);
1441         tmp_opt.mss_clamp = 536;
1442         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1443
1444         tcp_parse_options(skb, &tmp_opt, 0);
1445
1446         if (want_cookie) {
1447                 tcp_clear_options(&tmp_opt);
1448                 tmp_opt.saw_tstamp = 0;
1449         }
1450
1451         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1452                 /* Some OSes (unknown ones, but I see them on web server, which
1453                  * contains information interesting only for windows'
1454                  * users) do not send their stamp in SYN. It is easy case.
1455                  * We simply do not advertise TS support.
1456                  */
1457                 tmp_opt.saw_tstamp = 0;
1458                 tmp_opt.tstamp_ok  = 0;
1459         }
1460         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1461
1462         tcp_openreq_init(req, &tmp_opt, skb);
1463
1464         req->af.v4_req.loc_addr = daddr;
1465         req->af.v4_req.rmt_addr = saddr;
1466         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1467         req->class = &or_ipv4;
1468         if (!want_cookie)
1469                 TCP_ECN_create_request(req, skb->h.th);
1470
1471         if (want_cookie) {
1472 #ifdef CONFIG_SYN_COOKIES
1473                 syn_flood_warning(skb);
1474 #endif
1475                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1476         } else if (!isn) {
1477                 struct inet_peer *peer = NULL;
1478
1479                 /* VJ's idea. We save last timestamp seen
1480                  * from the destination in peer table, when entering
1481                  * state TIME-WAIT, and check against it before
1482                  * accepting new connection request.
1483                  *
1484                  * If "isn" is not zero, this request hit alive
1485                  * timewait bucket, so that all the necessary checks
1486                  * are made in the function processing timewait state.
1487                  */
1488                 if (tmp_opt.saw_tstamp &&
1489                     sysctl_tcp_tw_recycle &&
1490                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1491                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1492                     peer->v4daddr == saddr) {
1493                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1494                             (s32)(peer->tcp_ts - req->ts_recent) >
1495                                                         TCP_PAWS_WINDOW) {
1496                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1497                                 dst_release(dst);
1498                                 goto drop_and_free;
1499                         }
1500                 }
1501                 /* Kill the following clause, if you dislike this way. */
1502                 else if (!sysctl_tcp_syncookies &&
1503                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1504                           (sysctl_max_syn_backlog >> 2)) &&
1505                          (!peer || !peer->tcp_ts_stamp) &&
1506                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1507                         /* Without syncookies last quarter of
1508                          * backlog is filled with destinations,
1509                          * proven to be alive.
1510                          * It means that we continue to communicate
1511                          * to destinations, already remembered
1512                          * to the moment of synflood.
1513                          */
1514                         NETDEBUG(if (net_ratelimit()) \
1515                                         printk(KERN_DEBUG "TCP: drop open "
1516                                                           "request from %u.%u."
1517                                                           "%u.%u/%u\n", \
1518                                                NIPQUAD(saddr),
1519                                                ntohs(skb->h.th->source)));
1520                         dst_release(dst);
1521                         goto drop_and_free;
1522                 }
1523
1524                 isn = tcp_v4_init_sequence(sk, skb);
1525         }
1526         req->snt_isn = isn;
1527
1528         if (tcp_v4_send_synack(sk, req, dst))
1529                 goto drop_and_free;
1530
1531         if (want_cookie) {
1532                 tcp_openreq_free(req);
1533         } else {
1534                 tcp_v4_synq_add(sk, req);
1535         }
1536         return 0;
1537
1538 drop_and_free:
1539         tcp_openreq_free(req);
1540 drop:
1541         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1542         return 0;
1543 }
1544
1545
1546 /*
1547  * The three way handshake has completed - we got a valid synack -
1548  * now create the new socket.
1549  */
1550 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1551                                   struct open_request *req,
1552                                   struct dst_entry *dst)
1553 {
1554         struct inet_sock *newinet;
1555         struct tcp_sock *newtp;
1556         struct sock *newsk;
1557
1558         if (sk_acceptq_is_full(sk))
1559                 goto exit_overflow;
1560
1561         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1562                 goto exit;
1563
1564         newsk = tcp_create_openreq_child(sk, req, skb);
1565         if (!newsk)
1566                 goto exit;
1567
1568         newsk->sk_dst_cache = dst;
1569         tcp_v4_setup_caps(newsk, dst);
1570
1571         newtp                 = tcp_sk(newsk);
1572         newinet               = inet_sk(newsk);
1573         newinet->daddr        = req->af.v4_req.rmt_addr;
1574         newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1575         newinet->saddr        = req->af.v4_req.loc_addr;
1576         newinet->opt          = req->af.v4_req.opt;
1577         req->af.v4_req.opt    = NULL;
1578         newinet->mc_index     = tcp_v4_iif(skb);
1579         newinet->mc_ttl       = skb->nh.iph->ttl;
1580         newtp->ext_header_len = 0;
1581         if (newinet->opt)
1582                 newtp->ext_header_len = newinet->opt->optlen;
1583         newinet->id = newtp->write_seq ^ jiffies;
1584
1585         tcp_sync_mss(newsk, dst_mtu(dst));
1586         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1587         tcp_initialize_rcv_mss(newsk);
1588
1589         __tcp_v4_hash(newsk, 0);
1590         __tcp_inherit_port(sk, newsk);
1591
1592         return newsk;
1593
1594 exit_overflow:
1595         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1596 exit:
1597         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1598         dst_release(dst);
1599         return NULL;
1600 }
1601
     /*
      * Pick the socket that should process @skb, which was received for @sk
      * (tcp_v4_do_rcv() calls this when @sk is in TCP_LISTEN).
      *
      * Returns one of:
      *  - the result of tcp_check_req(), if a pending open request of @sk
      *    matches the segment's source port and addresses;
      *  - a matching socket from the established hash, with bh_lock_sock()
      *    held on it;
      *  - NULL, if the match there is a TIME_WAIT bucket (after tcp_tw_put());
      *  - @sk itself otherwise, or with CONFIG_SYN_COOKIES the result of
      *    cookie_v4_check() when the segment has ACK set and neither RST
      *    nor SYN.
      */
1602 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1603 {
1604         struct tcphdr *th = skb->h.th;
1605         struct iphdr *iph = skb->nh.iph;
1606         struct tcp_sock *tp = tcp_sk(sk);
1607         struct sock *nsk;
1608         struct open_request **prev;
1609         /* Find possible connection requests. */
1610         struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1611                                                      iph->saddr, iph->daddr);
1612         if (req)
1613                 return tcp_check_req(sk, skb, req, prev);
1614
             /*
              * No pending request: the segment may belong to a connection
              * that is already in the established hash.
              */
1615         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1616                                           th->source,
1617                                           skb->nh.iph->daddr,
1618                                           ntohs(th->dest),
1619                                           tcp_v4_iif(skb));
1620
1621         if (nsk) {
1622                 if (nsk->sk_state != TCP_TIME_WAIT) {
                             /* Handed back with its BH lock held. */
1623                         bh_lock_sock(nsk);
1624                         return nsk;
1625                 }
                     /* TIME_WAIT match: release it and have the caller drop the skb. */
1626                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1627                 return NULL;
1628         }
1629
1630 #ifdef CONFIG_SYN_COOKIES
             /* cookie_v4_check() may replace sk with a different socket or NULL. */
1631         if (!th->rst && !th->syn && th->ack)
1632                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1633 #endif
1634         return sk;
1635 }
1636
     /*
      * Set up the checksum state of an incoming segment.
      *
      * Returns 0 when the checksum has been verified or its verification has
      * been left for later, and -1 when a short packet (<= 76 bytes) fails
      * verification here.
      */
1637 static int tcp_v4_checksum_init(struct sk_buff *skb)
1638 {
1639         if (skb->ip_summed == CHECKSUM_HW) {
                     /*
                      * Optimistically mark the packet as verified, then check
                      * the hardware-provided skb->csum; a zero result from
                      * tcp_v4_check() means the packet is good.
                      */
1640                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1641                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1642                                   skb->nh.iph->daddr, skb->csum))
1643                         return 0;
1644
                     /* Hardware sum did not verify: fall back to software. */
1645                 NETDEBUG(if (net_ratelimit())
1646                                 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1647                 skb->ip_summed = CHECKSUM_NONE;
1648         }
             /* Short packets are checksummed in full right away. */
1649         if (skb->len <= 76) {
1650                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1651                                  skb->nh.iph->daddr,
1652                                  skb_checksum(skb, 0, skb->len, 0)))
1653                         return -1;
1654                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1655         } else {
                     /*
                      * Longer packets: only the complemented result of
                      * tcp_v4_check() over a zero base is stored in skb->csum.
                      * NOTE(review): presumably completed later by
                      * tcp_checksum_complete() - confirm.
                      */
1656                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1657                                           skb->nh.iph->saddr,
1658                                           skb->nh.iph->daddr, 0);
1659         }
1660         return 0;
1661 }
1662
1663
1664 /* The socket must have its spinlock held when we get
1665  * here.
1666  *
1667  * We have a potential double-lock case here, so even when
1668  * doing backlog processing we use the BH locking scheme.
1669  * This is because we cannot sleep with the original spinlock
1670  * held.
      *
      * Always returns 0.  The skb is handed to tcp_rcv_established(),
      * tcp_child_process() or tcp_rcv_state_process() depending on the
      * socket state; if that call returns non-zero a reset is sent with
      * tcp_v4_send_reset() and the skb is freed here.  The skb is also
      * freed here when tcp_v4_hnd_req() returns NULL and when the header
      * length or checksum test fails (counted in TCP_MIB_INERRS).
1671  */
1672 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1673 {
1674         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1675                 TCP_CHECK_TIMER(sk);
1676                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1677                         goto reset;
1678                 TCP_CHECK_TIMER(sk);
1679                 return 0;
1680         }
1681
             /*
              * Slow path: reject segments shorter than their own header
              * length (doff is in 32-bit words) or with a bad checksum.
              */
1682         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1683                 goto csum_err;
1684
1685         if (sk->sk_state == TCP_LISTEN) {
1686                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1687                 if (!nsk)
1688                         goto discard;
1689
                     /* A different socket came back: it gets the segment. */
1690                 if (nsk != sk) {
1691                         if (tcp_child_process(sk, nsk, skb))
1692                                 goto reset;
1693                         return 0;
1694                 }
1695         }
1696
1697         TCP_CHECK_TIMER(sk);
1698         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1699                 goto reset;
1700         TCP_CHECK_TIMER(sk);
1701         return 0;
1702
1703 reset:
1704         tcp_v4_send_reset(skb);
1705 discard:
1706         kfree_skb(skb);
1707         /* Be careful here. If this function gets more complicated and
1708          * gcc suffers from register pressure on the x86, sk (in %ebx)
1709          * might be destroyed here. This current version compiles correctly,
1710          * but you have been warned.
1711          */
1712         return 0;
1713
1714 csum_err:
1715         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1716         goto discard;
1717 }
1718
1719 /*
1720  *      From tcp_input.c
      *
      *      Receive entry point for one incoming TCP segment in @skb.
      *
      *      The header is validated, the TCP control block of the skb is
      *      filled in, and the owning socket is looked up.  The segment is
      *      then processed directly (tcp_v4_do_rcv()), put on the prequeue,
      *      or added to the socket backlog; TIME_WAIT matches and segments
      *      without a socket are handled at the labels below.
      *
      *      Returns the value of tcp_v4_do_rcv() when the segment is
      *      processed directly, and 0 in every other case.
1721  */
1722
1723 int tcp_v4_rcv(struct sk_buff *skb)
1724 {
1725         struct tcphdr *th;
1726         struct sock *sk;
1727         int ret;
1728
             /* Anything not marked PACKET_HOST is dropped without accounting. */
1729         if (skb->pkt_type != PACKET_HOST)
1730                 goto discard_it;
1731
1732         /* Count it even if it's bad */
1733         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1734
1735         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1736                 goto discard_it;
1737
1738         th = skb->h.th;
1739
             /*
              * doff counts 32-bit words: a value smaller than the fixed header
              * is malformed.  Then make the whole header, options included,
              * available.
              */
1740         if (th->doff < sizeof(struct tcphdr) / 4)
1741                 goto bad_packet;
1742         if (!pskb_may_pull(skb, th->doff * 4))
1743                 goto discard_it;
1744
1745         /* An explanation is required here, I think.
1746          * Packet length and doff are validated by header prediction,
1747          * provided case of th->doff==0 is eliminated.
1748          * So, we defer the checks. */
1749         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1750              tcp_v4_checksum_init(skb) < 0))
1751                 goto bad_packet;
1752
             /*
              * Re-read the header pointer.
              * NOTE(review): presumably because the pull above may have moved
              * the header - confirm.
              */
1753         th = skb->h.th;
1754         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
             /* SYN and FIN each add one to the sequence space of the payload. */
1755         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1756                                     skb->len - th->doff * 4);
1757         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1758         TCP_SKB_CB(skb)->when    = 0;
             /* The IP TOS byte is kept in the control block's flags field. */
1759         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1760         TCP_SKB_CB(skb)->sacked  = 0;
1761
1762         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1763                              skb->nh.iph->daddr, ntohs(th->dest),
1764                              tcp_v4_iif(skb));
1765
1766         if (!sk)
1767                 goto no_tcp_socket;
1768
             /* Also entered from do_time_wait with sk replaced by a listener. */
1769 process:
1770         if (sk->sk_state == TCP_TIME_WAIT)
1771                 goto do_time_wait;
1772
1773         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1774                 goto discard_and_relse;
1775
1776         if (sk_filter(sk, skb, 0))
1777                 goto discard_and_relse;
1778
1779         skb->dev = NULL;
1780
             /*
              * If no user context owns the socket, try the prequeue first and
              * process the segment now if that declines it; if the socket is
              * owned, defer the segment to the backlog.
              */
1781         bh_lock_sock(sk);
1782         ret = 0;
1783         if (!sock_owned_by_user(sk)) {
1784                 if (!tcp_prequeue(sk, skb))
1785                         ret = tcp_v4_do_rcv(sk, skb);
1786         } else
1787                 sk_add_backlog(sk, skb);
1788         bh_unlock_sock(sk);
1789
             /* NOTE(review): presumably balances a reference taken by the lookup. */
1790         sock_put(sk);
1791
1792         return ret;
1793
1794 no_tcp_socket:
1795         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1796                 goto discard_it;
1797
             /*
              * The bad_packet label sits inside the if-body so that the header
              * and checksum failures detected above share the error counter
              * and, like a bad segment here, do not trigger a reset.
              */
1798         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1799 bad_packet:
1800                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1801         } else {
1802                 tcp_v4_send_reset(skb);
1803         }
1804
1805 discard_it:
1806         /* Discard frame. */
1807         kfree_skb(skb);
1808         return 0;
1809
1810 discard_and_relse:
1811         sock_put(sk);
1812         goto discard_it;
1813
             /* sk is really a struct tcp_tw_bucket on every path below. */
1814 do_time_wait:
1815         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1816                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1817                 goto discard_it;
1818         }
1819
1820         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1821                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1822                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1823                 goto discard_it;
1824         }
1825         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1826                                            skb, th, skb->len)) {
1827         case TCP_TW_SYN: {
                     /*
                      * If a listener exists for this SYN, retire the TIME_WAIT
                      * bucket and process the segment on the listener instead.
                      */
1828                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1829                                                           ntohs(th->dest),
1830                                                           tcp_v4_iif(skb));
1831                 if (sk2) {
1832                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1833                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1834                         sk = sk2;
1835                         goto process;
1836                 }
1837                 /* Fall through to ACK */
1838         }
1839         case TCP_TW_ACK:
1840                 tcp_v4_timewait_ack(sk, skb);
1841                 break;
1842         case TCP_TW_RST:
1843                 goto no_tcp_socket;
1844         case TCP_TW_SUCCESS:;
1845         }
1846         goto discard_it;
1847 }
1848
1849 /* With per-bucket locks this operation is not-atomic, so that
1850  * this version is not worse.
      *
      * Re-inserts @sk into the hash tables by calling its protocol's
      * unhash() callback followed by its hash() callback.  Nothing in this
      * function ties the two calls together: in between, the socket has been
      * unhashed but not yet hashed again.
1851  */
1852 static void __tcp_v4_rehash(struct sock *sk)
1853 {
1854         sk->sk_prot->unhash(sk);
1855         sk->sk_prot->hash(sk);
1856 }
1857
     /*
      * Route the connection of @sk again with 0 as the source address and
      * adopt the source address of the resulting route (rt->rt_src).
      *
      * If that address differs from the current one, inet->saddr and
      * inet->rcv_saddr are updated and the socket is rehashed.
      *
      * Returns 0 on success, including when the address did not change, or
      * the error returned by ip_route_connect().
      */
1858 static int tcp_v4_reselect_saddr(struct sock *sk)
1859 {
1860         struct inet_sock *inet = inet_sk(sk);
1861         int err;
1862         struct rtable *rt;
1863         __u32 old_saddr = inet->saddr;
1864         __u32 new_saddr;
1865         __u32 daddr = inet->daddr;
1866
             /* With a source-route option, route towards the option's faddr. */
1867         if (inet->opt && inet->opt->srr)
1868                 daddr = inet->opt->faddr;
1869
1870         /* Query new route. */
1871         err = ip_route_connect(&rt, daddr, 0,
1872                                RT_CONN_FLAGS(sk),
1873                                sk->sk_bound_dev_if,
1874                                IPPROTO_TCP,
1875                                inet->sport, inet->dport, sk);
1876         if (err)
1877                 return err;
1878
             /* The new route is installed even if the source turns out unchanged. */
1879         __sk_dst_set(sk, &rt->u.dst);
1880         tcp_v4_setup_caps(sk, &rt->u.dst);
1881
1882         new_saddr = rt->rt_src;
1883
1884         if (new_saddr == old_saddr)
1885                 return 0;
1886
             /*
              * Logged only when sysctl_ip_dynaddr is above 1.  The message
              * names tcp_v4_rebuild_header(), the caller of this function.
              */
1887         if (sysctl_ip_dynaddr > 1) {
1888                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1889                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1890                        NIPQUAD(old_saddr),
1891                        NIPQUAD(new_saddr));
1892         }
1893
1894         inet->saddr = new_saddr;
1895         inet->rcv_saddr = new_saddr;
1896
1897         /* XXX The only one ugly spot where we need to
1898          * XXX really change the sockets identity after
1899          * XXX it has entered the hashes. -DaveM
1900          *
1901          * Besides that, it does not check for connection
1902          * uniqueness. Wait for troubles.
1903          */
1904         __tcp_v4_rehash(sk);
1905         return 0;
1906 }
1907
     /*
      * Make sure @sk has a route, looking one up again when
      * __sk_dst_check() no longer returns the cached one.
      *
      * Returns 0 if the cached route is still there, if the new lookup
      * succeeded, or if tcp_v4_reselect_saddr() succeeded after the lookup
      * failed.  Otherwise returns the error, which is also stored negated in
      * sk->sk_err_soft.
      */
1908 int tcp_v4_rebuild_header(struct sock *sk)
1909 {
1910         struct inet_sock *inet = inet_sk(sk);
1911         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1912         u32 daddr;
1913         int err;
1914
1915         /* Route is OK, nothing to do. */
1916         if (rt)
1917                 return 0;
1918
1919         /* Reroute. */
1920         daddr = inet->daddr;
             /* With a source-route option, route towards the option's faddr. */
1921         if (inet->opt && inet->opt->srr)
1922                 daddr = inet->opt->faddr;
1923
             /* Unlike tcp_v4_reselect_saddr(), keep the current source address. */
1924         {
1925                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1926                                     .nl_u = { .ip4_u =
1927                                               { .daddr = daddr,
1928                                                 .saddr = inet->saddr,
1929                                                 .tos = RT_CONN_FLAGS(sk) } },
1930                                     .proto = IPPROTO_TCP,
1931                                     .uli_u = { .ports =
1932                                                { .sport = inet->sport,
1933                                                  .dport = inet->dport } } };
1934
1935                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1936         }
1937         if (!err) {
1938                 __sk_dst_set(sk, &rt->u.dst);
1939                 tcp_v4_setup_caps(sk, &rt->u.dst);
1940                 return 0;
1941         }
1942
1943         /* Routing failed... */
1944         sk->sk_route_caps = 0;
1945
             /*
              * A new source address is only tried when sysctl_ip_dynaddr is
              * set, the socket is still in TCP_SYN_SENT and
              * SOCK_BINDADDR_LOCK is not set in sk_userlocks.  Because of
              * short-circuit evaluation err is overwritten only when that
              * attempt is actually made.  If it is not made, or fails, the
              * error is recorded as a soft error.
              */
1946         if (!sysctl_ip_dynaddr ||
1947             sk->sk_state != TCP_SYN_SENT ||
1948             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1949             (err = tcp_v4_reselect_saddr(sk)) != 0)
1950                 sk->sk_err_soft = -err;
1951
1952         return err;
1953 }
1954
     /*
      * Fill @uaddr, viewed as a struct sockaddr_in, from @sk: the family is
      * AF_INET, the address is the socket's daddr and the port its dport.
      */
1955 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1956 {
1957         struct inet_sock *src = inet_sk(sk);
1958         struct sockaddr_in *out = (struct sockaddr_in *)uaddr;
1959
1960         out->sin_port        = src->dport;
1961         out->sin_addr.s_addr = src->daddr;
1962         out->sin_family      = AF_INET;
1963 }
1964
1965 /* VJ's idea. Save last timestamp seen from this destination
1966  * and hold it at least for normal timewait interval to use for duplicate
1967  * segment detection in subsequent connections, before they enter synchronized
1968  * state.
      *
      * Returns 1 if a peer entry was available (whether or not its record
      * was updated) and 0 otherwise.
1969  */
1970
1971 int tcp_v4_remember_stamp(struct sock *sk)
1972 {
1973         struct inet_sock *inet = inet_sk(sk);
1974         struct tcp_sock *tp = tcp_sk(sk);
1975         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1976         struct inet_peer *peer = NULL;
1977         int release_it = 0;
1978
             /*
              * Without a cached route to exactly this destination, look the
              * peer up directly and remember to put it back afterwards.
              * Otherwise use the peer attached to the route, binding one
              * first if the route has none yet.
              */
1979         if (!rt || rt->rt_dst != inet->daddr) {
1980                 peer = inet_getpeer(inet->daddr, 1);
1981                 release_it = 1;
1982         } else {
1983                 if (!rt->peer)
1984                         rt_bind_peer(rt, 1);
1985                 peer = rt->peer;
1986         }
1987
1988         if (peer) {
                     /*
                      * Overwrite the peer's record when its timestamp is not
                      * newer than ours (signed 32-bit difference), or when its
                      * stamp is more than TCP_PAWS_MSL seconds old and not
                      * newer than ours.
                      */
1989                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1990                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1991                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1992                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1993                         peer->tcp_ts = tp->rx_opt.ts_recent;
1994                 }
                     /* Only a peer obtained from inet_getpeer() above is put back. */
1995                 if (release_it)
1996                         inet_putpeer(peer);
1997                 return 1;
1998         }
1999
2000         return 0;
2001 }
2002
     /*
      * TIME_WAIT counterpart of tcp_v4_remember_stamp(): store the bucket's
      * most recent timestamp in the peer entry for tw->tw_daddr.
      *
      * Returns 1 if a peer entry was obtained (whether or not its record was
      * updated) and 0 otherwise.
      */
2003 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2004 {
2005         struct inet_peer *p = inet_getpeer(tw->tw_daddr, 1);
2006
2007         if (!p)
2008                 return 0;
2009
2010         /* Overwrite the peer's record if it is not newer, or has aged out. */
2011         if ((s32)(p->tcp_ts - tw->tw_ts_recent) <= 0 ||
2012             (p->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2013              p->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2014                 p->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2015                 p->tcp_ts = tw->tw_ts_recent;
2016         }
2017
2018         /* Pairs with the inet_getpeer() call above. */
2019         inet_putpeer(p);
2020         return 1;
2021 }
2022
     /*
      * Table of IPv4-specific operations and sizes.  tcp_v4_init_sock()
      * installs it as tp->af_specific for every socket it initialises.
      */
2023 struct tcp_func ipv4_specific = {
2024         .queue_xmit     =       ip_queue_xmit,
2025         .send_check     =       tcp_v4_send_check,
2026         .rebuild_header =       tcp_v4_rebuild_header,
2027         .conn_request   =       tcp_v4_conn_request,
2028         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2029         .remember_stamp =       tcp_v4_remember_stamp,
2030         .net_header_len =       sizeof(struct iphdr),
2031         .setsockopt     =       ip_setsockopt,
2032         .getsockopt     =       ip_getsockopt,
2033         .addr2sockaddr  =       v4_addr2sockaddr,
2034         .sockaddr_len   =       sizeof(struct sockaddr_in),
2035 };
2036
2037 /* NOTE: A lot of things set to zero explicitly by call to
2038  *       sk_alloc() so need not be done here.
      *
      * Initialise the TCP state of @sk: queues and timers, the initial
      * RTO and congestion values, the socket state (TCP_CLOSE), the
      * IPv4 operations table and the default buffer sizes.  Also
      * increments tcp_sockets_allocated.  Always returns 0.
2039  */
2040 static int tcp_v4_init_sock(struct sock *sk)
2041 {
2042         struct tcp_sock *tp = tcp_sk(sk);
2043
2044         skb_queue_head_init(&tp->out_of_order_queue);
2045         tcp_init_xmit_timers(sk);
2046         tcp_prequeue_init(tp);
2047
2048         tp->rto  = TCP_TIMEOUT_INIT;
2049         tp->mdev = TCP_TIMEOUT_INIT;
2050
2051         /* So many TCP implementations out there (incorrectly) count the
2052          * initial SYN frame in their delayed-ACK and congestion control
2053          * algorithms that we must have the following bandaid to talk
2054          * efficiently to them.  -DaveM
2055          */
2056         tp->snd_cwnd = 2;
2057
2058         /* See draft-stevens-tcpca-spec-01 for discussion of the
2059          * initialization of these values.
2060          */
2061         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
             /* All bits set: the largest value the clamp field can hold. */
2062         tp->snd_cwnd_clamp = ~0;
             /* NOTE(review): 536 is presumably the default MSS - confirm. */
2063         tp->mss_cache_std = tp->mss_cache = 536;
2064
2065         tp->reordering = sysctl_tcp_reordering;
2066
2067         sk->sk_state = TCP_CLOSE;
2068
2069         sk->sk_write_space = sk_stream_write_space;
2070         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2071
2072         tp->af_specific = &ipv4_specific;
2073
             /* Default buffer sizes come from entry [1] of the sysctl arrays. */
2074         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2075         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2076
             /* Decremented again in tcp_v4_destroy_sock(). */
2077         atomic_inc(&tcp_sockets_allocated);
2078
2079         return 0;
2080 }
2081
     /*
      * Release the TCP resources held by @sk: stop the transmit timers,
      * purge the write, out-of-order and prequeue queues, give up the bound
      * port if there is one, free the cached sendmsg page and decrement
      * tcp_sockets_allocated.  Always returns 0.
      */
2082 int tcp_v4_destroy_sock(struct sock *sk)
2083 {
2084         struct tcp_sock *tp = tcp_sk(sk);
2085
2086         tcp_clear_xmit_timers(sk);
2087
2088         /* Clean up the write buffer. */
2089         sk_stream_writequeue_purge(sk);
2090
2091         /* Cleans up our, hopefully empty, out_of_order_queue. */
2092         __skb_queue_purge(&tp->out_of_order_queue);
2093
2094         /* Clean prequeue, it must be empty really */
2095         __skb_queue_purge(&tp->ucopy.prequeue);
2096
2097         /* Clean up a referenced TCP bind bucket. */
2098         if (tp->bind_hash)
2099                 tcp_put_port(sk);
2100
2101         /*
2102          * If sendmsg cached page exists, toss it.
2103          */
2104         if (sk->sk_sndmsg_page) {
2105                 __free_page(sk->sk_sndmsg_page);
2106                 sk->sk_sndmsg_page = NULL;
2107         }
2108
             /* Pairs with the increment in tcp_v4_init_sock(). */
2109         atomic_dec(&tcp_sockets_allocated);
2110
2111         return 0;
2112 }
2113
2114 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2115
2116 #ifdef CONFIG_PROC_FS
2117 /* Proc filesystem TCP sock list dumping. */
2118
     /*
      * Return the first TIME_WAIT bucket on the chain @head, or NULL if the
      * chain is empty.
      *
      * Buckets are linked through their tw_node member, an hlist node, so
      * the hlist accessor is used here just as in tw_next().
      */
2119 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2120 {
2121         return hlist_empty(head) ? NULL :
2122                 hlist_entry(head->first, struct tcp_tw_bucket, tw_node);
2123 }
2124
     /*
      * Return the TIME_WAIT bucket that follows @tw on its chain, or NULL if
      * @tw is the last one.
      */
2125 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2126 {
2127         struct hlist_node *succ = tw->tw_node.next;
2128         return succ ? hlist_entry(succ, typeof(*tw), tw_node) : NULL;
2129 }
2130
     /*
      * Advance the /proc iterator through the listening hash.
      *
      * @cur is the entry returned by the previous call, or NULL to start at
      * bucket 0.  Depending on st->state it is a struct sock
      * (TCP_SEQ_STATE_LISTENING) or a struct open_request
      * (TCP_SEQ_STATE_OPENREQ).  The position is kept in st->bucket,
      * st->sbucket and st->syn_wait_sk, and st->num is incremented on every
      * call with a non-NULL @cur.
      *
      * After a socket has been returned, the next call walks that socket's
      * syn_table if listen_opt->qlen is non-zero.  Sockets whose family does
      * not match st->family are not returned, but their open requests are
      * still examined, and each request is matched on its own family.
      *
      * While an open request is being returned, syn_wait_lock of
      * st->syn_wait_sk stays read-locked; it is released here once that
      * socket's requests are exhausted.
      *
      * Returns the next matching socket or open request, or NULL at the end.
      */
2131 static void *listening_get_next(struct seq_file *seq, void *cur)
2132 {
2133         struct tcp_sock *tp;
2134         struct hlist_node *node;
2135         struct sock *sk = cur;
2136         struct tcp_iter_state* st = seq->private;
2137
2138         if (!sk) {
2139                 st->bucket = 0;
2140                 sk = sk_head(&tcp_listening_hash[0]);
2141                 goto get_sk;
2142         }
2143
2144         ++st->num;
2145
2146         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2147                 struct open_request *req = cur;
2148
                     /* Continue in the request chain of the current listener. */
2149                 tp = tcp_sk(st->syn_wait_sk);
2150                 req = req->dl_next;
2151                 while (1) {
2152                         while (req) {
2153                                 if (req->class->family == st->family) {
2154                                         cur = req;
2155                                         goto out;
2156                                 }
2157                                 req = req->dl_next;
2158                         }
2159                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2160                                 break;
                             /* Also the entry point from start_req, with sbucket 0. */
2161 get_req:
2162                         req = tp->listen_opt->syn_table[st->sbucket];
2163                 }
                     /* Requests exhausted: resume with the socket after this one. */
2164                 sk        = sk_next(st->syn_wait_sk);
2165                 st->state = TCP_SEQ_STATE_LISTENING;
2166                 read_unlock_bh(&tp->syn_wait_lock);
2167         } else {
                     /* cur was a socket: visit its open requests, if any, first. */
2168                 tp = tcp_sk(sk);
2169                 read_lock_bh(&tp->syn_wait_lock);
2170                 if (tp->listen_opt && tp->listen_opt->qlen)
2171                         goto start_req;
2172                 read_unlock_bh(&tp->syn_wait_lock);
2173                 sk = sk_next(sk);
2174         }
2175 get_sk:
2176         sk_for_each_from(sk, node) {
2177                 if (sk->sk_family == st->family) {
2178                         cur = sk;
2179                         goto out;
2180                 }
                     /* Family mismatch: the socket is skipped, its requests are not. */
2181                 tp = tcp_sk(sk);
2182                 read_lock_bh(&tp->syn_wait_lock);
2183                 if (tp->listen_opt && tp->listen_opt->qlen) {
                             /* Reached with syn_wait_lock read-held, here or via goto. */
2184 start_req:
2185                         st->uid         = sock_i_uid(sk);
2186                         st->syn_wait_sk = sk;
2187                         st->state       = TCP_SEQ_STATE_OPENREQ;
2188                         st->sbucket     = 0;
2189                         goto get_req;
2190                 }
2191                 read_unlock_bh(&tp->syn_wait_lock);
2192         }
             /* Chain exhausted: move on to the next listening hash bucket. */
2193         if (++st->bucket < TCP_LHTABLE_SIZE) {
2194                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2195                 goto get_sk;
2196         }
2197         cur = NULL;
2198 out:
2199         return cur;
2200 }
2201
     /*
      * Return the listening-table entry that is *pos steps after the first
      * one, or NULL if the table ends earlier.  *pos is decremented once for
      * every step taken, so the caller can see how many steps remain.
      */
2202 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2203 {
2204         void *cur;
2205
2206         /* Start at the first entry, then advance while *pos is non-zero. */
2207         for (cur = listening_get_next(seq, NULL); cur && *pos; --*pos)
2208                 cur = listening_get_next(seq, cur);
2209
2210         return cur;
2211 }
2212
     /*
      * Find the first entry of st->family in the established hash.
      *
      * Buckets are scanned from 0.  Within a bucket the chain of ordinary
      * sockets is examined first, then the TIME_WAIT chain found at index
      * bucket + tcp_ehash_size.  Both are covered by the read lock of
      * tcp_ehash[bucket].
      *
      * Returns the entry with that bucket's read lock still held and
      * st->bucket identifying the bucket; st->state is set to
      * TCP_SEQ_STATE_TIME_WAIT if the entry is a TIME_WAIT bucket.  Returns
      * NULL, with no bucket lock held, if nothing matches.
      */
2213 static void *established_get_first(struct seq_file *seq)
2214 {
2215         struct tcp_iter_state* st = seq->private;
2216         void *rc = NULL;
2217
2218         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2219                 struct sock *sk;
2220                 struct hlist_node *node;
2221                 struct tcp_tw_bucket *tw;
2222
2223                 /* We can reschedule _before_ having picked the target: */
2224                 cond_resched_softirq();
2225
2226                 read_lock(&tcp_ehash[st->bucket].lock);
2227                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2228                         if (sk->sk_family != st->family) {
2229                                 continue;
2230                         }
2231                         rc = sk;
                             /* Leaves with the bucket lock held. */
2232                         goto out;
2233                 }
2234                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2235                 tw_for_each(tw, node,
2236                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2237                         if (tw->tw_family != st->family) {
2238                                 continue;
2239                         }
2240                         rc = tw;
                             /* Leaves with the bucket lock held. */
2241                         goto out;
2242                 }
2243                 read_unlock(&tcp_ehash[st->bucket].lock);
                     /* Nothing in this bucket: undo the tentative state change. */
2244                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2245         }
2246 out:
2247         return rc;
2248 }
2249
     /*
      * Advance the /proc iterator through the established hash.
      *
      * @cur is the entry returned previously: a struct sock when st->state
      * is TCP_SEQ_STATE_ESTABLISHED, a struct tcp_tw_bucket when it is
      * TCP_SEQ_STATE_TIME_WAIT.  The read lock of tcp_ehash[st->bucket] is
      * expected to be held on entry, since this function releases it when it
      * leaves a bucket.  st->num is incremented on every call.
      *
      * Returns the next entry of st->family with the lock of its bucket
      * held, or NULL with no bucket lock held once all buckets are done.
      */
2250 static void *established_get_next(struct seq_file *seq, void *cur)
2251 {
2252         struct sock *sk = cur;
2253         struct tcp_tw_bucket *tw;
2254         struct hlist_node *node;
2255         struct tcp_iter_state* st = seq->private;
2256
2257         ++st->num;
2258
2259         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2260                 tw = cur;
2261                 tw = tw_next(tw);
                     /* Also entered from below with the head of the TIME_WAIT chain. */
2262 get_tw:
2263                 while (tw && tw->tw_family != st->family) {
2264                         tw = tw_next(tw);
2265                 }
2266                 if (tw) {
2267                         cur = tw;
2268                         goto out;
2269                 }
                     /* TIME_WAIT chain done, so this bucket is finished. */
2270                 read_unlock(&tcp_ehash[st->bucket].lock);
2271                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2272
2273                 /* We can reschedule between buckets: */
2274                 cond_resched_softirq();
2275
2276                 if (++st->bucket < tcp_ehash_size) {
2277                         read_lock(&tcp_ehash[st->bucket].lock);
2278                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2279                 } else {
2280                         cur = NULL;
2281                         goto out;
2282                 }
2283         } else
2284                 sk = sk_next(sk);
2285
2286         sk_for_each_from(sk, node) {
2287                 if (sk->sk_family == st->family)
2288                         goto found;
2289         }
2290
             /* Socket chain exhausted: continue with this bucket's TIME_WAIT chain. */
2291         st->state = TCP_SEQ_STATE_TIME_WAIT;
2292         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2293         goto get_tw;
2294 found:
2295         cur = sk;
2296 out:
2297         return cur;
2298 }
2299
     /*
      * Return the established-hash entry that is @pos steps after the first
      * one, or NULL if the table ends earlier.
      */
2300 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2301 {
2302         void *cur;
2303
2304         /* Start at the first entry, then advance while pos is non-zero. */
2305         for (cur = established_get_first(seq); cur && pos; --pos)
2306                 cur = established_get_next(seq, cur);
2307
2308         return cur;
2309 }
2310
     /*
      * Position the iterator on entry number @pos, counting the listening
      * table (sockets and open requests) first and the established hash
      * after it.
      *
      * The listen lock is taken first.  If the listening table ends before
      * @pos is reached, that lock is dropped, bottom halves are disabled and
      * the search continues in the established hash with what is left of
      * @pos, which listening_get_idx() decrements through the pointer.
      *
      * On return st->state tells which phase the iterator is in;
      * tcp_seq_stop() uses it to release whatever is still held.
      */
2311 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2312 {
2313         void *rc;
2314         struct tcp_iter_state* st = seq->private;
2315
2316         tcp_listen_lock();
2317         st->state = TCP_SEQ_STATE_LISTENING;
2318         rc        = listening_get_idx(seq, &pos);
2319
2320         if (!rc) {
                     /* Listening phase exhausted: switch to the established hash. */
2321                 tcp_listen_unlock();
2322                 local_bh_disable();
2323                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2324                 rc        = established_get_idx(seq, pos);
2325         }
2326
2327         return rc;
2328 }
2329
     /*
      * seq_file start callback (installed by tcp_seq_open()).  Resets the
      * iterator, then returns SEQ_START_TOKEN for position 0 or the entry at
      * index *pos - 1 for any later position.
      */
2330 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2331 {
2332         struct tcp_iter_state *iter = seq->private;
2333         iter->num   = 0;
2334         iter->state = TCP_SEQ_STATE_LISTENING;
2335         return *pos == 0 ? SEQ_START_TOKEN : tcp_get_idx(seq, *pos - 1);
2336 }
2337
     /*
      * seq_file next callback (installed by tcp_seq_open()).
      *
      * @v is the previous entry or SEQ_START_TOKEN.  From the start token
      * the iteration begins at entry 0.  Otherwise the iterator advances
      * within its current phase, and when the listening phase is exhausted
      * it drops the listen lock, disables bottom halves and continues with
      * the first entry of the established hash.
      *
      * *pos is incremented on every call.  Returns the next entry, or NULL
      * at the end.
      */
2338 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2339 {
2340         void *rc = NULL;
2341         struct tcp_iter_state* st;
2342
2343         if (v == SEQ_START_TOKEN) {
2344                 rc = tcp_get_idx(seq, 0);
2345                 goto out;
2346         }
2347         st = seq->private;
2348
2349         switch (st->state) {
2350         case TCP_SEQ_STATE_OPENREQ:
2351         case TCP_SEQ_STATE_LISTENING:
2352                 rc = listening_get_next(seq, v);
2353                 if (!rc) {
                             /* Same hand-over as in tcp_get_idx(). */
2354                         tcp_listen_unlock();
2355                         local_bh_disable();
2356                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2357                         rc        = established_get_first(seq);
2358                 }
2359                 break;
2360         case TCP_SEQ_STATE_ESTABLISHED:
2361         case TCP_SEQ_STATE_TIME_WAIT:
2362                 rc = established_get_next(seq, v);
2363                 break;
2364         }
2365 out:
2366         ++*pos;
2367         return rc;
2368 }
2369
     /*
      * seq_file stop callback (installed by tcp_seq_open()).  Releases what
      * the iterator still holds, as indicated by st->state and by @v, the
      * last value returned by start/next:
      *
      *  - OPENREQ: syn_wait_lock of st->syn_wait_sk if @v is non-NULL, then
      *    the same as LISTENING;
      *  - LISTENING: the listen lock, unless @v is SEQ_START_TOKEN;
      *  - TIME_WAIT / ESTABLISHED: the read lock of the current bucket if
      *    @v is non-NULL, and bottom halves are re-enabled in any case.
      */
2370 static void tcp_seq_stop(struct seq_file *seq, void *v)
2371 {
2372         struct tcp_iter_state* st = seq->private;
2373
2374         switch (st->state) {
2375         case TCP_SEQ_STATE_OPENREQ:
2376                 if (v) {
2377                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2378                         read_unlock_bh(&tp->syn_wait_lock);
2379                 }
                     /* fall through */
2380         case TCP_SEQ_STATE_LISTENING:
2381                 if (v != SEQ_START_TOKEN)
2382                         tcp_listen_unlock();
2383                 break;
2384         case TCP_SEQ_STATE_TIME_WAIT:
2385         case TCP_SEQ_STATE_ESTABLISHED:
2386                 if (v)
2387                         read_unlock(&tcp_ehash[st->bucket].lock);
2388                 local_bh_enable();
2389                 break;
2390         }
2391 }
2392
     /*
      * open() handler installed by tcp_proc_register().
      *
      * Allocates a zeroed struct tcp_iter_state, fills in the address family
      * and the seq_file callbacks (start/next/stop from this file, show from
      * the afinfo stored in the proc entry), opens the seq_file on it and
      * stores the state in seq->private.
      *
      * Returns 0 on success, -EINVAL if the proc entry carries no afinfo,
      * -ENOMEM if the allocation fails, or the error from seq_open(), in
      * which case the state is freed again.
      */
2393 static int tcp_seq_open(struct inode *inode, struct file *file)
2394 {
2395         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2396         struct tcp_iter_state *iter;
2397         struct seq_file *seq;
2398         int err;
2399
2400         if (unlikely(afinfo == NULL))
2401                 return -EINVAL;
2402
2403         iter = kmalloc(sizeof(*iter), GFP_KERNEL);
2404         if (!iter)
2405                 return -ENOMEM;
2406         memset(iter, 0, sizeof(*iter));
2407
2408         iter->seq_ops.start = tcp_seq_start;
2409         iter->seq_ops.next  = tcp_seq_next;
2410         iter->seq_ops.stop  = tcp_seq_stop;
2411         iter->seq_ops.show  = afinfo->seq_show;
2412         iter->family        = afinfo->family;
2413
2414         err = seq_open(file, &iter->seq_ops);
2415         if (err) {
2416                 kfree(iter);
2417                 return err;
2418         }
2419
2420         seq          = file->private_data;
2421         seq->private = iter;
2422         return 0;
2423 }
2424
     /*
      * Create the /proc/net entry described by @afinfo.
      *
      * The file operations in afinfo->seq_fops are filled in here (open is
      * tcp_seq_open(), the rest are the generic seq_file handlers) and
      * @afinfo is stored as the data of the new proc entry.
      *
      * Returns 0 on success, -EINVAL if @afinfo is NULL, or -ENOMEM if the
      * entry could not be created.
      */
2425 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2426 {
2427         struct proc_dir_entry *entry;
2428
2429         if (!afinfo)
2430                 return -EINVAL;
2431
2432         afinfo->seq_fops->release = seq_release_private;
2433         afinfo->seq_fops->llseek  = seq_lseek;
2434         afinfo->seq_fops->read    = seq_read;
2435         afinfo->seq_fops->open    = tcp_seq_open;
2436         afinfo->seq_fops->owner   = afinfo->owner;
2437
2438         entry = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2439         if (!entry)
2440                 return -ENOMEM;
2441
2442         entry->data = afinfo;
2443         return 0;
2444 }
2445
     /*
      * Remove the /proc/net entry named afinfo->name and zero the file
      * operations that tcp_proc_register() filled in.  Does nothing if
      * @afinfo is NULL.
      */
2446 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2447 {
2448         if (afinfo) {
2449                 proc_net_remove(afinfo->name);
2450                 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2451         }
2452 }
2453
     /*
      * Format the text line describing open request @req of socket @sk into
      * @tmpbuf.
      *
      * @i is printed as the entry number and @uid as the owner.  The local
      * port is taken from @sk, the addresses and the remote port from @req,
      * and the state column is always TCP_SYN_RECV.
      *
      * NOTE(review): sprintf() is unbounded; the caller has to provide a
      * buffer that is large enough for the line.
      */
2454 static void get_openreq4(struct sock *sk, struct open_request *req,
2455                          char *tmpbuf, int i, int uid)
2456 {
             /* Time left until the request expires, in jiffies. */
2457         int ttd = req->expires - jiffies;
2458
2459         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2460                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2461                 i,
2462                 req->af.v4_req.loc_addr,
2463                 ntohs(inet_sk(sk)->sport),
2464                 req->af.v4_req.rmt_addr,
2465                 ntohs(req->rmt_port),
2466                 TCP_SYN_RECV,
2467                 0, 0, /* could print option size, but that is af dependent. */
2468                 1,    /* timers active (only the expire timer) */
2469                 jiffies_to_clock_t(ttd),
2470                 req->retrans,
2471                 uid,
2472                 0,  /* non standard timer */
2473                 0, /* open_requests have no inode */
2474                 atomic_read(&sk->sk_refcnt),
2475                 req);
2476 }
2477
     /*
      * Format the text line describing socket @sp into @tmpbuf; @i is
      * printed as the entry number.
      *
      * The timer column is 1 for a pending retransmit timer, 4 for a pending
      * zero-window probe timer, 2 when sk_timer is pending and 0 otherwise,
      * followed by the time until expiry converted with
      * jiffies_to_clock_t().
      *
      * NOTE(review): sprintf() is unbounded; the caller has to provide a
      * buffer that is large enough for the line.
      */
2478 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2479 {
2480         int timer_active;
2481         unsigned long timer_expires;
2482         struct tcp_sock *tp = tcp_sk(sp);
2483         struct inet_sock *inet = inet_sk(sp);
2484         unsigned int dest = inet->daddr;
2485         unsigned int src = inet->rcv_saddr;
2486         __u16 destp = ntohs(inet->dport);
2487         __u16 srcp = ntohs(inet->sport);
2488
2489         if (tp->pending == TCP_TIME_RETRANS) {
2490                 timer_active    = 1;
2491                 timer_expires   = tp->timeout;
2492         } else if (tp->pending == TCP_TIME_PROBE0) {
2493                 timer_active    = 4;
2494                 timer_expires   = tp->timeout;
2495         } else if (timer_pending(&sp->sk_timer)) {
2496                 timer_active    = 2;
2497                 timer_expires   = sp->sk_timer.expires;
2498         } else {
                     /* No timer: use "now" so that the printed delta is zero. */
2499                 timer_active    = 0;
2500                 timer_expires = jiffies;
2501         }
2502
             /*
              * Columns after the state: tx queue (write_seq - snd_una), rx
              * queue (rcv_nxt - copied_seq), timer, retransmits, uid,
              * probes_out, inode, refcount, socket pointer, rto, ato, the
              * quick/pingpong ack bits, snd_cwnd, and snd_ssthresh (printed
              * as -1 when it is 0xFFFF or larger).
              */
2503         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2504                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2505                 i, src, srcp, dest, destp, sp->sk_state,
2506                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2507                 timer_active,
2508                 jiffies_to_clock_t(timer_expires - jiffies),
2509                 tp->retransmits,
2510                 sock_i_uid(sp),
2511                 tp->probes_out,
2512                 sock_i_ino(sp),
2513                 atomic_read(&sp->sk_refcnt), sp,
2514                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2515                 tp->snd_cwnd,
2516                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2517 }
2518
2519 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2520 {
2521         unsigned int dest, src;
2522         __u16 destp, srcp;
2523         int ttd = tw->tw_ttd - jiffies;
2524
2525         if (ttd < 0)
2526                 ttd = 0;
2527
2528         dest  = tw->tw_daddr;
2529         src   = tw->tw_rcv_saddr;
2530         destp = ntohs(tw->tw_dport);
2531         srcp  = ntohs(tw->tw_sport);
2532
2533         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2534                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2535                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2536                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2537                 atomic_read(&tw->tw_refcnt), tw);
2538 }
2539
2540 #define TMPSZ 150
2541
2542 static int tcp4_seq_show(struct seq_file *seq, void *v)
2543 {
2544         struct tcp_iter_state* st;
2545         char tmpbuf[TMPSZ + 1];
2546
2547         if (v == SEQ_START_TOKEN) {
2548                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2549                            "  sl  local_address rem_address   st tx_queue "
2550                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2551                            "inode");
2552                 goto out;
2553         }
2554         st = seq->private;
2555
2556         switch (st->state) {
2557         case TCP_SEQ_STATE_LISTENING:
2558         case TCP_SEQ_STATE_ESTABLISHED:
2559                 get_tcp4_sock(v, tmpbuf, st->num);
2560                 break;
2561         case TCP_SEQ_STATE_OPENREQ:
2562                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2563                 break;
2564         case TCP_SEQ_STATE_TIME_WAIT:
2565                 get_timewait4_sock(v, tmpbuf, st->num);
2566                 break;
2567         }
2568         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2569 out:
2570         return 0;
2571 }
2572
2573 static struct file_operations tcp4_seq_fops;
2574 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2575         .owner          = THIS_MODULE,
2576         .name           = "tcp",
2577         .family         = AF_INET,
2578         .seq_show       = tcp4_seq_show,
2579         .seq_fops       = &tcp4_seq_fops,
2580 };
2581
2582 int __init tcp4_proc_init(void)
2583 {
2584         return tcp_proc_register(&tcp4_seq_afinfo);
2585 }
2586
2587 void tcp4_proc_exit(void)
2588 {
2589         tcp_proc_unregister(&tcp4_seq_afinfo);
2590 }
2591 #endif /* CONFIG_PROC_FS */
2592
2593 struct proto tcp_prot = {
2594         .name                   = "TCP",
2595         .owner                  = THIS_MODULE,
2596         .close                  = tcp_close,
2597         .connect                = tcp_v4_connect,
2598         .disconnect             = tcp_disconnect,
2599         .accept                 = tcp_accept,
2600         .ioctl                  = tcp_ioctl,
2601         .init                   = tcp_v4_init_sock,
2602         .destroy                = tcp_v4_destroy_sock,
2603         .shutdown               = tcp_shutdown,
2604         .setsockopt             = tcp_setsockopt,
2605         .getsockopt             = tcp_getsockopt,
2606         .sendmsg                = tcp_sendmsg,
2607         .recvmsg                = tcp_recvmsg,
2608         .backlog_rcv            = tcp_v4_do_rcv,
2609         .hash                   = tcp_v4_hash,
2610         .unhash                 = tcp_unhash,
2611         .get_port               = tcp_v4_get_port,
2612         .enter_memory_pressure  = tcp_enter_memory_pressure,
2613         .sockets_allocated      = &tcp_sockets_allocated,
2614         .memory_allocated       = &tcp_memory_allocated,
2615         .memory_pressure        = &tcp_memory_pressure,
2616         .sysctl_mem             = sysctl_tcp_mem,
2617         .sysctl_wmem            = sysctl_tcp_wmem,
2618         .sysctl_rmem            = sysctl_tcp_rmem,
2619         .max_header             = MAX_TCP_HEADER,
2620         .obj_size               = sizeof(struct tcp_sock),
2621 };
2622
2623
2624
2625 void __init tcp_v4_init(struct net_proto_family *ops)
2626 {
2627         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2628         if (err < 0)
2629                 panic("Failed to create the TCP control socket.\n");
2630         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2631         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2632
2633         /* Unhash it so that IP input processing does not even
2634          * see it, we do not wish this socket to see incoming
2635          * packets.
2636          */
2637         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2638 }
2639
/* Symbols made available outside this file, grouped by name prefix. */

/* general TCP symbols */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);

/* tcp_v4_* functions */
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* sysctl_* variables */
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);

/* proc registration helpers, present only with CONFIG_PROC_FS */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
2666