Linux 2.2.0
[davej-history.git] / net / ipv4 / tcp_ipv4.c
blob 660e64c44ffdb1a968d57891172cae8f6a4fbf35
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.164 1999/01/04 20:36:55 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/init.h>
55 #include <linux/ipsec.h>
57 #include <net/icmp.h>
58 #include <net/tcp.h>
59 #include <net/ipv6.h>
61 #include <asm/segment.h>
63 #include <linux/inet.h>
64 #include <linux/stddef.h>
66 extern int sysctl_tcp_timestamps;
67 extern int sysctl_tcp_window_scaling;
68 extern int sysctl_tcp_sack;
69 extern int sysctl_tcp_syncookies;
70 extern int sysctl_ip_dynaddr;
71 extern __u32 sysctl_wmem_max;
72 extern __u32 sysctl_rmem_max;
74 /* Check TCP sequence numbers in ICMP packets. */
75 #define ICMP_MIN_LENGTH 8
77 /* Socket used for sending RSTs */
78 struct inode tcp_inode;
79 struct socket *tcp_socket=&tcp_inode.u.socket_i;
81 static void tcp_v4_send_reset(struct sk_buff *skb);
83 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
84 struct sk_buff *skb);
86 /* This is for sockets with full identity only. Sockets here will always
87 * be without wildcards and will have the following invariant:
88 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
90 * First half of the table is for sockets not in TIME_WAIT, second half
91 * is for TIME_WAIT sockets only.
93 struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
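/* (A lookup therefore probes bucket `hash' for live sockets and bucket
 * `hash + (TCP_HTABLE_SIZE/2)' for TIME_WAIT sockets, which is exactly
 * what __tcp_v4_lookup() below does.)
 */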
95 /* Ok, let's try this, I give up, we do need a local binding
96 * TCP hash as well as the others for fast bind/connect.
98 struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
100 /* All sockets in TCP_LISTEN state will be in here. This is the only table
101 * where wildcard'd TCP sockets can exist. Hash function here is just local
102 * port number.
104 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
106 /* Register cache. */
107 struct sock *tcp_regs[TCP_NUM_REGS];
110 * This array holds the first and last local port number.
111 * For high-usage systems, use sysctl to change this to
112 * 32768-61000
114 int sysctl_local_port_range[2] = { 1024, 4999 };
115 int tcp_port_rover = (1024 - 1);
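/* (These two are the backing store for the ip_local_port_range sysctl;
 * in stock kernels of this era that should be reachable as
 * /proc/sys/net/ipv4/ip_local_port_range, an assumption from the 2.2
 * sysctl tables that has not been verified against this particular tree.)
 */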
117 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
118 __u32 faddr, __u16 fport)
120 return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
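#if 0	/* Illustrative sketch only, not part of the original file: a worked
	 * example of the fold above with made-up addresses and ports. It only
	 * assumes TCP_HTABLE_SIZE is a power of two, which the mask arithmetic
	 * already requires.
	 */
static int tcp_hashfn_example(void)
{
	__u32 laddr = 0xc0a80001;	/* 192.168.0.1, host order, arithmetic only */
	__u16 lport = 80;
	__u32 faddr = 0xc0a80002;	/* 192.168.0.2 */
	__u16 fport = 1025;

	/* Same expression as tcp_hashfn(): XOR the 4-tuple together and mask
	 * the result into the first (non-TIME_WAIT) half of the table.
	 */
	return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
}
#endif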
123 static __inline__ int tcp_sk_hashfn(struct sock *sk)
125 __u32 laddr = sk->rcv_saddr;
126 __u16 lport = sk->num;
127 __u32 faddr = sk->daddr;
128 __u16 fport = sk->dport;
130 return tcp_hashfn(laddr, lport, faddr, fport);
133 /* Invariant, sk->num is non-zero. */
134 void tcp_bucket_unlock(struct sock *sk)
136 struct tcp_bind_bucket *tb;
137 unsigned short snum = sk->num;
139 SOCKHASH_LOCK();
140 for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
141 if(tb->port == snum) {
142 if(tb->owners == NULL &&
143 (tb->flags & TCPB_FLAG_LOCKED)) {
144 tb->flags &= ~(TCPB_FLAG_LOCKED |
145 TCPB_FLAG_FASTREUSE);
146 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
148 break;
151 SOCKHASH_UNLOCK();
154 struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
156 struct tcp_bind_bucket *tb;
158 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
159 if(tb != NULL) {
160 struct tcp_bind_bucket **head =
161 &tcp_bound_hash[tcp_bhashfn(snum)];
162 tb->port = snum;
163 tb->flags = TCPB_FLAG_LOCKED;
164 tb->owners = NULL;
165 if((tb->next = *head) != NULL)
166 tb->next->pprev = &tb->next;
167 *head = tb;
168 tb->pprev = head;
170 return tb;
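/* (The next/pprev pair is the usual head-insert hash chain idiom: pprev
 * points at whatever pointer currently points at this node, either the
 * chain head or the previous node's ->next, so unlinking never needs to
 * know whether the node is first in its chain. The socket hashes below
 * use the same idiom.)
 */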
173 #ifdef CONFIG_IP_TRANSPARENT_PROXY
174 /* Ensure that the bound bucket for the port exists.
175 * Return 0 on success.
177 static __inline__ int tcp_bucket_check(unsigned short snum)
179 struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(snum)];
180 for( ; (tb && (tb->port != snum)); tb = tb->next)
182 if(tb == NULL && tcp_bucket_create(snum) == NULL)
183 return 1;
184 else
185 return 0;
187 #endif
189 static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
191 struct tcp_bind_bucket *tb;
192 int result = 0;
194 SOCKHASH_LOCK();
195 for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
196 (tb && (tb->port != snum));
197 tb = tb->next)
199 if(tb && tb->owners) {
200 /* Fast path for reuse ports, see include/net/tcp.h for a very
201 * detailed description of why this works, and why it is worth
202 * the effort at all. -DaveM
204 if((tb->flags & TCPB_FLAG_FASTREUSE) &&
205 (sk->reuse != 0)) {
206 goto go_like_smoke;
207 } else {
208 struct sock *sk2;
209 int sk_reuse = sk->reuse;
211 /* We must walk the whole port owner list in this case. -DaveM */
212 for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
213 if (sk->bound_dev_if == sk2->bound_dev_if) {
214 if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
215 if(!sk2->rcv_saddr ||
216 !sk->rcv_saddr ||
217 (sk2->rcv_saddr == sk->rcv_saddr))
218 break;
222 if(sk2 != NULL)
223 result = 1;
226 if(result == 0) {
227 if(tb == NULL) {
228 if((tb = tcp_bucket_create(snum)) == NULL)
229 result = 1;
230 else if (sk->reuse && sk->state != TCP_LISTEN)
231 tb->flags |= TCPB_FLAG_FASTREUSE;
232 } else {
233 /* It could be pending garbage collection, this
234 * kills the race and prevents it from disappearing
235 * out from under us by the time we use it. -DaveM
237 if(tb->owners == NULL) {
238 if (!(tb->flags & TCPB_FLAG_LOCKED)) {
239 tb->flags = (TCPB_FLAG_LOCKED |
240 ((sk->reuse &&
241 sk->state != TCP_LISTEN) ?
242 TCPB_FLAG_FASTREUSE : 0));
243 tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
244 } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) {
245 /* Someone is in between the bind
246 * and the actual connect or listen.
247 * See if it was a legitimate reuse
248 * and we are as well, else punt.
250 if (sk->reuse == 0 ||
251 !(tb->flags & TCPB_FLAG_FASTREUSE))
252 result = 1;
253 } else
254 tb->flags &= ~TCPB_FLAG_GOODSOCKNUM;
258 go_like_smoke:
259 SOCKHASH_UNLOCK();
260 return result;
263 unsigned short tcp_good_socknum(void)
265 struct tcp_bind_bucket *tb;
266 int low = sysctl_local_port_range[0];
267 int high = sysctl_local_port_range[1];
268 int remaining = (high - low) + 1;
269 int rover;
271 SOCKHASH_LOCK();
272 rover = tcp_port_rover;
273 do {
274 rover += 1;
275 if((rover < low) || (rover > high))
276 rover = low;
277 tb = tcp_bound_hash[tcp_bhashfn(rover)];
278 for( ; tb; tb = tb->next) {
279 if(tb->port == rover)
280 goto next;
282 break;
283 next:
284 } while(--remaining > 0);
285 tcp_port_rover = rover;
286 tb = NULL;
287 if((remaining <= 0) || ((tb = tcp_bucket_create(rover)) == NULL))
288 rover = 0;
289 if (tb != NULL)
290 tb->flags |= TCPB_FLAG_GOODSOCKNUM;
291 SOCKHASH_UNLOCK();
293 return rover;
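/* (tcp_port_rover persists across calls so each search resumes just past
 * the last port handed out rather than rescanning from
 * sysctl_local_port_range[0] every time; `remaining' bounds the walk to
 * one full pass over the configured range.)
 */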
296 static void tcp_v4_hash(struct sock *sk)
298 if (sk->state != TCP_CLOSE) {
299 struct sock **skp;
301 SOCKHASH_LOCK();
302 skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
303 if((sk->next = *skp) != NULL)
304 (*skp)->pprev = &sk->next;
305 *skp = sk;
306 sk->pprev = skp;
307 tcp_sk_bindify(sk);
308 SOCKHASH_UNLOCK();
312 static void tcp_v4_unhash(struct sock *sk)
314 SOCKHASH_LOCK();
315 if(sk->pprev) {
316 if(sk->next)
317 sk->next->pprev = sk->pprev;
318 *sk->pprev = sk->next;
319 sk->pprev = NULL;
320 tcp_reg_zap(sk);
321 tcp_sk_unbindify(sk);
323 SOCKHASH_UNLOCK();
326 static void tcp_v4_rehash(struct sock *sk)
328 unsigned char state;
330 SOCKHASH_LOCK();
331 state = sk->state;
332 if(sk->pprev != NULL) {
333 if(sk->next)
334 sk->next->pprev = sk->pprev;
335 *sk->pprev = sk->next;
336 sk->pprev = NULL;
337 tcp_reg_zap(sk);
339 if(state != TCP_CLOSE) {
340 struct sock **skp;
342 if(state == TCP_LISTEN)
343 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
344 else
345 skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
347 if((sk->next = *skp) != NULL)
348 (*skp)->pprev = &sk->next;
349 *skp = sk;
350 sk->pprev = skp;
351 if(state == TCP_LISTEN)
352 tcp_sk_bindify(sk);
354 SOCKHASH_UNLOCK();
357 /* Don't inline this cruft. There are some nice properties to
358 * exploit here. The BSD API does not allow a listening TCP
359 * to specify the remote port nor the remote address for the
360 * connection. So always assume those are both wildcarded
361 * during the search since they can never be otherwise.
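/* (The loop below scores each candidate listener: one point for the
 * matching local port, plus one if its bound local address matches daddr,
 * plus one if its bound device matches dif. A score of 3 is a fully
 * specified match and ends the search immediately; otherwise the
 * highest-scoring, i.e. least wildcarded, listener wins.)
 */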
363 static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
365 struct sock *sk;
366 struct sock *result = NULL;
367 int score, hiscore;
369 hiscore=0;
370 for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
371 if(sk->num == hnum) {
372 __u32 rcv_saddr = sk->rcv_saddr;
374 score = 1;
375 if(rcv_saddr) {
376 if (rcv_saddr != daddr)
377 continue;
378 score++;
380 if (sk->bound_dev_if) {
381 if (sk->bound_dev_if != dif)
382 continue;
383 score++;
385 if (score == 3)
386 return sk;
387 if (score > hiscore) {
388 hiscore = score;
389 result = sk;
393 return result;
396 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
397 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
398 * It is assumed that this code only gets called from within NET_BH.
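/* (Lookup order below: first the per-source-port register cache
 * TCP_RHASH, then the established half of tcp_established_hash, then the
 * TIME_WAIT half at hash + (TCP_HTABLE_SIZE/2), and finally the listener
 * table.)
 */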
400 static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
401 u32 saddr, u16 sport,
402 u32 daddr, u16 dport, int dif)
404 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
405 __u16 hnum = ntohs(dport);
406 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
407 struct sock *sk;
408 int hash;
410 /* Check TCP register quick cache first. */
411 sk = TCP_RHASH(sport);
412 if(sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
413 goto hit;
415 /* Optimize here for direct hit, only listening connections can
416 * have wildcards anyways.
418 hash = tcp_hashfn(daddr, hnum, saddr, sport);
419 for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
420 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
421 if (sk->state == TCP_ESTABLISHED)
422 TCP_RHASH(sport) = sk;
423 goto hit; /* You sunk my battleship! */
426 /* Must check for a TIME_WAIT'er before going to listener hash. */
427 for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
428 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
429 goto hit;
430 sk = tcp_v4_lookup_listener(daddr, hnum, dif);
431 hit:
432 return sk;
435 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
437 return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif);
440 #ifdef CONFIG_IP_TRANSPARENT_PROXY
441 /* Cleaned up a little and adapted to new bind bucket scheme.
442 * Oddly, this should increase performance here for
443 * transparent proxy, as tests within the inner loop have
444 * been eliminated. -DaveM
446 static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
447 unsigned short rnum, unsigned long laddr,
448 struct device *dev, unsigned short pnum,
449 int dif)
451 struct sock *s, *result = NULL;
452 int badness = -1;
453 u32 paddr = 0;
454 unsigned short hnum = ntohs(num);
455 unsigned short hpnum = ntohs(pnum);
456 int firstpass = 1;
458 if(dev && dev->ip_ptr) {
459 struct in_device *idev = dev->ip_ptr;
461 if(idev->ifa_list)
462 paddr = idev->ifa_list->ifa_local;
465 /* This code must run only from NET_BH. */
467 struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)];
468 for( ; (tb && tb->port != hnum); tb = tb->next)
470 if(tb == NULL)
471 goto next;
472 s = tb->owners;
474 pass2:
475 for(; s; s = s->bind_next) {
476 int score = 0;
477 if(s->rcv_saddr) {
478 if((s->num != hpnum || s->rcv_saddr != paddr) &&
479 (s->num != hnum || s->rcv_saddr != laddr))
480 continue;
481 score++;
483 if(s->daddr) {
484 if(s->daddr != raddr)
485 continue;
486 score++;
488 if(s->dport) {
489 if(s->dport != rnum)
490 continue;
491 score++;
493 if(s->bound_dev_if) {
494 if(s->bound_dev_if != dif)
495 continue;
496 score++;
498 if(score == 4 && s->num == hnum) {
499 result = s;
500 goto gotit;
501 } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
502 result = s;
503 badness = score;
506 next:
507 if(firstpass--) {
508 struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)];
509 for( ; (tb && tb->port != hpnum); tb = tb->next)
511 if(tb) {
512 s = tb->owners;
513 goto pass2;
516 gotit:
517 return result;
519 #endif /* CONFIG_IP_TRANSPARENT_PROXY */
521 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
523 return secure_tcp_sequence_number(sk->saddr, sk->daddr,
524 skb->h.th->dest,
525 skb->h.th->source);
528 /* Check that a TCP address is unique, don't allow multiple
529 * connects to/from the same address. Actually we can optimize
530 * quite a bit, since the socket about to connect is still
531 * in TCP_CLOSE, a tcp_bind_bucket for the local port he will
532 * use will exist, with a NULL owners list. So check for that.
533 * The good_socknum and verify_bind scheme we use makes this
534 * work.
536 static int tcp_v4_unique_address(struct sock *sk)
538 struct tcp_bind_bucket *tb;
539 unsigned short snum = sk->num;
540 int retval = 1;
542 /* Freeze the hash while we snoop around. */
543 SOCKHASH_LOCK();
544 tb = tcp_bound_hash[tcp_bhashfn(snum)];
545 for(; tb; tb = tb->next) {
546 if(tb->port == snum && tb->owners != NULL) {
547 /* Almost certainly the re-use port case, search the real hashes
548 * so it actually scales.
550 sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport,
551 sk->rcv_saddr, snum, sk->bound_dev_if);
552 if((sk != NULL) && (sk->state != TCP_LISTEN))
553 retval = 0;
554 break;
557 SOCKHASH_UNLOCK();
558 return retval;
561 /* This will initiate an outgoing connection. */
562 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
564 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
565 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
566 struct sk_buff *buff;
567 struct rtable *rt;
568 u32 daddr, nexthop;
569 int tmp;
571 if (sk->state != TCP_CLOSE)
572 return(-EISCONN);
574 /* Don't allow a double connect. */
575 if (sk->daddr)
576 return -EINVAL;
578 if (addr_len < sizeof(struct sockaddr_in))
579 return(-EINVAL);
581 if (usin->sin_family != AF_INET) {
582 static int complained;
583 if (usin->sin_family)
584 return(-EAFNOSUPPORT);
585 if (!complained++)
586 printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
589 nexthop = daddr = usin->sin_addr.s_addr;
590 if (sk->opt && sk->opt->srr) {
591 if (daddr == 0)
592 return -EINVAL;
593 nexthop = sk->opt->faddr;
596 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
597 RT_TOS(sk->ip_tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
598 if (tmp < 0)
599 return tmp;
601 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
602 ip_rt_put(rt);
603 return -ENETUNREACH;
606 dst_release(xchg(&sk->dst_cache, rt));
608 buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
609 0, GFP_KERNEL);
611 if (buff == NULL)
612 return -ENOBUFS;
614 /* Socket has no identity, so lock_sock() is useless. Also
615 * since state==TCP_CLOSE (checked above) the socket cannot
616 * possibly be in the hashes. TCP hash locking is only
617 * needed while checking quickly for a unique address.
618 * However, the socket does need to be (and is) locked
619 * in tcp_connect().
620 * Perhaps this addresses all of ANK's concerns. 8-) -DaveM
622 sk->dport = usin->sin_port;
623 sk->daddr = rt->rt_dst;
624 if (sk->opt && sk->opt->srr)
625 sk->daddr = daddr;
626 if (!sk->saddr)
627 sk->saddr = rt->rt_src;
628 sk->rcv_saddr = sk->saddr;
630 if (!tcp_v4_unique_address(sk)) {
631 kfree_skb(buff);
632 return -EADDRNOTAVAIL;
635 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
636 sk->sport, usin->sin_port);
638 tp->ext_header_len = 0;
639 if (sk->opt)
640 tp->ext_header_len = sk->opt->optlen;
642 /* Reset mss clamp */
643 tp->mss_clamp = ~0;
645 if (!ip_dont_fragment(sk, &rt->u.dst) &&
646 rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
647 /* Clamp mss at the maximum of 536 and user_mss.
648 Probably the user wants to override the tiny segment size
649 in the gatewayed case.
651 tp->mss_clamp = max(tp->user_mss, 536);
654 tcp_connect(sk, buff, rt->u.dst.pmtu);
655 return 0;
658 static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
660 struct tcp_opt *tp;
661 int retval = -EINVAL;
663 /* Do sanity checking for sendmsg/sendto/send. */
664 if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
665 goto out;
666 if (msg->msg_name) {
667 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
669 if (msg->msg_namelen < sizeof(*addr))
670 goto out;
671 if (addr->sin_family && addr->sin_family != AF_INET)
672 goto out;
673 retval = -ENOTCONN;
674 if(sk->state == TCP_CLOSE)
675 goto out;
676 retval = -EISCONN;
677 if (addr->sin_port != sk->dport)
678 goto out;
679 if (addr->sin_addr.s_addr != sk->daddr)
680 goto out;
683 lock_sock(sk);
684 retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov,
685 msg->msg_flags);
686 /* Push out partial tail frames if needed. */
687 tp = &(sk->tp_pinfo.af_tcp);
688 if(tp->send_head && tcp_snd_test(sk, tp->send_head))
689 tcp_write_xmit(sk);
690 release_sock(sk);
692 out:
693 return retval;
698 * Do a linear search in the socket open_request list.
699 * This should be replaced with a global hash table.
701 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
702 struct iphdr *iph,
703 struct tcphdr *th,
704 struct open_request **prevp)
706 struct open_request *req, *prev;
707 __u16 rport = th->source;
709 /* assumption: the socket is not in use.
710 * as we checked the user count on tcp_rcv and we're
711 * running from a soft interrupt.
713 prev = (struct open_request *) (&tp->syn_wait_queue);
714 for (req = prev->dl_next; req; req = req->dl_next) {
715 if (req->af.v4_req.rmt_addr == iph->saddr &&
716 req->af.v4_req.loc_addr == iph->daddr &&
717 req->rmt_port == rport
718 #ifdef CONFIG_IP_TRANSPARENT_PROXY
719 && req->lcl_port == th->dest
720 #endif
722 *prevp = prev;
723 return req;
725 prev = req;
727 return NULL;
732 * This routine does path mtu discovery as defined in RFC1191.
734 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
736 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
738 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
739 * sent out by Linux are always <576 bytes, so they should go through
740 * unfragmented).
742 if (sk->state == TCP_LISTEN)
743 return;
745 /* We don't check in the dst entry whether pmtu discovery is forbidden
746 * on this route. We just assume that no packet-too-big packets
747 * are sent back when pmtu discovery is not active.
748 * There is a small race when the user changes this flag in the
749 * route, but I think that's acceptable.
751 if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) {
752 if (tp->pmtu_cookie > sk->dst_cache->pmtu &&
753 !atomic_read(&sk->sock_readers)) {
754 lock_sock(sk);
755 tcp_sync_mss(sk, sk->dst_cache->pmtu);
757 /* Resend the TCP packet because it's
758 * clear that the old packet has been
759 * dropped. This is the new "fast" path mtu
760 * discovery.
762 tcp_simple_retransmit(sk);
763 release_sock(sk);
764 } /* else let the usual retransmit timer handle it */
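/* (tcp_sync_mss() rederives the MSS from the shrunken path MTU,
 * essentially pmtu minus the IP and TCP header sizes before option
 * overhead, e.g. a 1500 byte path MTU gives an MSS near 1460; the mss
 * computation in tcp_v4_send_synack() below uses the same arithmetic.)
 */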
769 * This routine is called by the ICMP module when it gets some
770 * sort of error condition. If err < 0 then the socket should
771 * be closed and the error returned to the user. If err > 0
772 * it's just the icmp type << 8 | icmp code. After adjustment
773 * header points to the first 8 bytes of the tcp header. We need
774 * to find the appropriate port.
776 * The locking strategy used here is very "optimistic". When
777 * someone else accesses the socket the ICMP is just dropped
778 * and for some paths there is no check at all.
779 * A more general error queue to queue errors for later handling
780 * is probably better.
782 * sk->err and sk->err_soft should be atomic_t.
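/* (Example of the err > 0 encoding: a destination-unreachable/
 * port-unreachable ICMP arrives as type ICMP_DEST_UNREACH (3), code
 * ICMP_PORT_UNREACH (3); the switch below maps the code through
 * icmp_err_convert[], which turns port-unreachable into ECONNREFUSED.)
 */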
785 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
787 struct iphdr *iph = (struct iphdr*)dp;
788 struct tcphdr *th;
789 struct tcp_opt *tp;
790 int type = skb->h.icmph->type;
791 int code = skb->h.icmph->code;
792 struct sock *sk;
793 __u32 seq;
794 int err;
796 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
797 icmp_statistics.IcmpInErrors++;
798 return;
801 th = (struct tcphdr*)(dp+(iph->ihl<<2));
803 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
804 if (sk == NULL || sk->state == TCP_TIME_WAIT) {
805 icmp_statistics.IcmpInErrors++;
806 return;
809 tp = &sk->tp_pinfo.af_tcp;
810 seq = ntohl(th->seq);
811 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
812 net_statistics.OutOfWindowIcmps++;
813 return;
816 switch (type) {
817 case ICMP_SOURCE_QUENCH:
818 #ifndef OLD_SOURCE_QUENCH /* This is deprecated */
819 tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
820 tp->snd_cwnd = tp->snd_ssthresh;
821 tp->snd_cwnd_cnt = 0;
822 tp->high_seq = tp->snd_nxt;
823 #endif
824 return;
825 case ICMP_PARAMETERPROB:
826 err = EPROTO;
827 break;
828 case ICMP_DEST_UNREACH:
829 if (code > NR_ICMP_UNREACH)
830 return;
832 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
833 do_pmtu_discovery(sk, iph);
834 return;
837 err = icmp_err_convert[code].errno;
838 break;
839 case ICMP_TIME_EXCEEDED:
840 err = EHOSTUNREACH;
841 break;
842 default:
843 return;
846 switch (sk->state) {
847 struct open_request *req, *prev;
848 case TCP_LISTEN:
849 /* Prevent race conditions with accept() -
850 * ICMP is unreliable.
852 if (atomic_read(&sk->sock_readers)) {
853 net_statistics.LockDroppedIcmps++;
854 /* If too many ICMPs get dropped on busy
855 * servers this needs to be solved differently.
857 return;
860 /* The final ACK of the handshake should be already
861 * handled in the new socket context, not here.
862 * Strictly speaking - an ICMP error for the final
863 * ACK should set the opening flag, but that is too
864 * complicated right now.
866 if (!th->syn && !th->ack)
867 return;
869 req = tcp_v4_search_req(tp, iph, th, &prev);
870 if (!req)
871 return;
872 if (seq != req->snt_isn) {
873 net_statistics.OutOfWindowIcmps++;
874 return;
876 if (req->sk) {
878 * Already in ESTABLISHED and a big socket has been created;
879 * set the error code there.
880 * The error will _not_ be reported in the accept(),
881 * but only with the next operation on the socket after
882 * accept.
884 sk = req->sk;
885 } else {
887 * Still in SYN_RECV, just remove it silently.
888 * There is no good way to pass the error to the newly
889 * created socket, and POSIX does not want network
890 * errors returned from accept().
892 tp->syn_backlog--;
893 tcp_synq_unlink(tp, req, prev);
894 req->class->destructor(req);
895 tcp_openreq_free(req);
896 return;
898 break;
899 case TCP_SYN_SENT:
900 case TCP_SYN_RECV: /* Cannot happen */
901 if (!th->syn)
902 return;
903 tcp_statistics.TcpAttemptFails++;
904 sk->err = err;
905 sk->zapped = 1;
906 mb();
907 sk->error_report(sk);
908 return;
911 /* If we've already connected we will keep trying
912 * until we time out, or the user gives up.
914 * RFC 1122 4.2.3.9 allows us to consider as hard errors
915 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
916 * but it is obsoleted by pmtu discovery).
918 * Note that in the modern internet, where routing is unreliable
919 * and broken firewalls sit in every dark corner sending random
920 * errors ordered by their masters, even these two messages finally lose
921 * their original sense (even Linux sends invalid PORT_UNREACHs).
923 * Now we are in compliance with RFCs.
924 * --ANK (980905)
927 if (sk->ip_recverr) {
928 /* This code isn't serialized with the socket code */
929 /* ANK (980927) ... which is harmless now,
930 sk->err's may be safely lost.
932 sk->err = err;
933 mb();
934 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
935 } else { /* Only an error on timeout */
936 sk->err_soft = err;
937 mb();
941 /* This routine computes an IPv4 TCP checksum. */
942 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
943 struct sk_buff *skb)
945 th->check = 0;
946 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
947 csum_partial((char *)th, th->doff<<2, skb->csum));
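/* (tcp_v4_check() folds in the standard TCP pseudo-header, that is the
 * source address, destination address, protocol and TCP length, on top of
 * the partial sum over the TCP header and the payload checksum already
 * accumulated in skb->csum.)
 */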
951 * This routine will send an RST to the other tcp.
953 * Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
954 * for the reset?
955 * Answer: if a packet caused an RST, it is not for a socket
956 * existing in our system; if it is matched to a socket,
957 * it is just a duplicate segment or a bug in the other side's TCP.
958 * So we build the reply based only on the parameters
959 * that arrived with the segment.
960 * Exception: precedence violation. We do not implement it in any case.
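/* (The sequence numbers chosen below follow the RFC 793 reset rules: if
 * the offending segment carried an ACK, the RST uses that ack_seq as its
 * own sequence number; otherwise the RST carries an ACK of the segment's
 * sequence number, plus one if SYN was set, since a SYN occupies one unit
 * of sequence space.)
 */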
963 static void tcp_v4_send_reset(struct sk_buff *skb)
965 struct tcphdr *th = skb->h.th;
966 struct tcphdr rth;
967 struct ip_reply_arg arg;
969 /* Never send a reset in response to a reset. */
970 if (th->rst)
971 return;
973 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) {
974 #ifdef CONFIG_IP_TRANSPARENT_PROXY
975 if (((struct rtable*)skb->dst)->rt_type == RTN_UNICAST)
976 icmp_send(skb, ICMP_DEST_UNREACH,
977 ICMP_PORT_UNREACH, 0);
978 #endif
979 return;
982 /* Swap the send and the receive. */
983 memset(&rth, 0, sizeof(struct tcphdr));
984 rth.dest = th->source;
985 rth.source = th->dest;
986 rth.doff = sizeof(struct tcphdr)/4;
987 rth.rst = 1;
989 if (th->ack) {
990 rth.seq = th->ack_seq;
991 } else {
992 rth.ack = 1;
993 rth.ack_seq = th->syn ? htonl(ntohl(th->seq)+1) : th->seq;
996 memset(&arg, 0, sizeof arg);
997 arg.iov[0].iov_base = (unsigned char *)&rth;
998 arg.iov[0].iov_len = sizeof rth;
999 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1000 skb->nh.iph->saddr, /*XXX*/
1001 sizeof(struct tcphdr),
1002 IPPROTO_TCP,
1003 0);
1004 arg.n_iov = 1;
1005 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1007 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1009 tcp_statistics.TcpOutSegs++;
1010 tcp_statistics.TcpOutRsts++;
1013 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1016 It seems I never wrote anything more stupid.
1017 I hope the Gods will forgive me, but I cannot forgive myself 8)
1018 --ANK (981001)
1021 static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb)
1023 struct iphdr *iph = skb->nh.iph;
1024 struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1025 struct sock *sk;
1026 int i;
1028 for (i=0; i<TCP_LHTABLE_SIZE; i++) {
1029 for(sk = tcp_listening_hash[i]; sk; sk = sk->next) {
1030 struct open_request *dummy;
1031 if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, iph,
1032 th, &dummy) &&
1033 (!sk->bound_dev_if ||
1034 sk->bound_dev_if == skb->dev->ifindex))
1035 return sk;
1038 return NULL;
1042 * Check whether a received TCP packet might be for one of our
1043 * connections.
1046 int tcp_chkaddr(struct sk_buff *skb)
1048 struct iphdr *iph = skb->nh.iph;
1049 struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1050 struct sock *sk;
1052 sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr,
1053 th->dest, skb->dev->ifindex);
1055 if (!sk)
1056 return tcp_v4_search_proxy_openreq(skb) != NULL;
1058 if (sk->state == TCP_LISTEN) {
1059 struct open_request *dummy;
1060 if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, skb->nh.iph,
1061 th, &dummy) &&
1062 (!sk->bound_dev_if ||
1063 sk->bound_dev_if == skb->dev->ifindex))
1064 return 1;
1067 /* 0 means accept all LOCAL addresses here, not all the world... */
1069 if (sk->rcv_saddr == 0)
1070 return 0;
1072 return 1;
1074 #endif
1077 * Send a SYN-ACK after having received a SYN.
1078 * This still operates on an open_request only, not on a big
1079 * socket.
1081 static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
1083 struct rtable *rt;
1084 struct ip_options *opt;
1085 struct sk_buff * skb;
1086 int mss;
1088 /* First, grab a route. */
1089 opt = req->af.v4_req.opt;
1090 if(ip_route_output(&rt, ((opt && opt->srr) ?
1091 opt->faddr :
1092 req->af.v4_req.rmt_addr),
1093 req->af.v4_req.loc_addr,
1094 RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
1095 sk->bound_dev_if)) {
1096 ip_statistics.IpOutNoRoutes++;
1097 return;
1099 if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1100 ip_rt_put(rt);
1101 ip_statistics.IpOutNoRoutes++;
1102 return;
1105 mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
1107 skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
1108 if (skb) {
1109 struct tcphdr *th = skb->h.th;
1111 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1112 th->source = req->lcl_port; /* LVE */
1113 #endif
1115 th->check = tcp_v4_check(th, skb->len,
1116 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1117 csum_partial((char *)th, skb->len, skb->csum));
1119 ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1120 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1122 ip_rt_put(rt);
1126 * IPv4 open_request destructor.
1128 static void tcp_v4_or_free(struct open_request *req)
1130 if(!req->sk && req->af.v4_req.opt)
1131 kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
1134 static inline void syn_flood_warning(struct sk_buff *skb)
1136 static unsigned long warntime;
1138 if (jiffies - warntime > HZ*60) {
1139 warntime = jiffies;
1140 printk(KERN_INFO
1141 "possible SYN flooding on port %d. Sending cookies.\n",
1142 ntohs(skb->h.th->dest));
1147 * Save and compile IPv4 options into the open_request if needed.
1149 static inline struct ip_options *
1150 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1152 struct ip_options *opt = &(IPCB(skb)->opt);
1153 struct ip_options *dopt = NULL;
1155 if (opt && opt->optlen) {
1156 int opt_size = optlength(opt);
1157 dopt = kmalloc(opt_size, GFP_ATOMIC);
1158 if (dopt) {
1159 if (ip_options_echo(dopt, skb)) {
1160 kfree_s(dopt, opt_size);
1161 dopt = NULL;
1165 return dopt;
1169 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1170 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1171 * It would be better to replace it with a global counter for all sockets
1172 * but then some measure against one socket starving all other sockets
1173 * would be needed.
1175 int sysctl_max_syn_backlog = 128;
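/* (This is the value behind the tcp_max_syn_backlog sysctl; in stock 2.2
 * kernels it should appear as /proc/sys/net/ipv4/tcp_max_syn_backlog, an
 * assumption not verified against this particular tree. When the backlog
 * is exceeded and syncookies are compiled in and enabled,
 * tcp_v4_conn_request() below falls back to cookies instead of dropping
 * the SYN.)
 */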
1177 struct or_calltable or_ipv4 = {
1178 tcp_v4_send_synack,
1179 tcp_v4_or_free,
1180 tcp_v4_send_reset
1183 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1184 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
1186 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
1188 struct tcp_opt tp;
1189 struct open_request *req;
1190 struct tcphdr *th = skb->h.th;
1191 __u32 saddr = skb->nh.iph->saddr;
1192 __u32 daddr = skb->nh.iph->daddr;
1193 #ifdef CONFIG_SYN_COOKIES
1194 int want_cookie = 0;
1195 #else
1196 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1197 #endif
1199 /* If the socket is dead, don't accept the connection. */
1200 if (sk->dead)
1201 goto dead;
1203 /* Never answer SYNs sent to broadcast or multicast */
1204 if (((struct rtable *)skb->dst)->rt_flags &
1205 (RTCF_BROADCAST|RTCF_MULTICAST))
1206 goto drop;
1208 /* XXX: Check against a global syn pool counter. */
1209 if (BACKLOG(sk) > BACKLOGMAX(sk)) {
1210 #ifdef CONFIG_SYN_COOKIES
1211 if (sysctl_tcp_syncookies) {
1212 syn_flood_warning(skb);
1213 want_cookie = 1;
1214 } else
1215 #endif
1216 goto drop;
1217 } else {
1218 if (isn == 0)
1219 isn = tcp_v4_init_sequence(sk, skb);
1220 BACKLOG(sk)++;
1223 req = tcp_openreq_alloc();
1224 if (req == NULL) {
1225 goto dropbacklog;
1228 req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
1230 req->rcv_isn = TCP_SKB_CB(skb)->seq;
1231 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1233 tp.mss_clamp = 65535;
1234 tcp_parse_options(NULL, th, &tp, want_cookie);
1235 if (tp.mss_clamp == 65535)
1236 tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
1238 if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
1239 tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
1240 req->mss = tp.mss_clamp;
1242 if (tp.saw_tstamp)
1243 req->ts_recent = tp.rcv_tsval;
1244 req->tstamp_ok = tp.tstamp_ok;
1245 req->sack_ok = tp.sack_ok;
1246 req->snd_wscale = tp.snd_wscale;
1247 req->wscale_ok = tp.wscale_ok;
1248 req->rmt_port = th->source;
1249 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1250 req->lcl_port = th->dest ; /* LVE */
1251 #endif
1252 req->af.v4_req.loc_addr = daddr;
1253 req->af.v4_req.rmt_addr = saddr;
1255 /* Note that we ignore the isn passed from the TIME_WAIT
1256 * state here. That's the price we pay for cookies.
1258 if (want_cookie)
1259 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1261 req->snt_isn = isn;
1263 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1265 req->class = &or_ipv4;
1266 req->retrans = 0;
1267 req->sk = NULL;
1269 tcp_v4_send_synack(sk, req);
1271 if (want_cookie) {
1272 if (req->af.v4_req.opt)
1273 kfree(req->af.v4_req.opt);
1274 tcp_v4_or_free(req);
1275 tcp_openreq_free(req);
1276 } else {
1277 req->expires = jiffies + TCP_TIMEOUT_INIT;
1278 tcp_inc_slow_timer(TCP_SLT_SYNACK);
1279 tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
1282 return 0;
1284 dead:
1285 SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk);
1286 tcp_statistics.TcpAttemptFails++;
1287 return -ENOTCONN; /* send reset */
1289 dropbacklog:
1290 if (!want_cookie)
1291 BACKLOG(sk)--;
1292 drop:
1293 tcp_statistics.TcpAttemptFails++;
1294 return 0;
1297 /* This is not only more efficient than what we used to do, it eliminates
1298 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
1300 * This function wants to be moved to a common for IPv[46] file. --ANK
1302 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
1304 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
1306 if(newsk != NULL) {
1307 struct tcp_opt *newtp;
1309 memcpy(newsk, sk, sizeof(*newsk));
1310 newsk->sklist_next = NULL;
1311 newsk->state = TCP_SYN_RECV;
1313 /* Clone the TCP header template */
1314 newsk->dport = req->rmt_port;
1316 atomic_set(&newsk->sock_readers, 0);
1317 atomic_set(&newsk->rmem_alloc, 0);
1318 skb_queue_head_init(&newsk->receive_queue);
1319 atomic_set(&newsk->wmem_alloc, 0);
1320 skb_queue_head_init(&newsk->write_queue);
1321 atomic_set(&newsk->omem_alloc, 0);
1323 newsk->done = 0;
1324 newsk->proc = 0;
1325 newsk->pair = NULL;
1326 skb_queue_head_init(&newsk->back_log);
1327 skb_queue_head_init(&newsk->error_queue);
1329 /* Now setup tcp_opt */
1330 newtp = &(newsk->tp_pinfo.af_tcp);
1331 newtp->pred_flags = 0;
1332 newtp->rcv_nxt = req->rcv_isn + 1;
1333 newtp->snd_nxt = req->snt_isn + 1;
1334 newtp->snd_una = req->snt_isn + 1;
1335 newtp->srtt = 0;
1336 newtp->ato = 0;
1337 newtp->snd_wl1 = req->rcv_isn;
1338 newtp->snd_wl2 = req->snt_isn;
1340 /* RFC1323: The window in SYN & SYN/ACK segments
1341 * is never scaled.
1343 newtp->snd_wnd = ntohs(skb->h.th->window);
1345 newtp->max_window = newtp->snd_wnd;
1346 newtp->pending = 0;
1347 newtp->retransmits = 0;
1348 newtp->last_ack_sent = req->rcv_isn + 1;
1349 newtp->backoff = 0;
1350 newtp->mdev = TCP_TIMEOUT_INIT;
1351 newtp->snd_cwnd = 1;
1352 newtp->rto = TCP_TIMEOUT_INIT;
1353 newtp->packets_out = 0;
1354 newtp->fackets_out = 0;
1355 newtp->retrans_out = 0;
1356 newtp->high_seq = 0;
1357 newtp->snd_ssthresh = 0x7fffffff;
1358 newtp->snd_cwnd_cnt = 0;
1359 newtp->dup_acks = 0;
1360 newtp->delayed_acks = 0;
1361 init_timer(&newtp->retransmit_timer);
1362 newtp->retransmit_timer.function = &tcp_retransmit_timer;
1363 newtp->retransmit_timer.data = (unsigned long) newsk;
1364 init_timer(&newtp->delack_timer);
1365 newtp->delack_timer.function = &tcp_delack_timer;
1366 newtp->delack_timer.data = (unsigned long) newsk;
1367 skb_queue_head_init(&newtp->out_of_order_queue);
1368 newtp->send_head = newtp->retrans_head = NULL;
1369 newtp->rcv_wup = req->rcv_isn + 1;
1370 newtp->write_seq = req->snt_isn + 1;
1371 newtp->copied_seq = req->rcv_isn + 1;
1373 newtp->saw_tstamp = 0;
1374 newtp->mss_clamp = req->mss;
1376 init_timer(&newtp->probe_timer);
1377 newtp->probe_timer.function = &tcp_probe_timer;
1378 newtp->probe_timer.data = (unsigned long) newsk;
1379 newtp->probes_out = 0;
1380 newtp->syn_seq = req->rcv_isn;
1381 newtp->fin_seq = req->rcv_isn;
1382 newtp->urg_data = 0;
1383 tcp_synq_init(newtp);
1384 newtp->syn_backlog = 0;
1385 if (skb->len >= 536)
1386 newtp->last_seg_size = skb->len;
1388 /* Back to base struct sock members. */
1389 newsk->err = 0;
1390 newsk->ack_backlog = 0;
1391 newsk->max_ack_backlog = SOMAXCONN;
1392 newsk->priority = 0;
1394 /* IP layer stuff */
1395 newsk->timeout = 0;
1396 init_timer(&newsk->timer);
1397 newsk->timer.function = &net_timer;
1398 newsk->timer.data = (unsigned long) newsk;
1399 newsk->socket = NULL;
1401 newtp->tstamp_ok = req->tstamp_ok;
1402 if((newtp->sack_ok = req->sack_ok) != 0)
1403 newtp->num_sacks = 0;
1404 newtp->window_clamp = req->window_clamp;
1405 newtp->rcv_wnd = req->rcv_wnd;
1406 newtp->wscale_ok = req->wscale_ok;
1407 if (newtp->wscale_ok) {
1408 newtp->snd_wscale = req->snd_wscale;
1409 newtp->rcv_wscale = req->rcv_wscale;
1410 } else {
1411 newtp->snd_wscale = newtp->rcv_wscale = 0;
1412 newtp->window_clamp = min(newtp->window_clamp,65535);
1414 if (newtp->tstamp_ok) {
1415 newtp->ts_recent = req->ts_recent;
1416 newtp->ts_recent_stamp = jiffies;
1417 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
1418 } else {
1419 newtp->tcp_header_len = sizeof(struct tcphdr);
1422 return newsk;
1426 * The three way handshake has completed - we got a valid ACK -
1427 * now create the new socket.
1429 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1430 struct open_request *req,
1431 struct dst_entry *dst)
1433 struct ip_options *opt = req->af.v4_req.opt;
1434 struct tcp_opt *newtp;
1435 struct sock *newsk;
1437 if (sk->ack_backlog > sk->max_ack_backlog)
1438 goto exit; /* head drop */
1439 if (dst == NULL) {
1440 struct rtable *rt;
1442 if (ip_route_output(&rt,
1443 opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
1444 req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0))
1445 return NULL;
1446 dst = &rt->u.dst;
1448 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1449 /* The new socket created for transparent proxy may fall
1450 * into a nonexistent bind bucket because sk->num != newsk->num.
1451 * Ensure the existence of the bucket now. Placing the check
1452 * later would require destroying the just-created newsk on failure.
1453 * 1998/04/22 Andrey V. Savochkin <saw@msu.ru>
1455 if (tcp_bucket_check(ntohs(skb->h.th->dest)))
1456 goto exit;
1457 #endif
1459 newsk = tcp_create_openreq_child(sk, req, skb);
1460 if (!newsk)
1461 goto exit;
1463 sk->tp_pinfo.af_tcp.syn_backlog--;
1464 sk->ack_backlog++;
1466 newsk->dst_cache = dst;
1468 newtp = &(newsk->tp_pinfo.af_tcp);
1469 newsk->daddr = req->af.v4_req.rmt_addr;
1470 newsk->saddr = req->af.v4_req.loc_addr;
1471 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1472 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1473 newsk->num = ntohs(skb->h.th->dest);
1474 newsk->sport = req->lcl_port;
1475 #endif
1476 newsk->opt = req->af.v4_req.opt;
1477 newtp->ext_header_len = 0;
1478 if (newsk->opt)
1479 newtp->ext_header_len = newsk->opt->optlen;
1481 tcp_sync_mss(newsk, dst->pmtu);
1482 newtp->rcv_mss = newtp->mss_clamp;
1484 /* It would be better to use newtp->mss_clamp here */
1485 if (newsk->rcvbuf < (3 * newtp->pmtu_cookie))
1486 newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max);
1487 if (newsk->sndbuf < (3 * newtp->pmtu_cookie))
1488 newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max);
1490 tcp_v4_hash(newsk);
1491 add_to_prot_sklist(newsk);
1492 sk->data_ready(sk, 0); /* Deliver SIGIO */
1494 return newsk;
1496 exit:
1497 dst_release(dst);
1498 return NULL;
1501 static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
1503 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1504 struct open_request *req, *prev;
1506 req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev);
1507 if (!req)
1508 return;
1509 /* Sequence number check required by RFC793 */
1510 if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
1511 after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
1512 return;
1513 tcp_synq_unlink(tp, req, prev);
1514 (req->sk ? sk->ack_backlog : tp->syn_backlog)--;
1515 req->class->destructor(req);
1516 tcp_openreq_free(req);
1518 net_statistics.EmbryonicRsts++;
1521 /* Check for embryonic sockets (open_requests). We check packets with
1522 * only the SYN bit set against the open_request queue too: this
1523 * increases connection latency a bit, but is required to detect
1524 * retransmitted SYNs.
1526 static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1528 struct tcphdr *th = skb->h.th;
1529 u32 flg = ((u32 *)th)[3];
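	/* (((u32 *)th)[3] is the fourth 32-bit word of the TCP header: data
	 * offset, flags and window. The flag byte sits in bits 16-23 of the
	 * host-order constants, so htonl(0x00040000) tests RST and
	 * htonl(0x00120000) tests SYN|ACK below.)
	 */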
1531 /* Check for RST */
1532 if (flg & __constant_htonl(0x00040000)) {
1533 tcp_v4_rst_req(sk, skb);
1534 return NULL;
1537 /* Check for SYN|ACK */
1538 if (flg & __constant_htonl(0x00120000)) {
1539 struct open_request *req, *dummy;
1540 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1542 /* Find possible connection requests. */
1543 req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy);
1544 if (req) {
1545 sk = tcp_check_req(sk, skb, req);
1547 #ifdef CONFIG_SYN_COOKIES
1548 else {
1549 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1551 #endif
1553 return sk;
1556 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1558 #ifdef CONFIG_FILTER
1559 if (sk->filter)
1561 if (sk_filter(skb, sk->filter_data, sk->filter))
1562 goto discard;
1564 #endif /* CONFIG_FILTER */
1567 * socket locking is here for SMP purposes as backlog rcv
1568 * is currently called with bh processing disabled.
1570 lock_sock(sk);
1573 * This doesn't check if the socket has enough room for the packet.
1574 * Either process the packet _without_ queueing it and then free it,
1575 * or do the check later.
1577 skb_set_owner_r(skb, sk);
1579 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1580 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1581 goto reset;
1582 release_sock(sk);
1583 return 0;
1587 if (sk->state == TCP_LISTEN) {
1588 struct sock *nsk;
1590 nsk = tcp_v4_hnd_req(sk, skb);
1591 if (!nsk)
1592 goto discard;
1593 lock_sock(nsk);
1594 release_sock(sk);
1595 sk = nsk;
1598 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1599 goto reset;
1600 release_sock(sk);
1601 return 0;
1603 reset:
1604 tcp_v4_send_reset(skb);
1605 discard:
1606 kfree_skb(skb);
1607 /* Be careful here. If this function gets more complicated and
1608 * gcc suffers from register pressure on the x86, sk (in %ebx)
1609 * might be destroyed here. This current version compiles correctly,
1610 * but you have been warned.
1612 release_sock(sk);
1613 return 0;
1617 * From tcp_input.c
1620 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1622 struct tcphdr *th;
1623 struct sock *sk;
1625 if (skb->pkt_type!=PACKET_HOST)
1626 goto discard_it;
1628 th = skb->h.th;
1630 /* Pull up the IP header. */
1631 __skb_pull(skb, skb->h.raw - skb->data);
1633 /* Count it even if it's bad */
1634 tcp_statistics.TcpInSegs++;
1636 if (len < sizeof(struct tcphdr))
1637 goto bad_packet;
1639 /* Try to use the device checksum if provided. */
1640 switch (skb->ip_summed) {
1641 case CHECKSUM_NONE:
1642 skb->csum = csum_partial((char *)th, len, 0);
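		/* (The missing break is deliberate: after computing a software
		 * checksum, the CHECKSUM_NONE case falls through to the same
		 * verification as CHECKSUM_HW.)
		 */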
1643 case CHECKSUM_HW:
1644 if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
1645 NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
1646 "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
1647 "len=%d/%d/%d\n",
1648 NIPQUAD(skb->nh.iph->saddr),
1649 ntohs(th->source),
1650 NIPQUAD(skb->nh.iph->daddr),
1651 ntohs(th->dest),
1652 len, skb->len,
1653 ntohs(skb->nh.iph->tot_len)));
1654 bad_packet:
1655 tcp_statistics.TcpInErrs++;
1656 goto discard_it;
1658 default:
1659 /* CHECKSUM_UNNECESSARY */
1662 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1663 if (IPCB(skb)->redirport)
1664 sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source,
1665 skb->nh.iph->daddr, skb->dev,
1666 IPCB(skb)->redirport, skb->dev->ifindex);
1667 else {
1668 #endif
1669 sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source,
1670 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1671 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1672 if (!sk)
1673 sk = tcp_v4_search_proxy_openreq(skb);
1675 #endif
1676 if (!sk)
1677 goto no_tcp_socket;
1678 if(!ipsec_sk_policy(sk,skb))
1679 goto discard_it;
1681 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1682 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1683 len - th->doff*4);
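	/* (end_seq covers the payload bytes, len minus the header length
	 * th->doff*4, plus one unit of sequence space each for SYN and FIN,
	 * both of which consume a sequence number.)
	 */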
1684 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1686 skb->used = 0;
1688 if (sk->state == TCP_TIME_WAIT)
1689 goto do_time_wait;
1690 if (!atomic_read(&sk->sock_readers))
1691 return tcp_v4_do_rcv(sk, skb);
1693 __skb_queue_tail(&sk->back_log, skb);
1694 return 0;
1696 no_tcp_socket:
1697 tcp_v4_send_reset(skb);
1699 discard_it:
1700 /* Discard frame. */
1701 kfree_skb(skb);
1702 return 0;
1704 do_time_wait:
1705 if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1706 skb, th, skb->len))
1707 goto no_tcp_socket;
1708 goto discard_it;
1711 int tcp_v4_rebuild_header(struct sock *sk)
1713 struct rtable *rt = (struct rtable *)sk->dst_cache;
1714 __u32 new_saddr;
1715 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
1717 if(rt == NULL)
1718 return 0;
1720 /* Force route checking if want_rewrite.
1721 * The idea is good, the implementation is disgusting.
1722 * Well, if I bound this socket, you cannot randomly overwrite
1723 * its source address. --ANK
1725 if (want_rewrite) {
1726 int tmp;
1727 struct rtable *new_rt;
1728 __u32 old_saddr = rt->rt_src;
1730 /* Query new route using another rt buffer */
1731 tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
1732 RT_TOS(sk->ip_tos)|sk->localroute,
1733 sk->bound_dev_if);
1735 /* Only useful if different source addrs */
1736 if (tmp == 0) {
1738 * Only useful if different source addrs
1740 if (new_rt->rt_src != old_saddr ) {
1741 dst_release(sk->dst_cache);
1742 sk->dst_cache = &new_rt->u.dst;
1743 rt = new_rt;
1744 goto do_rewrite;
1746 dst_release(&new_rt->u.dst);
1749 if (rt->u.dst.obsolete) {
1750 int err;
1751 err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
1752 if (err) {
1753 sk->err_soft=-err;
1754 sk->error_report(sk);
1755 return -1;
1757 dst_release(xchg(&sk->dst_cache, &rt->u.dst));
1760 return 0;
1762 do_rewrite:
1763 new_saddr = rt->rt_src;
1765 /* Ouch, this should not happen. */
1766 if (!sk->saddr || !sk->rcv_saddr) {
1767 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
1768 "saddr=%08lX rcv_saddr=%08lX\n",
1769 ntohl(sk->saddr),
1770 ntohl(sk->rcv_saddr));
1771 return 0;
1774 if (new_saddr != sk->saddr) {
1775 if (sysctl_ip_dynaddr > 1) {
1776 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1777 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1778 NIPQUAD(sk->saddr),
1779 NIPQUAD(new_saddr));
1782 sk->saddr = new_saddr;
1783 sk->rcv_saddr = new_saddr;
1784 tcp_v4_rehash(sk);
1787 return 0;
1790 static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
1792 return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1793 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1796 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1798 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1800 sin->sin_family = AF_INET;
1801 sin->sin_addr.s_addr = sk->daddr;
1802 sin->sin_port = sk->dport;
1805 struct tcp_func ipv4_specific = {
1806 ip_queue_xmit,
1807 tcp_v4_send_check,
1808 tcp_v4_rebuild_header,
1809 tcp_v4_conn_request,
1810 tcp_v4_syn_recv_sock,
1811 tcp_v4_get_sock,
1812 sizeof(struct iphdr),
1814 ip_setsockopt,
1815 ip_getsockopt,
1816 v4_addr2sockaddr,
1817 sizeof(struct sockaddr_in)
1820 /* NOTE: A lot of things are set to zero explicitly by the call to
1821 * sk_alloc(), so they need not be done here.
1823 static int tcp_v4_init_sock(struct sock *sk)
1825 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1827 skb_queue_head_init(&tp->out_of_order_queue);
1828 tcp_init_xmit_timers(sk);
1830 tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
1831 tp->mdev = TCP_TIMEOUT_INIT;
1832 tp->mss_clamp = ~0;
1834 /* See draft-stevens-tcpca-spec-01 for discussion of the
1835 * initialization of these values.
1837 tp->snd_cwnd = 1;
1838 tp->snd_cwnd_cnt = 0;
1839 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1841 sk->state = TCP_CLOSE;
1842 sk->max_ack_backlog = SOMAXCONN;
1843 tp->rcv_mss = 536;
1845 sk->write_space = tcp_write_space;
1847 /* Init SYN queue. */
1848 tcp_synq_init(tp);
1850 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1852 return 0;
1855 static int tcp_v4_destroy_sock(struct sock *sk)
1857 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1858 struct sk_buff *skb;
1860 tcp_clear_xmit_timers(sk);
1862 if (sk->keepopen)
1863 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1865 /* Clean up the write buffer. */
1866 while((skb = __skb_dequeue(&sk->write_queue)) != NULL)
1867 kfree_skb(skb);
1869 /* Clean up our, hopefully empty, out_of_order_queue. */
1870 while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL)
1871 kfree_skb(skb);
1873 /* Clean up a locked TCP bind bucket; this only happens if a
1874 * port is allocated for a socket but it never fully connects,
1875 * in which case we will find num to be non-zero and daddr to
1876 * be zero.
1878 if(sk->daddr == 0 && sk->num != 0)
1879 tcp_bucket_unlock(sk);
1881 return 0;
1884 struct proto tcp_prot = {
1885 (struct sock *)&tcp_prot, /* sklist_next */
1886 (struct sock *)&tcp_prot, /* sklist_prev */
1887 tcp_close, /* close */
1888 tcp_v4_connect, /* connect */
1889 tcp_accept, /* accept */
1890 NULL, /* retransmit */
1891 tcp_write_wakeup, /* write_wakeup */
1892 tcp_read_wakeup, /* read_wakeup */
1893 tcp_poll, /* poll */
1894 tcp_ioctl, /* ioctl */
1895 tcp_v4_init_sock, /* init */
1896 tcp_v4_destroy_sock, /* destroy */
1897 tcp_shutdown, /* shutdown */
1898 tcp_setsockopt, /* setsockopt */
1899 tcp_getsockopt, /* getsockopt */
1900 tcp_v4_sendmsg, /* sendmsg */
1901 tcp_recvmsg, /* recvmsg */
1902 NULL, /* bind */
1903 tcp_v4_do_rcv, /* backlog_rcv */
1904 tcp_v4_hash, /* hash */
1905 tcp_v4_unhash, /* unhash */
1906 tcp_v4_rehash, /* rehash */
1907 tcp_good_socknum, /* good_socknum */
1908 tcp_v4_verify_bind, /* verify_bind */
1909 128, /* max_header */
1910 0, /* retransmits */
1911 "TCP", /* name */
1912 0, /* inuse */
1913 0 /* highestinuse */
1918 __initfunc(void tcp_v4_init(struct net_proto_family *ops))
1920 int err;
1922 tcp_inode.i_mode = S_IFSOCK;
1923 tcp_inode.i_sock = 1;
1924 tcp_inode.i_uid = 0;
1925 tcp_inode.i_gid = 0;
1927 tcp_socket->inode = &tcp_inode;
1928 tcp_socket->state = SS_UNCONNECTED;
1929 tcp_socket->type=SOCK_RAW;
1931 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
1932 panic("Failed to create the TCP control socket.\n");
1933 tcp_socket->sk->allocation=GFP_ATOMIC;
1934 tcp_socket->sk->num = 256; /* Don't receive any data */
1935 tcp_socket->sk->ip_ttl = MAXTTL;