net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.176 1999/05/12 11:24:46 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an ACK bit.
  36  *              Andi Kleen :            Implemented fast path mtu discovery.
  37  *                                      Fixed many serious bugs in the
  38  *                                      open_request handling and moved
  39  *                                      most of it into the af independent code.
  40  *                                      Added tail drop and some other bugfixes.
  41  *                                      Added new listen sematics.
  42  *              Mike McLagan    :       Routing by source
  43  *      Juan Jose Ciarlante:            ip_dynaddr bits
  44  *              Andi Kleen:             various fixes.
  45  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  */
  49
  50 #include <linux/config.h>
  51 #include <linux/types.h>
  52 #include <linux/fcntl.h>
  53 #include <linux/random.h>
  54 #include <linux/init.h>
  55 #include <linux/ipsec.h>
  56
  57 #include <net/icmp.h>
  58 #include <net/tcp.h>
  59 #include <net/ipv6.h>
  60
  61 #include <asm/segment.h>
  62
  63 #include <linux/inet.h>
  64 #include <linux/stddef.h>
  65
  66 extern int sysctl_tcp_timestamps;
  67 extern int sysctl_tcp_window_scaling;
  68 extern int sysctl_tcp_sack;
  69 extern int sysctl_tcp_syncookies;
  70 extern int sysctl_ip_dynaddr;
  71 extern __u32 sysctl_wmem_max;
  72 extern __u32 sysctl_rmem_max;
  73
  74 /* Check TCP sequence numbers in ICMP packets. */
  75 #define ICMP_MIN_LENGTH 8
  76
  77 /* Socket used for sending RSTs */
  78 struct inode tcp_inode;
  79 struct socket *tcp_socket=&tcp_inode.u.socket_i;
  80
  81 static void tcp_v4_send_reset(struct sk_buff *skb);
  82
  83 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  84                        struct sk_buff *skb);
  85
  86 /* This is for sockets with full identity only.  Sockets here will always
  87  * be without wildcards and will have the following invariant:
  88  *          TCP_ESTABLISHED <= sk->state < TCP_CLOSE
  89  *
  90  * First half of the table is for sockets not in TIME_WAIT, second half
  91  * is for TIME_WAIT sockets only.
  92  */
  93 struct sock **tcp_ehash;
  94 int tcp_ehash_size;
  95
  96 /* Ok, let's try this, I give up, we do need a local binding
  97  * TCP hash as well as the others for fast bind/connect.
  98  */
  99 struct tcp_bind_bucket **tcp_bhash;
 100 int tcp_bhash_size;
 101
 102 /* All sockets in TCP_LISTEN state will be in here.  This is the only table
 103  * where wildcard'd TCP sockets can exist.  Hash function here is just local
 104  * port number.
 105  */
 106 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
 107
 108 /* Register cache. */
 109 struct sock *tcp_regs[TCP_NUM_REGS];
 110
 111 /*
 112  * This array holds the first and last local port number.
 113  * For high-usage systems, use sysctl to change this to
 114  * 32768-61000
 115  */
 116 int sysctl_local_port_range[2] = { 1024, 4999 };
 117 int tcp_port_rover = (1024 - 1);
 118
 119 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 120                                  __u32 faddr, __u16 fport)
 121 {
 122         return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size >> 1) - 1);
 123 }
 124
 125 static __inline__ int tcp_sk_hashfn(struct sock *sk)
 126 {
 127         __u32 laddr = sk->rcv_saddr;
 128         __u16 lport = sk->num;
 129         __u32 faddr = sk->daddr;
 130         __u16 fport = sk->dport;
 131
 132         return tcp_hashfn(laddr, lport, faddr, fport);
 133 }
 134
 135 /* Invariant, sk->num is non-zero. */
 136 void tcp_bucket_unlock(struct sock *sk)
 137 {
 138         struct tcp_bind_bucket *tb;
 139         unsigned short snum = sk->num;
 140
 141         SOCKHASH_LOCK_WRITE();
 142         for(tb = tcp_bhash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
 143                 if(tb->port == snum) {
 144                         if(tb->owners == NULL &&
 145                            (tb->flags & TCPB_FLAG_LOCKED)) {
 146                                 tb->flags &= ~(TCPB_FLAG_LOCKED |
 147                                                TCPB_FLAG_FASTREUSE);
 148                                 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
 149                         }
 150                         break;
 151                 }
 152         }
 153         SOCKHASH_UNLOCK_WRITE();
 154 }
 155
 156 /* The sockhash lock must be held as a writer here. */
 157 struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
 158 {
 159         struct tcp_bind_bucket *tb;
 160
 161         tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
 162         if(tb != NULL) {
 163                 struct tcp_bind_bucket **head =
 164                         &tcp_bhash[tcp_bhashfn(snum)];
 165                 tb->port = snum;
 166                 tb->flags = TCPB_FLAG_LOCKED;
 167                 tb->owners = NULL;
 168                 if((tb->next = *head) != NULL)
 169                         tb->next->pprev = &tb->next;
 170                 *head = tb;
 171                 tb->pprev = head;
 172         }
 173         return tb;
 174 }
 175
 176 #ifdef CONFIG_IP_TRANSPARENT_PROXY
 177 /* Ensure that the bound bucket for the port exists.
 178  * Return 0 on success.
 179  */
 180 static __inline__ int tcp_bucket_check(unsigned short snum)
 181 {
 182         struct tcp_bind_bucket *tb;
 183         int ret = 0;
 184
 185         SOCKHASH_LOCK_WRITE();
 186         tb = tcp_bhash[tcp_bhashfn(snum)];
 187         for( ; (tb && (tb->port != snum)); tb = tb->next)
 188                 ;
 189         if(tb == NULL && tcp_bucket_create(snum) == NULL)
 190                 ret = 1;
 191         SOCKHASH_UNLOCK_WRITE();
 192
 193         return ret;
 194 }
 195 #endif
 196
 197 static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
 198 {
 199         struct tcp_bind_bucket *tb;
 200         int result = 0;
 201
 202         SOCKHASH_LOCK_WRITE();
 203         for(tb = tcp_bhash[tcp_bhashfn(snum)];
 204             (tb && (tb->port != snum));
 205             tb = tb->next)
 206                 ;
 207         if(tb && tb->owners) {
 208                 /* Fast path for reuse ports, see include/net/tcp.h for a very
 209                  * detailed description of why this works, and why it is worth
 210                  * the effort at all. -DaveM
 211                  */
 212                 if((tb->flags & TCPB_FLAG_FASTREUSE)    &&
 213                    (sk->reuse != 0)) {
 214                         goto go_like_smoke;
 215                 } else {
 216                         struct sock *sk2;
 217                         int sk_reuse = sk->reuse;
 218
 219                         /* We must walk the whole port owner list in this case. -DaveM */
 220                         for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
 221                                 if (sk->bound_dev_if == sk2->bound_dev_if) {
 222                                         if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
 223                                                 if(!sk2->rcv_saddr              ||
 224                                                    !sk->rcv_saddr               ||
 225                                                    (sk2->rcv_saddr == sk->rcv_saddr))
 226                                                         break;
 227                                         }
 228                                 }
 229                         }
 230                         if(sk2 != NULL)
 231                                 result = 1;
 232                 }
 233         }
 234         if(result == 0) {
 235                 if(tb == NULL) {
 236                         if((tb = tcp_bucket_create(snum)) == NULL)
 237                                 result = 1;
 238                         else if (sk->reuse && sk->state != TCP_LISTEN)
 239                                 tb->flags |= TCPB_FLAG_FASTREUSE;
 240                 } else {
 241                         /* It could be pending garbage collection, this
 242                          * kills the race and prevents it from disappearing
 243                          * out from under us by the time we use it.  -DaveM
 244                          */
 245                         if(tb->owners == NULL) {
 246                                 if (!(tb->flags & TCPB_FLAG_LOCKED)) {
 247                                         tb->flags = (TCPB_FLAG_LOCKED |
 248                                                      ((sk->reuse &&
 249                                                        sk->state != TCP_LISTEN) ?
 250                                                       TCPB_FLAG_FASTREUSE : 0));
 251                                         tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
 252                                 } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) {
 253                                         /* Someone is in between the bind
 254                                          * and the actual connect or listen.
 255                                          * See if it was a legitimate reuse
 256                                          * and we are as well, else punt.
 257                                          */
 258                                         if (sk->reuse == 0 ||
 259                                             !(tb->flags & TCPB_FLAG_FASTREUSE))
 260                                                 result = 1;
 261                                 } else
 262                                         tb->flags &= ~TCPB_FLAG_GOODSOCKNUM;
 263                         }
 264                 }
 265         }
 266 go_like_smoke:
 267         SOCKHASH_UNLOCK_WRITE();
 268         return result;
 269 }
 270
 271 unsigned short tcp_good_socknum(void)
 272 {
 273         struct tcp_bind_bucket *tb;
 274         int low = sysctl_local_port_range[0];
 275         int high = sysctl_local_port_range[1];
 276         int remaining = (high - low) + 1;
 277         int rover;
 278
 279         SOCKHASH_LOCK_WRITE();
 280         rover = tcp_port_rover;
 281         do {
 282                 rover += 1;
 283                 if((rover < low) || (rover > high))
 284                         rover = low;
 285                 tb = tcp_bhash[tcp_bhashfn(rover)];
 286                 for( ; tb; tb = tb->next) {
 287                         if(tb->port == rover)
 288                                 goto next;
 289                 }
 290                 break;
 291         next:
 292         } while(--remaining > 0);
 293         tcp_port_rover = rover;
 294         tb = NULL;
 295         if((remaining <= 0) || ((tb = tcp_bucket_create(rover)) == NULL))
 296                 rover = 0;
 297         if (tb != NULL)
 298                 tb->flags |= TCPB_FLAG_GOODSOCKNUM;
 299         SOCKHASH_UNLOCK_WRITE();
 300
 301         return rover;
 302 }
 303
 304 static void tcp_v4_hash(struct sock *sk)
 305 {
 306         if (sk->state != TCP_CLOSE) {
 307                 struct sock **skp;
 308
 309                 SOCKHASH_LOCK_WRITE();
 310                 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
 311                 if((sk->next = *skp) != NULL)
 312                         (*skp)->pprev = &sk->next;
 313                 *skp = sk;
 314                 sk->pprev = skp;
 315                 tcp_sk_bindify(sk);
 316                 SOCKHASH_UNLOCK_WRITE();
 317         }
 318 }
 319
 320 static void tcp_v4_unhash(struct sock *sk)
 321 {
 322         SOCKHASH_LOCK_WRITE();
 323         if(sk->pprev) {
 324                 if(sk->next)
 325                         sk->next->pprev = sk->pprev;
 326                 *sk->pprev = sk->next;
 327                 sk->pprev = NULL;
 328                 tcp_reg_zap(sk);
 329                 tcp_sk_unbindify(sk);
 330         }
 331         SOCKHASH_UNLOCK_WRITE();
 332 }
 333
 334 static void tcp_v4_rehash(struct sock *sk)
 335 {
 336         unsigned char state;
 337
 338         SOCKHASH_LOCK_WRITE();
 339         state = sk->state;
 340         if(sk->pprev != NULL) {
 341                 if(sk->next)
 342                         sk->next->pprev = sk->pprev;
 343                 *sk->pprev = sk->next;
 344                 sk->pprev = NULL;
 345                 tcp_reg_zap(sk);
 346         }
 347         if(state != TCP_CLOSE) {
 348                 struct sock **skp;
 349
 350                 if(state == TCP_LISTEN)
 351                         skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 352                 else
 353                         skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
 354
 355                 if((sk->next = *skp) != NULL)
 356                         (*skp)->pprev = &sk->next;
 357                 *skp = sk;
 358                 sk->pprev = skp;
 359                 if(state == TCP_LISTEN)
 360                         tcp_sk_bindify(sk);
 361         }
 362         SOCKHASH_UNLOCK_WRITE();
 363 }
 364
 365 /* Don't inline this cruft.  Here are some nice properties to
 366  * exploit here.  The BSD API does not allow a listening TCP
 367  * to specify the remote port nor the remote address for the
 368  * connection.  So always assume those are both wildcarded
 369  * during the search since they can never be otherwise.
 370  */
 371 static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
 372 {
 373         struct sock *sk;
 374         struct sock *result = NULL;
 375         int score, hiscore;
 376
 377         hiscore=0;
 378         for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
 379                 if(sk->num == hnum) {
 380                         __u32 rcv_saddr = sk->rcv_saddr;
 381
 382                         score = 1;
 383                         if(rcv_saddr) {
 384                                 if (rcv_saddr != daddr)
 385                                         continue;
 386                                 score++;
 387                         }
 388                         if (sk->bound_dev_if) {
 389                                 if (sk->bound_dev_if != dif)
 390                                         continue;
 391                                 score++;
 392                         }
 393                         if (score == 3)
 394                                 return sk;
 395                         if (score > hiscore) {
 396                                 hiscore = score;
 397                                 result = sk;
 398                         }
 399                 }
 400         }
 401         return result;
 402 }
 403
 404 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 405  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 406  *
 407  * The sockhash lock must be held as a reader here.
 408  */
 409 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 410                                            u32 daddr, u16 dport, int dif)
 411 {
 412         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 413         __u16 hnum = ntohs(dport);
 414         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
 415         struct sock *sk;
 416         int hash;
 417
 418         /* Check TCP register quick cache first. */
 419         sk = TCP_RHASH(sport);
 420         if(sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 421                 goto hit;
 422
 423         /* Optimize here for direct hit, only listening connections can
 424          * have wildcards anyways.
 425          */
 426         hash = tcp_hashfn(daddr, hnum, saddr, sport);
 427         for(sk = tcp_ehash[hash]; sk; sk = sk->next) {
 428                 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
 429                         if (sk->state == TCP_ESTABLISHED)
 430                                 TCP_RHASH(sport) = sk;
 431                         goto hit; /* You sunk my battleship! */
 432                 }
 433         }
 434         /* Must check for a TIME_WAIT'er before going to listener hash. */
 435         for(sk = tcp_ehash[hash+(tcp_ehash_size >> 1)]; sk; sk = sk->next)
 436                 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 437                         goto hit;
 438         sk = tcp_v4_lookup_listener(daddr, hnum, dif);
 439 hit:
 440         return sk;
 441 }
 442
 443 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
 444 {
 445         struct sock *sk;
 446
 447         SOCKHASH_LOCK_READ();
 448         sk = __tcp_v4_lookup(saddr, sport, daddr, dport, dif);
 449         SOCKHASH_UNLOCK_READ();
 450
 451         return sk;
 452 }
 453
 454 #ifdef CONFIG_IP_TRANSPARENT_PROXY
 455 /* Cleaned up a little and adapted to new bind bucket scheme.
 456  * Oddly, this should increase performance here for
 457  * transparent proxy, as tests within the inner loop have
 458  * been eliminated. -DaveM
 459  */
 460 static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
 461                                         unsigned short rnum, unsigned long laddr,
 462                                         struct device *dev, unsigned short pnum,
 463                                         int dif)
 464 {
 465         struct sock *s, *result = NULL;
 466         int badness = -1;
 467         u32 paddr = 0;
 468         unsigned short hnum = ntohs(num);
 469         unsigned short hpnum = ntohs(pnum);
 470         int firstpass = 1;
 471
 472         if(dev && dev->ip_ptr) {
 473                 struct in_device *idev = dev->ip_ptr;
 474
 475                 if(idev->ifa_list)
 476                         paddr = idev->ifa_list->ifa_local;
 477         }
 478
 479         /* We must obtain the sockhash lock here, we are always
 480          * in BH context.
 481          */
 482         SOCKHASH_LOCK_READ_BH();
 483         {
 484                 struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)];
 485                 for( ; (tb && tb->port != hnum); tb = tb->next)
 486                         ;
 487                 if(tb == NULL)
 488                         goto next;
 489                 s = tb->owners;
 490         }
 491 pass2:
 492         for(; s; s = s->bind_next) {
 493                 int score = 0;
 494                 if(s->rcv_saddr) {
 495                         if((s->num != hpnum || s->rcv_saddr != paddr) &&
 496                            (s->num != hnum || s->rcv_saddr != laddr))
 497                                 continue;
 498                         score++;
 499                 }
 500                 if(s->daddr) {
 501                         if(s->daddr != raddr)
 502                                 continue;
 503                         score++;
 504                 }
 505                 if(s->dport) {
 506                         if(s->dport != rnum)
 507                                 continue;
 508                         score++;
 509                 }
 510                 if(s->bound_dev_if) {
 511                         if(s->bound_dev_if != dif)
 512                                 continue;
 513                         score++;
 514                 }
 515                 if(score == 4 && s->num == hnum) {
 516                         result = s;
 517                         goto gotit;
 518                 } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
 519                         result = s;
 520                         badness = score;
 521                 }
 522         }
 523 next:
 524         if(firstpass--) {
 525                 struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hpnum)];
 526                 for( ; (tb && tb->port != hpnum); tb = tb->next)
 527                         ;
 528                 if(tb) {
 529                         s = tb->owners;
 530                         goto pass2;
 531                 }
 532         }
 533 gotit:
 534         SOCKHASH_UNLOCK_READ_BH();
 535         return result;
 536 }
 537 #endif /* CONFIG_IP_TRANSPARENT_PROXY */
 538
 539 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 540 {
 541         return secure_tcp_sequence_number(sk->saddr, sk->daddr,
 542                                           skb->h.th->dest,
 543                                           skb->h.th->source);
 544 }
 545
 546 /* Check that a TCP address is unique, don't allow multiple
 547  * connects to/from the same address.  Actually we can optimize
 548  * quite a bit, since the socket about to connect is still
 549  * in TCP_CLOSE, a tcp_bind_bucket for the local port he will
 550  * use will exist, with a NULL owners list.  So check for that.
 551  * The good_socknum and verify_bind scheme we use makes this
 552  * work.
 553  */
 554 static int tcp_v4_unique_address(struct sock *sk)
 555 {
 556         struct tcp_bind_bucket *tb;
 557         unsigned short snum = sk->num;
 558         int retval = 1;
 559
 560         /* Freeze the hash while we snoop around. */
 561         SOCKHASH_LOCK_READ();
 562         tb = tcp_bhash[tcp_bhashfn(snum)];
 563         for(; tb; tb = tb->next) {
 564                 if(tb->port == snum && tb->owners != NULL) {
 565                         /* Almost certainly the re-use port case, search the real hashes
 566                          * so it actually scales.
 567                          */
 568                         sk = __tcp_v4_lookup(sk->daddr, sk->dport,
 569                                              sk->rcv_saddr, snum, sk->bound_dev_if);
 570                         SOCKHASH_UNLOCK_READ();
 571
 572                         if((sk != NULL) && (sk->state != TCP_LISTEN))
 573                                 retval = 0;
 574                         return retval;
 575                 }
 576         }
 577         SOCKHASH_UNLOCK_READ();
 578         return retval;
 579 }
 580
 581 /* This will initiate an outgoing connection. */
 582 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 583 {
 584         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 585         struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
 586         struct sk_buff *buff;
 587         struct rtable *rt;
 588         u32 daddr, nexthop;
 589         int tmp;
 590
 591         if (sk->state != TCP_CLOSE)
 592                 return(-EISCONN);
 593
 594         /* Don't allow a double connect. */
 595         if (sk->daddr)
 596                 return -EINVAL;
 597
 598         if (addr_len < sizeof(struct sockaddr_in))
 599                 return(-EINVAL);
 600
 601         if (usin->sin_family != AF_INET) {
 602                 static int complained;
 603                 if (usin->sin_family)
 604                         return(-EAFNOSUPPORT);
 605                 if (!complained++)
 606                         printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
 607         }
 608
 609         nexthop = daddr = usin->sin_addr.s_addr;
 610         if (sk->opt && sk->opt->srr) {
 611                 if (daddr == 0)
 612                         return -EINVAL;
 613                 nexthop = sk->opt->faddr;
 614         }
 615
 616         tmp = ip_route_connect(&rt, nexthop, sk->saddr,
 617                                RT_TOS(sk->ip_tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
 618         if (tmp < 0)
 619                 return tmp;
 620
 621         if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
 622                 ip_rt_put(rt);
 623                 return -ENETUNREACH;
 624         }
 625
 626         dst_release(xchg(&sk->dst_cache, rt));
 627
 628         buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
 629                             0, GFP_KERNEL);
 630
 631         if (buff == NULL)
 632                 return -ENOBUFS;
 633
 634         /* Socket has no identity, so lock_sock() is useless.  Also
 635          * since state==TCP_CLOSE (checked above) the socket cannot
 636          * possibly be in the hashes.  TCP hash locking is only
 637          * needed while checking quickly for a unique address.
 638          * However, the socket does need to be (and is) locked
 639          * in tcp_connect().
 640          * Perhaps this addresses all of ANK's concerns. 8-)  -DaveM
 641          */
 642         sk->dport = usin->sin_port;
 643         sk->daddr = rt->rt_dst;
 644         if (sk->opt && sk->opt->srr)
 645                 sk->daddr = daddr;
 646         if (!sk->saddr)
 647                 sk->saddr = rt->rt_src;
 648         sk->rcv_saddr = sk->saddr;
 649
 650         if (!tcp_v4_unique_address(sk)) {
 651                 kfree_skb(buff);
 652                 sk->daddr = 0;
 653                 return -EADDRNOTAVAIL;
 654         }
 655
 656         tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
 657                                                    sk->sport, usin->sin_port);
 658
 659         tp->ext_header_len = 0;
 660         if (sk->opt)
 661                 tp->ext_header_len = sk->opt->optlen;
 662
 663         /* Reset mss clamp */
 664         tp->mss_clamp = ~0;
 665
 666         if (!ip_dont_fragment(sk, &rt->u.dst) &&
 667             rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
 668                 /* Clamp mss at maximum of 536 and user_mss.
 669                    Probably, user ordered to override tiny segment size
 670                    in gatewayed case.
 671                  */
 672                 tp->mss_clamp = max(tp->user_mss, 536);
 673         }
 674
 675         tcp_connect(sk, buff, rt->u.dst.pmtu);
 676         return 0;
 677 }
 678
 679 static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
 680 {
 681         int retval = -EINVAL;
 682
 683         /* Do sanity checking for sendmsg/sendto/send. */
 684         if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
 685                 goto out;
 686         if (msg->msg_name) {
 687                 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
 688
 689                 if (msg->msg_namelen < sizeof(*addr))
 690                         goto out;
 691                 if (addr->sin_family && addr->sin_family != AF_INET)
 692                         goto out;
 693                 retval = -ENOTCONN;
 694                 if(sk->state == TCP_CLOSE)
 695                         goto out;
 696                 retval = -EISCONN;
 697                 if (addr->sin_port != sk->dport)
 698                         goto out;
 699                 if (addr->sin_addr.s_addr != sk->daddr)
 700                         goto out;
 701         }
 702         retval = tcp_do_sendmsg(sk, msg);
 703
 704 out:
 705         return retval;
 706 }
 707
 708
 709 /*
 710  * Do a linear search in the socket open_request list.
 711  * This should be replaced with a global hash table.
 712  */
 713 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
 714                                               struct iphdr *iph,
 715                                               struct tcphdr *th,
 716                                               struct open_request **prevp)
 717 {
 718         struct open_request *req, *prev;
 719         __u16 rport = th->source;
 720
 721         /*      assumption: the socket is not in use.
 722          *      as we checked the user count on tcp_rcv and we're
 723          *      running from a soft interrupt.
 724          */
 725         prev = (struct open_request *) (&tp->syn_wait_queue);
 726         for (req = prev->dl_next; req; req = req->dl_next) {
 727                 if (req->af.v4_req.rmt_addr == iph->saddr &&
 728                     req->af.v4_req.loc_addr == iph->daddr &&
 729                     req->rmt_port == rport
 730 #ifdef CONFIG_IP_TRANSPARENT_PROXY
 731                     && req->lcl_port == th->dest
 732 #endif
 733                     ) {
 734                         *prevp = prev;
 735                         return req;
 736                 }
 737                 prev = req;
 738         }
 739         return NULL;
 740 }
 741
 742
 743 /*
 744  * This routine does path mtu discovery as defined in RFC1191.
 745  */
 746 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
 747 {
 748         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 749
 750         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 751          * send out by Linux are always <576bytes so they should go through
 752          * unfragmented).
 753          */
 754         if (sk->state == TCP_LISTEN)
 755                 return;
 756
 757         bh_lock_sock(sk);
 758         if(sk->lock.users != 0)
 759                 goto out;
 760
 761         /* We don't check in the destentry if pmtu discovery is forbidden
 762          * on this route. We just assume that no packet_to_big packets
 763          * are send back when pmtu discovery is not active.
 764          * There is a small race when the user changes this flag in the
 765          * route, but I think that's acceptable.
 766          */
 767         if (sk->dst_cache == NULL)
 768                 goto out;
 769
 770         ip_rt_update_pmtu(sk->dst_cache, mtu);
 771         if (sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
 772             tp->pmtu_cookie > sk->dst_cache->pmtu) {
 773                 tcp_sync_mss(sk, sk->dst_cache->pmtu);
 774
 775                 /* Resend the TCP packet because it's
 776                  * clear that the old packet has been
 777                  * dropped. This is the new "fast" path mtu
 778                  * discovery.
 779                  */
 780                 tcp_simple_retransmit(sk);
 781         } /* else let the usual retransmit timer handle it */
 782 out:
 783         bh_unlock_sock(sk);
 784 }
 785
 786 /*
 787  * This routine is called by the ICMP module when it gets some
 788  * sort of error condition.  If err < 0 then the socket should
 789  * be closed and the error returned to the user.  If err > 0
 790  * it's just the icmp type << 8 | icmp code.  After adjustment
 791  * header points to the first 8 bytes of the tcp header.  We need
 792  * to find the appropriate port.
 793  *
 794  * The locking strategy used here is very "optimistic". When
 795  * someone else accesses the socket the ICMP is just dropped
 796  * and for some paths there is no check at all.
 797  * A more general error queue to queue errors for later handling
 798  * is probably better.
 799  *
 800  * sk->err and sk->err_soft should be atomic_t.
 801  */
 802
 803 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
 804 {
 805         struct iphdr *iph = (struct iphdr*)dp;
 806         struct tcphdr *th;
 807         struct tcp_opt *tp;
 808         int type = skb->h.icmph->type;
 809         int code = skb->h.icmph->code;
 810 #if ICMP_MIN_LENGTH < 14
 811         int no_flags = 0;
 812 #else
 813 #define no_flags 0
 814 #endif
 815         struct sock *sk;
 816         __u32 seq;
 817         int err;
 818
 819         if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
 820                 icmp_statistics.IcmpInErrors++;
 821                 return;
 822         }
 823 #if ICMP_MIN_LENGTH < 14
 824         if (len < (iph->ihl << 2) + 14)
 825                 no_flags = 1;
 826 #endif
 827
 828         th = (struct tcphdr*)(dp+(iph->ihl<<2));
 829
 830         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
 831         if (sk == NULL || sk->state == TCP_TIME_WAIT) {
 832                 icmp_statistics.IcmpInErrors++;
 833                 return;
 834         }
 835
 836         tp = &sk->tp_pinfo.af_tcp;
 837         seq = ntohl(th->seq);
 838         if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
 839                 net_statistics.OutOfWindowIcmps++;
 840                 return;
 841         }
 842
 843         switch (type) {
 844         case ICMP_SOURCE_QUENCH:
 845 #ifndef OLD_SOURCE_QUENCH /* This is deprecated */
 846                 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
 847                 tp->snd_cwnd = tp->snd_ssthresh;
 848                 tp->snd_cwnd_cnt = 0;
 849                 tp->high_seq = tp->snd_nxt;
 850 #endif
 851                 return;
 852         case ICMP_PARAMETERPROB:
 853                 err = EPROTO;
 854                 break;
 855         case ICMP_DEST_UNREACH:
 856                 if (code > NR_ICMP_UNREACH)
 857                         return;
 858
 859                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 860                         do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
 861                         return;
 862                 }
 863
 864                 err = icmp_err_convert[code].errno;
 865                 break;
 866         case ICMP_TIME_EXCEEDED:
 867                 err = EHOSTUNREACH;
 868                 break;
 869         default:
 870                 return;
 871         }
 872
 873         switch (sk->state) {
 874                 struct open_request *req, *prev;
 875         case TCP_LISTEN:
 876                 /* The final ACK of the handshake should be already
 877                  * handled in the new socket context, not here.
 878                  * Strictly speaking - an ICMP error for the final
 879                  * ACK should set the opening flag, but that is too
 880                  * complicated right now.
 881                  */
 882                 if (!no_flags && !th->syn && !th->ack)
 883                         return;
 884
 885                 /* Prevent race conditions with accept() -
 886                  * ICMP is unreliable.
 887                  */
 888                 bh_lock_sock(sk);
 889                 if (sk->lock.users != 0) {
 890                         net_statistics.LockDroppedIcmps++;
 891                          /* If too many ICMPs get dropped on busy
 892                           * servers this needs to be solved differently.
 893                           */
 894                         goto out_unlock;
 895                 }
 896
 897                 req = tcp_v4_search_req(tp, iph, th, &prev);
 898                 if (!req)
 899                         goto out_unlock;
 900                 if (seq != req->snt_isn) {
 901                         net_statistics.OutOfWindowIcmps++;
 902                         goto out_unlock;
 903                 }
 904                 if (req->sk) {
 905                         /*
 906                          * Already in ESTABLISHED and a big socket is created,
 907                          * set error code there.
 908                          * The error will _not_ be reported in the accept(),
 909                          * but only with the next operation on the socket after
 910                          * accept.
 911                          */
 912                         bh_unlock_sock(sk);
 913                         sk = req->sk;
 914                 } else {
 915                         /*
 916                          * Still in SYN_RECV, just remove it silently.
 917                          * There is no good way to pass the error to the newly
 918                          * created socket, and POSIX does not want network
 919                          * errors returned from accept().
 920                          */
 921                         tp->syn_backlog--;
 922                         tcp_synq_unlink(tp, req, prev);
 923                         req->class->destructor(req);
 924                         tcp_openreq_free(req);
 925         out_unlock:
 926                         bh_unlock_sock(sk);
 927                         return;
 928                 }
 929                 break;
 930         case TCP_SYN_SENT:
 931         case TCP_SYN_RECV:  /* Cannot happen */
 932                 if (!no_flags && !th->syn)
 933                         return;
 934                 tcp_statistics.TcpAttemptFails++;
 935                 sk->err = err;
 936                 sk->zapped = 1;
 937                 mb();
 938                 sk->error_report(sk);
 939                 return;
 940         }
 941
 942         /* If we've already connected we will keep trying
 943          * until we time out, or the user gives up.
 944          *
 945          * rfc1122 4.2.3.9 allows to consider as hard errors
 946          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 947          * but it is obsoleted by pmtu discovery).
 948          *
 949          * Note, that in modern internet, where routing is unreliable
 950          * and in each dark corner broken firewalls sit, sending random
 951          * errors ordered by their masters even this two messages finally lose
 952          * their original sense (even Linux sends invalid PORT_UNREACHs)
 953          *
 954          * Now we are in compliance with RFCs.
 955          *                                                      --ANK (980905)
 956          */
 957
 958         if (sk->ip_recverr) {
 959                 /* This code isn't serialized with the socket code */
 960                 /* ANK (980927) ... which is harmless now,
 961                    sk->err's may be safely lost.
 962                  */
 963                 sk->err = err;
 964                 mb();
 965                 sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
 966         } else  { /* Only an error on timeout */
 967                 sk->err_soft = err;
 968                 mb();
 969         }
 970 }
 971
 972 /* This routine computes an IPv4 TCP checksum. */
 973 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
 974                        struct sk_buff *skb)
 975 {
 976         th->check = 0;
 977         th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
 978                                  csum_partial((char *)th, th->doff<<2, skb->csum));
 979 }
 980
 981 /*
 982  *      This routine will send an RST to the other tcp.
 983  *
 984  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 985  *                    for reset.
 986  *      Answer: if a packet caused RST, it is not for a socket
 987  *              existing in our system, if it is matched to a socket,
 988  *              it is just duplicate segment or bug in other side's TCP.
 989  *              So that we build reply only basing on parameters
 990  *              arrived with segment.
 991  *      Exception: precedence violation. We do not implement it in any case.
 992  */
 993
 994 static void tcp_v4_send_reset(struct sk_buff *skb)
 995 {
 996         struct tcphdr *th = skb->h.th;
 997         struct tcphdr rth;
 998         struct ip_reply_arg arg;
 999
1000         /* Never send a reset in response to a reset. */
1001         if (th->rst)
1002                 return;
1003
1004         if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) {
1005 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1006                 if (((struct rtable*)skb->dst)->rt_type == RTN_UNICAST)
1007                         icmp_send(skb, ICMP_DEST_UNREACH,
1008                                   ICMP_PORT_UNREACH, 0);
1009 #endif
1010                 return;
1011         }
1012
1013         /* Swap the send and the receive. */
1014         memset(&rth, 0, sizeof(struct tcphdr));
1015         rth.dest = th->source;
1016         rth.source = th->dest;
1017         rth.doff = sizeof(struct tcphdr)/4;
1018         rth.rst = 1;
1019
1020         if (th->ack) {
1021                 rth.seq = th->ack_seq;
1022         } else {
1023                 rth.ack = 1;
1024                 rth.ack_seq = th->syn ? htonl(ntohl(th->seq)+1) : th->seq;
1025         }
1026
1027         memset(&arg, 0, sizeof arg);
1028         arg.iov[0].iov_base = (unsigned char *)&rth;
1029         arg.iov[0].iov_len  = sizeof rth;
1030         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1031                                       skb->nh.iph->saddr, /*XXX*/
1032                                       sizeof(struct tcphdr),
1033                                       IPPROTO_TCP,
1034                                       0);
1035         arg.n_iov = 1;
1036         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1037
1038         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1039
1040         tcp_statistics.TcpOutSegs++;
1041         tcp_statistics.TcpOutRsts++;
1042 }
1043
1044 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1045
1046 /*
1047    Seems, I never wrote nothing more stupid.
1048    I hope Gods will forgive me, but I cannot forgive myself 8)
1049                                                 --ANK (981001)
1050  */
1051
1052 static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb)
1053 {
1054         struct iphdr *iph = skb->nh.iph;
1055         struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1056         struct sock *sk = NULL;
1057         int i;
1058
1059         SOCKHASH_LOCK_READ();
1060         for (i=0; i<TCP_LHTABLE_SIZE; i++) {
1061                 for(sk = tcp_listening_hash[i]; sk; sk = sk->next) {
1062                         struct open_request *dummy;
1063                         if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, iph,
1064                                               th, &dummy) &&
1065                             (!sk->bound_dev_if ||
1066                              sk->bound_dev_if == skb->dev->ifindex))
1067                                 goto out;
1068                 }
1069         }
1070 out:
1071         SOCKHASH_UNLOCK_READ();
1072         return sk;
1073 }
1074
1075 /*
1076  *      Check whether a received TCP packet might be for one of our
1077  *      connections.
1078  */
1079
1080 int tcp_chkaddr(struct sk_buff *skb)
1081 {
1082         struct iphdr *iph = skb->nh.iph;
1083         struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1084         struct sock *sk;
1085
1086         sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr,
1087                            th->dest, skb->dev->ifindex);
1088
1089         if (!sk)
1090                 return tcp_v4_search_proxy_openreq(skb) != NULL;
1091
1092         if (sk->state == TCP_LISTEN) {
1093                 struct open_request *dummy;
1094                 if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, skb->nh.iph,
1095                                       th, &dummy) &&
1096                     (!sk->bound_dev_if ||
1097                      sk->bound_dev_if == skb->dev->ifindex))
1098                         return 1;
1099         }
1100
1101         /* 0 means accept all LOCAL addresses here, not all the world... */
1102
1103         if (sk->rcv_saddr == 0)
1104                 return 0;
1105
1106         return 1;
1107 }
1108 #endif
1109
1110 /*
1111  *      Send a SYN-ACK after having received an ACK.
1112  *      This still operates on a open_request only, not on a big
1113  *      socket.
1114  */
1115 static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
1116 {
1117         struct rtable *rt;
1118         struct ip_options *opt;
1119         struct sk_buff * skb;
1120         int mss;
1121
1122         /* First, grab a route. */
1123         opt = req->af.v4_req.opt;
1124         if(ip_route_output(&rt, ((opt && opt->srr) ?
1125                                  opt->faddr :
1126                                  req->af.v4_req.rmt_addr),
1127                            req->af.v4_req.loc_addr,
1128                            RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
1129                            sk->bound_dev_if)) {
1130                 ip_statistics.IpOutNoRoutes++;
1131                 return;
1132         }
1133         if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1134                 ip_rt_put(rt);
1135                 ip_statistics.IpOutNoRoutes++;
1136                 return;
1137         }
1138
1139         mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
1140
1141         skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
1142         if (skb) {
1143                 struct tcphdr *th = skb->h.th;
1144
1145 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1146                 th->source = req->lcl_port; /* LVE */
1147 #endif
1148
1149                 th->check = tcp_v4_check(th, skb->len,
1150                                          req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1151                                          csum_partial((char *)th, skb->len, skb->csum));
1152
1153                 ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1154                                       req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1155         }
1156         ip_rt_put(rt);
1157 }
1158
1159 /*
1160  *      IPv4 open_request destructor.
1161  */
1162 static void tcp_v4_or_free(struct open_request *req)
1163 {
1164         if(!req->sk && req->af.v4_req.opt)
1165                 kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
1166 }
1167
1168 static inline void syn_flood_warning(struct sk_buff *skb)
1169 {
1170         static unsigned long warntime;
1171
1172         if (jiffies - warntime > HZ*60) {
1173                 warntime = jiffies;
1174                 printk(KERN_INFO
1175                        "possible SYN flooding on port %d. Sending cookies.\n",
1176                        ntohs(skb->h.th->dest));
1177         }
1178 }
1179
1180 /*
1181  * Save and compile IPv4 options into the open_request if needed.
1182  */
1183 static inline struct ip_options *
1184 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1185 {
1186         struct ip_options *opt = &(IPCB(skb)->opt);
1187         struct ip_options *dopt = NULL;
1188
1189         if (opt && opt->optlen) {
1190                 int opt_size = optlength(opt);
1191                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1192                 if (dopt) {
1193                         if (ip_options_echo(dopt, skb)) {
1194                                 kfree_s(dopt, opt_size);
1195                                 dopt = NULL;
1196                         }
1197                 }
1198         }
1199         return dopt;
1200 }
1201
1202 /*
1203  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1204  * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1205  * It would be better to replace it with a global counter for all sockets
1206  * but then some measure against one socket starving all other sockets
1207  * would be needed.
1208  */
1209 int sysctl_max_syn_backlog = 128;
1210
1211 struct or_calltable or_ipv4 = {
1212         tcp_v4_send_synack,
1213         tcp_v4_or_free,
1214         tcp_v4_send_reset
1215 };
1216
1217 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1218 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
1219
1220 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
1221 {
1222         struct tcp_opt tp;
1223         struct open_request *req;
1224         struct tcphdr *th = skb->h.th;
1225         __u32 saddr = skb->nh.iph->saddr;
1226         __u32 daddr = skb->nh.iph->daddr;
1227 #ifdef CONFIG_SYN_COOKIES
1228         int want_cookie = 0;
1229 #else
1230 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1231 #endif
1232
1233         /* If the socket is dead, don't accept the connection.  */
1234         if (sk->dead)
1235                 goto dead;
1236
1237         /* Never answer to SYNs send to broadcast or multicast */
1238         if (((struct rtable *)skb->dst)->rt_flags &
1239             (RTCF_BROADCAST|RTCF_MULTICAST))
1240                 goto drop;
1241
1242         /* XXX: Check against a global syn pool counter. */
1243         if (BACKLOG(sk) > BACKLOGMAX(sk)) {
1244 #ifdef CONFIG_SYN_COOKIES
1245                 if (sysctl_tcp_syncookies) {
1246                         syn_flood_warning(skb);
1247                         want_cookie = 1;
1248                 } else
1249 #endif
1250                 goto drop;
1251         } else {
1252                 if (isn == 0)
1253                         isn = tcp_v4_init_sequence(sk, skb);
1254                 BACKLOG(sk)++;
1255         }
1256
1257         req = tcp_openreq_alloc();
1258         if (req == NULL) {
1259                 goto dropbacklog;
1260         }
1261
1262         req->rcv_wnd = 0;               /* So that tcp_send_synack() knows! */
1263
1264         req->rcv_isn = TCP_SKB_CB(skb)->seq;
1265         tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1266
1267         tp.mss_clamp = 65535;
1268         tcp_parse_options(NULL, th, &tp, want_cookie);
1269         if (tp.mss_clamp == 65535)
1270                 tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct iphdr);
1271
1272         if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
1273                 tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
1274         req->mss = tp.mss_clamp;
1275
1276         if (tp.saw_tstamp)
1277                 req->ts_recent = tp.rcv_tsval;
1278         req->tstamp_ok = tp.tstamp_ok;
1279         req->sack_ok = tp.sack_ok;
1280         req->snd_wscale = tp.snd_wscale;
1281         req->wscale_ok = tp.wscale_ok;
1282         req->rmt_port = th->source;
1283 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1284         req->lcl_port = th->dest ; /* LVE */
1285 #endif
1286         req->af.v4_req.loc_addr = daddr;
1287         req->af.v4_req.rmt_addr = saddr;
1288
1289         /* Note that we ignore the isn passed from the TIME_WAIT
1290          * state here. That's the price we pay for cookies.
1291          */
1292         if (want_cookie)
1293                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1294
1295         req->snt_isn = isn;
1296
1297         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1298
1299         req->class = &or_ipv4;
1300         req->retrans = 0;
1301         req->sk = NULL;
1302
1303         tcp_v4_send_synack(sk, req);
1304
1305         if (want_cookie) {
1306                 if (req->af.v4_req.opt)
1307                         kfree(req->af.v4_req.opt);
1308                 tcp_v4_or_free(req);
1309                 tcp_openreq_free(req);
1310         } else {
1311                 req->expires = jiffies + TCP_TIMEOUT_INIT;
1312                 tcp_inc_slow_timer(TCP_SLT_SYNACK);
1313                 tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
1314         }
1315
1316         return 0;
1317
1318 dead:
1319         SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk);
1320         tcp_statistics.TcpAttemptFails++;
1321         return -ENOTCONN; /* send reset */
1322
1323 dropbacklog:
1324         if (!want_cookie)
1325                 BACKLOG(sk)--;
1326 drop:
1327         tcp_statistics.TcpAttemptFails++;
1328         return 0;
1329 }
1330
1331 /* This is not only more efficient than what we used to do, it eliminates
1332  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
1333  *
1334  * This function wants to be moved to a common for IPv[46] file. --ANK
1335  */
1336 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
1337 {
1338         struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
1339
1340         if(newsk != NULL) {
1341                 struct tcp_opt *newtp;
1342 #ifdef CONFIG_FILTER
1343                 struct sk_filter *filter;
1344 #endif
1345
1346                 memcpy(newsk, sk, sizeof(*newsk));
1347                 newsk->sklist_next = NULL;
1348                 newsk->state = TCP_SYN_RECV;
1349
1350                 /* Clone the TCP header template */
1351                 newsk->dport = req->rmt_port;
1352
1353                 sock_lock_init(newsk);
1354
1355                 atomic_set(&newsk->rmem_alloc, 0);
1356                 skb_queue_head_init(&newsk->receive_queue);
1357                 atomic_set(&newsk->wmem_alloc, 0);
1358                 skb_queue_head_init(&newsk->write_queue);
1359                 atomic_set(&newsk->omem_alloc, 0);
1360
1361                 newsk->done = 0;
1362                 newsk->proc = 0;
1363                 newsk->pair = NULL;
1364                 newsk->backlog.head = newsk->backlog.tail = NULL;
1365                 skb_queue_head_init(&newsk->error_queue);
1366 #ifdef CONFIG_FILTER
1367                 if ((filter = newsk->filter) != NULL)
1368                         sk_filter_charge(newsk, filter);
1369 #endif
1370
1371                 /* Now setup tcp_opt */
1372                 newtp = &(newsk->tp_pinfo.af_tcp);
1373                 newtp->pred_flags = 0;
1374                 newtp->rcv_nxt = req->rcv_isn + 1;
1375                 newtp->snd_nxt = req->snt_isn + 1;
1376                 newtp->snd_una = req->snt_isn + 1;
1377                 newtp->srtt = 0;
1378                 newtp->ato = 0;
1379                 newtp->snd_wl1 = req->rcv_isn;
1380                 newtp->snd_wl2 = req->snt_isn;
1381
1382                 /* RFC1323: The window in SYN & SYN/ACK segments
1383                  * is never scaled.
1384                  */
1385                 newtp->snd_wnd = ntohs(skb->h.th->window);
1386
1387                 newtp->max_window = newtp->snd_wnd;
1388                 newtp->pending = 0;
1389                 newtp->retransmits = 0;
1390                 newtp->last_ack_sent = req->rcv_isn + 1;
1391                 newtp->backoff = 0;
1392                 newtp->mdev = TCP_TIMEOUT_INIT;
1393
1394                 /* So many TCP implementations out there (incorrectly) count the
1395                  * initial SYN frame in their delayed-ACK and congestion control
1396                  * algorithms that we must have the following bandaid to talk
1397                  * efficiently to them.  -DaveM
1398                  */
1399                 newtp->snd_cwnd = 2;
1400
1401                 newtp->rto = TCP_TIMEOUT_INIT;
1402                 newtp->packets_out = 0;
1403                 newtp->fackets_out = 0;
1404                 newtp->retrans_out = 0;
1405                 newtp->high_seq = 0;
1406                 newtp->snd_ssthresh = 0x7fffffff;
1407                 newtp->snd_cwnd_cnt = 0;
1408                 newtp->dup_acks = 0;
1409                 newtp->delayed_acks = 0;
1410                 init_timer(&newtp->retransmit_timer);
1411                 newtp->retransmit_timer.function = &tcp_retransmit_timer;
1412                 newtp->retransmit_timer.data = (unsigned long) newsk;
1413                 init_timer(&newtp->delack_timer);
1414                 newtp->delack_timer.function = &tcp_delack_timer;
1415                 newtp->delack_timer.data = (unsigned long) newsk;
1416                 skb_queue_head_init(&newtp->out_of_order_queue);
1417                 newtp->send_head = newtp->retrans_head = NULL;
1418                 newtp->rcv_wup = req->rcv_isn + 1;
1419                 newtp->write_seq = req->snt_isn + 1;
1420                 newtp->copied_seq = req->rcv_isn + 1;
1421
1422                 newtp->saw_tstamp = 0;
1423                 newtp->mss_clamp = req->mss;
1424
1425                 init_timer(&newtp->probe_timer);
1426                 newtp->probe_timer.function = &tcp_probe_timer;
1427                 newtp->probe_timer.data = (unsigned long) newsk;
1428                 newtp->probes_out = 0;
1429                 newtp->syn_seq = req->rcv_isn;
1430                 newtp->fin_seq = req->rcv_isn;
1431                 newtp->urg_data = 0;
1432                 tcp_synq_init(newtp);
1433                 newtp->syn_backlog = 0;
1434                 if (skb->len >= 536)
1435                         newtp->last_seg_size = skb->len;
1436
1437                 /* Back to base struct sock members. */
1438                 newsk->err = 0;
1439                 newsk->ack_backlog = 0;
1440                 newsk->max_ack_backlog = SOMAXCONN;
1441                 newsk->priority = 0;
1442
1443                 /* IP layer stuff */
1444                 newsk->timeout = 0;
1445                 init_timer(&newsk->timer);
1446                 newsk->timer.function = &net_timer;
1447                 newsk->timer.data = (unsigned long) newsk;
1448                 newsk->socket = NULL;
1449
1450                 newtp->tstamp_ok = req->tstamp_ok;
1451                 if((newtp->sack_ok = req->sack_ok) != 0)
1452                         newtp->num_sacks = 0;
1453                 newtp->window_clamp = req->window_clamp;
1454                 newtp->rcv_wnd = req->rcv_wnd;
1455                 newtp->wscale_ok = req->wscale_ok;
1456                 if (newtp->wscale_ok) {
1457                         newtp->snd_wscale = req->snd_wscale;
1458                         newtp->rcv_wscale = req->rcv_wscale;
1459                 } else {
1460                         newtp->snd_wscale = newtp->rcv_wscale = 0;
1461                         newtp->window_clamp = min(newtp->window_clamp,65535);
1462                 }
1463                 if (newtp->tstamp_ok) {
1464                         newtp->ts_recent = req->ts_recent;
1465                         newtp->ts_recent_stamp = tcp_time_stamp;
1466                         newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
1467                 } else {
1468                         newtp->tcp_header_len = sizeof(struct tcphdr);
1469                 }
1470         }
1471         return newsk;
1472 }
1473
1474 /*
1475  * The three way handshake has completed - we got a valid synack -
1476  * now create the new socket.
1477  */
1478 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1479                                    struct open_request *req,
1480                                    struct dst_entry *dst)
1481 {
1482         struct ip_options *opt = req->af.v4_req.opt;
1483         struct tcp_opt *newtp;
1484         struct sock *newsk;
1485
1486         if (sk->ack_backlog > sk->max_ack_backlog)
1487                 goto exit; /* head drop */
1488         if (dst == NULL) {
1489                 struct rtable *rt;
1490
1491                 if (ip_route_output(&rt,
1492                         opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
1493                         req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0))
1494                         return NULL;
1495                 dst = &rt->u.dst;
1496         }
1497 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1498         /* The new socket created for transparent proxy may fall
1499          * into a non-existed bind bucket because sk->num != newsk->num.
1500          * Ensure existance of the bucket now. The placement of the check
1501          * later will require to destroy just created newsk in the case of fail.
1502          * 1998/04/22 Andrey V. Savochkin <saw@msu.ru>
1503          */
1504         if (tcp_bucket_check(ntohs(skb->h.th->dest)))
1505                 goto exit;
1506 #endif
1507
1508         newsk = tcp_create_openreq_child(sk, req, skb);
1509         if (!newsk)
1510                 goto exit;
1511
1512         sk->tp_pinfo.af_tcp.syn_backlog--;
1513         sk->ack_backlog++;
1514
1515         newsk->dst_cache = dst;
1516
1517         newtp = &(newsk->tp_pinfo.af_tcp);
1518         newsk->daddr = req->af.v4_req.rmt_addr;
1519         newsk->saddr = req->af.v4_req.loc_addr;
1520         newsk->rcv_saddr = req->af.v4_req.loc_addr;
1521 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1522         newsk->num = ntohs(skb->h.th->dest);
1523         newsk->sport = req->lcl_port;
1524 #endif
1525         newsk->opt = req->af.v4_req.opt;
1526         newtp->ext_header_len = 0;
1527         if (newsk->opt)
1528                 newtp->ext_header_len = newsk->opt->optlen;
1529
1530         tcp_sync_mss(newsk, dst->pmtu);
1531         newtp->rcv_mss = newtp->mss_clamp;
1532
1533         /* It would be better to use newtp->mss_clamp here */
1534         if (newsk->rcvbuf < (3 * newtp->pmtu_cookie))
1535                 newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max);
1536         if (newsk->sndbuf < (3 * newtp->pmtu_cookie))
1537                 newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max);
1538
1539         tcp_v4_hash(newsk);
1540         add_to_prot_sklist(newsk);
1541         sk->data_ready(sk, 0); /* Deliver SIGIO */
1542
1543         return newsk;
1544
1545 exit:
1546         dst_release(dst);
1547         return NULL;
1548 }
1549
1550 static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
1551 {
1552         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1553         struct open_request *req, *prev;
1554
1555         req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev);
1556         if (!req)
1557                 return;
1558         /* Sequence number check required by RFC793 */
1559         if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
1560             after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
1561                 return;
1562         tcp_synq_unlink(tp, req, prev);
1563         (req->sk ? sk->ack_backlog : tp->syn_backlog)--;
1564         req->class->destructor(req);
1565         tcp_openreq_free(req);
1566
1567         net_statistics.EmbryonicRsts++;
1568 }
1569
1570 /* Check for embryonic sockets (open_requests) We check packets with
1571  * only the SYN bit set against the open_request queue too: This
1572  * increases connection latency a bit, but is required to detect
1573  * retransmitted SYNs.
1574  */
1575 static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1576 {
1577         struct tcphdr *th = skb->h.th;
1578         u32 flg = ((u32 *)th)[3];
1579
1580         /* Check for RST */
1581         if (flg & __constant_htonl(0x00040000)) {
1582                 tcp_v4_rst_req(sk, skb);
1583                 return NULL;
1584         }
1585
1586         /* Check for SYN|ACK */
1587         if (flg & __constant_htonl(0x00120000)) {
1588                 struct open_request *req, *dummy;
1589                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1590
1591                 /* Find possible connection requests. */
1592                 req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy);
1593                 if (req) {
1594                         sk = tcp_check_req(sk, skb, req);
1595                 }
1596 #ifdef CONFIG_SYN_COOKIES
1597                 else {
1598                         sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1599                 }
1600 #endif
1601         }
1602         return sk;
1603 }
1604
1605 /* The socket must have it's spinlock held when we get
1606  * here.
1607  *
1608  * We have a potential double-lock case here, so even when
1609  * doing backlog processing we use the BH locking scheme.
1610  * This is because we cannot sleep with the original spinlock
1611  * held.
1612  */
1613 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1614 {
1615         int need_unlock = 0;
1616 #ifdef CONFIG_FILTER
1617         struct sk_filter *filter = sk->filter;
1618         if (filter && sk_filter(skb, filter))
1619                 goto discard;
1620 #endif /* CONFIG_FILTER */
1621
1622         /*
1623          * This doesn't check if the socket has enough room for the packet.
1624          * Either process the packet _without_ queueing it and then free it,
1625          * or do the check later.
1626          */
1627         skb_set_owner_r(skb, sk);
1628
1629         if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1630                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1631                         goto reset;
1632                 return 0;
1633         }
1634
1635         if (sk->state == TCP_LISTEN) {
1636                 struct sock *nsk;
1637
1638                 nsk = tcp_v4_hnd_req(sk, skb);
1639                 if (!nsk)
1640                         goto discard;
1641
1642                 /*
1643                  * Queue it on the new socket if the new socket is active,
1644                  * otherwise we just shortcircuit this and continue with
1645                  * the new socket..
1646                  */
1647                 if (nsk != sk) {
1648                         bh_lock_sock(nsk);
1649                         if (nsk->lock.users != 0) {
1650                                 skb_orphan(skb);
1651                                 sk_add_backlog(nsk, skb);
1652                                 bh_unlock_sock(nsk);
1653                                 return 0;
1654                         }
1655                         need_unlock = 1;
1656                         sk = nsk;
1657                 }
1658         }
1659
1660         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1661                 goto reset;
1662         goto out_maybe_unlock;
1663
1664 reset:
1665         tcp_v4_send_reset(skb);
1666 discard:
1667         kfree_skb(skb);
1668         /* Be careful here. If this function gets more complicated and
1669          * gcc suffers from register pressure on the x86, sk (in %ebx)
1670          * might be destroyed here. This current version compiles correctly,
1671          * but you have been warned.
1672          */
1673 out_maybe_unlock:
1674         if(need_unlock)
1675                 bh_unlock_sock(sk);
1676         return 0;
1677 }
1678
1679 /*
1680  *      From tcp_input.c
1681  */
1682
1683 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1684 {
1685         struct tcphdr *th;
1686         struct sock *sk;
1687         int ret;
1688
1689         if (skb->pkt_type!=PACKET_HOST)
1690                 goto discard_it;
1691
1692         th = skb->h.th;
1693
1694         /* Pull up the IP header. */
1695         __skb_pull(skb, skb->h.raw - skb->data);
1696
1697         /* Count it even if it's bad */
1698         tcp_statistics.TcpInSegs++;
1699
1700         if (len < sizeof(struct tcphdr))
1701                 goto bad_packet;
1702
1703         /* Try to use the device checksum if provided. */
1704         switch (skb->ip_summed) {
1705         case CHECKSUM_NONE:
1706                 skb->csum = csum_partial((char *)th, len, 0);
1707         case CHECKSUM_HW:
1708                 if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
1709                         NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
1710                                         "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
1711                                         "len=%d/%d/%d\n",
1712                                         NIPQUAD(skb->nh.iph->saddr),
1713                                         ntohs(th->source),
1714                                         NIPQUAD(skb->nh.iph->daddr),
1715                                         ntohs(th->dest),
1716                                         len, skb->len,
1717                                         ntohs(skb->nh.iph->tot_len)));
1718         bad_packet:
1719                         tcp_statistics.TcpInErrs++;
1720                         goto discard_it;
1721                 }
1722         default:
1723                 /* CHECKSUM_UNNECESSARY */
1724         }
1725
1726 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1727         if (IPCB(skb)->redirport)
1728                 sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source,
1729                                          skb->nh.iph->daddr, skb->dev,
1730                                          IPCB(skb)->redirport, skb->dev->ifindex);
1731         else {
1732 #endif
1733                 SOCKHASH_LOCK_READ_BH();
1734                 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1735                                      skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1736                 SOCKHASH_UNLOCK_READ_BH();
1737 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1738                 if (!sk)
1739                         sk = tcp_v4_search_proxy_openreq(skb);
1740         }
1741 #endif
1742         if (!sk)
1743                 goto no_tcp_socket;
1744         if(!ipsec_sk_policy(sk,skb))
1745                 goto discard_it;
1746
1747         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1748         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1749                                     len - th->doff*4);
1750         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1751
1752         skb->used = 0;
1753
1754         if (sk->state == TCP_TIME_WAIT)
1755                 goto do_time_wait;
1756
1757         bh_lock_sock(sk);
1758         ret = 0;
1759         if (!sk->lock.users)
1760                 ret = tcp_v4_do_rcv(sk, skb);
1761         else
1762                 sk_add_backlog(sk, skb);
1763         bh_unlock_sock(sk);
1764
1765         return ret;
1766
1767 no_tcp_socket:
1768         tcp_v4_send_reset(skb);
1769
1770 discard_it:
1771         /* Discard frame. */
1772         kfree_skb(skb);
1773         return 0;
1774
1775 do_time_wait:
1776         if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1777                                       skb, th, skb->len))
1778                 goto no_tcp_socket;
1779         goto discard_it;
1780 }
1781
1782 int tcp_v4_rebuild_header(struct sock *sk)
1783 {
1784         struct rtable *rt = (struct rtable *)sk->dst_cache;
1785         __u32 new_saddr;
1786         int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
1787
1788         if(rt == NULL)
1789                 return 0;
1790
1791         /* Force route checking if want_rewrite.
1792          * The idea is good, the implementation is disguisting.
1793          * Well, if I made bind on this socket, you cannot randomly ovewrite
1794          * its source address. --ANK
1795          */
1796         if (want_rewrite) {
1797                 int tmp;
1798                 struct rtable *new_rt;
1799                 __u32 old_saddr = rt->rt_src;
1800
1801                 /* Query new route using another rt buffer */
1802                 tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
1803                                         RT_TOS(sk->ip_tos)|sk->localroute,
1804                                         sk->bound_dev_if);
1805
1806                 /* Only useful if different source addrs */
1807                 if (tmp == 0) {
1808                         /*
1809                          *      Only useful if different source addrs
1810                          */
1811                         if (new_rt->rt_src != old_saddr ) {
1812                                 dst_release(sk->dst_cache);
1813                                 sk->dst_cache = &new_rt->u.dst;
1814                                 rt = new_rt;
1815                                 goto do_rewrite;
1816                         }
1817                         dst_release(&new_rt->u.dst);
1818                 }
1819         }
1820         if (rt->u.dst.obsolete) {
1821                 int err;
1822                 err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
1823                 if (err) {
1824                         sk->err_soft=-err;
1825                         sk->error_report(sk);
1826                         return -1;
1827                 }
1828                 dst_release(xchg(&sk->dst_cache, &rt->u.dst));
1829         }
1830
1831         return 0;
1832
1833 do_rewrite:
1834         new_saddr = rt->rt_src;
1835
1836         /* Ouch!, this should not happen. */
1837         if (!sk->saddr || !sk->rcv_saddr) {
1838                 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
1839                        "saddr=%08lX rcv_saddr=%08lX\n",
1840                        ntohl(sk->saddr),
1841                        ntohl(sk->rcv_saddr));
1842                 return 0;
1843         }
1844
1845         if (new_saddr != sk->saddr) {
1846                 if (sysctl_ip_dynaddr > 1) {
1847                         printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1848                                "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1849                                NIPQUAD(sk->saddr),
1850                                NIPQUAD(new_saddr));
1851                 }
1852
1853                 sk->saddr = new_saddr;
1854                 sk->rcv_saddr = new_saddr;
1855                 tcp_v4_rehash(sk);
1856         }
1857
1858         return 0;
1859 }
1860
1861 static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
1862 {
1863         return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1864                              skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1865 }
1866
1867 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1868 {
1869         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1870
1871         sin->sin_family         = AF_INET;
1872         sin->sin_addr.s_addr    = sk->daddr;
1873         sin->sin_port           = sk->dport;
1874 }
1875
1876 struct tcp_func ipv4_specific = {
1877         ip_queue_xmit,
1878         tcp_v4_send_check,
1879         tcp_v4_rebuild_header,
1880         tcp_v4_conn_request,
1881         tcp_v4_syn_recv_sock,
1882         tcp_v4_get_sock,
1883         sizeof(struct iphdr),
1884
1885         ip_setsockopt,
1886         ip_getsockopt,
1887         v4_addr2sockaddr,
1888         sizeof(struct sockaddr_in)
1889 };
1890
1891 /* NOTE: A lot of things set to zero explicitly by call to
1892  *       sk_alloc() so need not be done here.
1893  */
1894 static int tcp_v4_init_sock(struct sock *sk)
1895 {
1896         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1897
1898         skb_queue_head_init(&tp->out_of_order_queue);
1899         tcp_init_xmit_timers(sk);
1900
1901         tp->rto  = TCP_TIMEOUT_INIT;            /*TCP_WRITE_TIME*/
1902         tp->mdev = TCP_TIMEOUT_INIT;
1903         tp->mss_clamp = ~0;
1904
1905         /* So many TCP implementations out there (incorrectly) count the
1906          * initial SYN frame in their delayed-ACK and congestion control
1907          * algorithms that we must have the following bandaid to talk
1908          * efficiently to them.  -DaveM
1909          */
1910         tp->snd_cwnd = 2;
1911
1912         /* See draft-stevens-tcpca-spec-01 for discussion of the
1913          * initialization of these values.
1914          */
1915         tp->snd_cwnd_cnt = 0;
1916         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1917
1918         sk->state = TCP_CLOSE;
1919         sk->max_ack_backlog = SOMAXCONN;
1920         tp->rcv_mss = 536;
1921
1922         sk->write_space = tcp_write_space;
1923
1924         /* Init SYN queue. */
1925         tcp_synq_init(tp);
1926
1927         sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1928
1929         return 0;
1930 }
1931
1932 static int tcp_v4_destroy_sock(struct sock *sk)
1933 {
1934         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1935         struct sk_buff *skb;
1936
1937         tcp_clear_xmit_timers(sk);
1938
1939         if (sk->keepopen)
1940                 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1941
1942         /* Cleanup up the write buffer. */
1943         while((skb = __skb_dequeue(&sk->write_queue)) != NULL)
1944                 kfree_skb(skb);
1945
1946         /* Cleans up our, hopefuly empty, out_of_order_queue. */
1947         while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL)
1948                 kfree_skb(skb);
1949
1950         /* Clean up a locked TCP bind bucket, this only happens if a
1951          * port is allocated for a socket, but it never fully connects.
1952          * In which case we will find num to be non-zero and daddr to
1953          * be zero.
1954          */
1955         if(sk->daddr == 0 && sk->num != 0)
1956                 tcp_bucket_unlock(sk);
1957
1958         return 0;
1959 }
1960
1961 struct proto tcp_prot = {
1962         (struct sock *)&tcp_prot,       /* sklist_next */
1963         (struct sock *)&tcp_prot,       /* sklist_prev */
1964         tcp_close,                      /* close */
1965         tcp_v4_connect,                 /* connect */
1966         tcp_accept,                     /* accept */
1967         NULL,                           /* retransmit */
1968         tcp_write_wakeup,               /* write_wakeup */
1969         tcp_read_wakeup,                /* read_wakeup */
1970         tcp_poll,                       /* poll */
1971         tcp_ioctl,                      /* ioctl */
1972         tcp_v4_init_sock,               /* init */
1973         tcp_v4_destroy_sock,            /* destroy */
1974         tcp_shutdown,                   /* shutdown */
1975         tcp_setsockopt,                 /* setsockopt */
1976         tcp_getsockopt,                 /* getsockopt */
1977         tcp_v4_sendmsg,                 /* sendmsg */
1978         tcp_recvmsg,                    /* recvmsg */
1979         NULL,                           /* bind */
1980         tcp_v4_do_rcv,                  /* backlog_rcv */
1981         tcp_v4_hash,                    /* hash */
1982         tcp_v4_unhash,                  /* unhash */
1983         tcp_v4_rehash,                  /* rehash */
1984         tcp_good_socknum,               /* good_socknum */
1985         tcp_v4_verify_bind,             /* verify_bind */
1986         128,                            /* max_header */
1987         0,                              /* retransmits */
1988         "TCP",                          /* name */
1989         0,                              /* inuse */
1990         0                               /* highestinuse */
1991 };
1992
1993
1994
1995 __initfunc(void tcp_v4_init(struct net_proto_family *ops))
1996 {
1997         int err;
1998
1999         tcp_inode.i_mode = S_IFSOCK;
2000         tcp_inode.i_sock = 1;
2001         tcp_inode.i_uid = 0;
2002         tcp_inode.i_gid = 0;
2003         init_waitqueue_head(&tcp_inode.i_wait);
2004         init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2005
2006         tcp_socket->inode = &tcp_inode;
2007         tcp_socket->state = SS_UNCONNECTED;
2008         tcp_socket->type=SOCK_RAW;
2009
2010         if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2011                 panic("Failed to create the TCP control socket.\n");
2012         tcp_socket->sk->allocation=GFP_ATOMIC;
2013         tcp_socket->sk->num = 256;              /* Don't receive any data */
2014         tcp_socket->sk->ip_ttl = MAXTTL;
2015 }