net/ipv4/tcp_ipv4.c (Linux 2.1.77)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.76 1997/12/07 04:44:19 freitag Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics (ifdefed by
42 * NEW_LISTEN for now)
43 * Juan Jose Ciarlante: ip_dynaddr bits
46 #include <linux/config.h>
47 #include <linux/types.h>
48 #include <linux/fcntl.h>
49 #include <linux/random.h>
50 #include <linux/ipsec.h>
51 #include <linux/inet.h>
53 #include <net/icmp.h>
54 #include <net/tcp.h>
55 #include <net/ipv6.h>
57 #include <asm/segment.h>
59 extern int sysctl_tcp_sack;
60 extern int sysctl_tcp_tsack;
61 extern int sysctl_tcp_timestamps;
62 extern int sysctl_tcp_window_scaling;
63 extern int sysctl_tcp_syncookies;
64 extern int sysctl_ip_dynaddr;
66 /* Check TCP sequence numbers in ICMP packets. */
67 #define ICMP_PARANOIA 1
68 #ifndef ICMP_PARANOIA
69 #define ICMP_MIN_LENGTH 4
70 #else
71 #define ICMP_MIN_LENGTH 8
72 #endif
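/* With ICMP_PARANOIA the ICMP error must quote at least 8 bytes of the
 * offending TCP header: the two port fields plus the 32-bit sequence
 * number, which tcp_v4_err() below checks against the socket's send
 * window. Without it, 4 bytes (just the ports) are enough to find the
 * socket.
 */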
74 static void tcp_v4_send_reset(struct sk_buff *skb);
76 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
77 struct sk_buff *skb);
79 /* This is for sockets with full identity only. Sockets here will always
80 * be without wildcards and will have the following invariant:
81 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
83 * First half of the table is for sockets not in TIME_WAIT, second half
84 * is for TIME_WAIT sockets only.
86 struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
88 /* All sockets in TCP_LISTEN state will be in here. This is the only table
89 * where wildcard'd TCP sockets can exist. Hash function here is just local
90 * port number.
92 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
94 /* Ok, let's try this; I give up, we do need a local binding
95 * TCP hash as well as the others for fast bind/connect.
97 struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
104 int sysctl_local_port_range[2] = { 1024, 4999 };
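/* Illustrative only: on kernels where this range is exported through
 * sysctl, the change suggested above would look roughly like
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * (the exact proc path is an assumption, not taken from this file).
 */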
106 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
107 __u32 faddr, __u16 fport)
109 return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
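/* The 4-tuple is folded with XOR and masked to the first half of the
 * table; tcp_v4_rehash() below adds TCP_HTABLE_SIZE/2 for TIME_WAIT
 * sockets, so the second half holds only those.
 */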
112 static __inline__ int tcp_sk_hashfn(struct sock *sk)
114 __u32 laddr = sk->rcv_saddr;
115 __u16 lport = sk->num;
116 __u32 faddr = sk->daddr;
117 __u16 fport = sk->dummy_th.dest;
119 return tcp_hashfn(laddr, lport, faddr, fport);
122 static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
124 struct sock *sk2;
125 int retval = 0, sk_reuse = sk->reuse;
127 SOCKHASH_LOCK();
128 sk2 = tcp_bound_hash[tcp_bhashfn(snum)];
129 for(; sk2 != NULL; sk2 = sk2->bind_next) {
130 if((sk2->num == snum) && (sk2 != sk)) {
131 unsigned char state = sk2->state;
132 int sk2_reuse = sk2->reuse;
134 /* Two sockets can be bound to the same port if they're
135 * bound to different interfaces.
138 if(sk->bound_dev_if != sk2->bound_dev_if)
139 continue;
141 if(!sk2->rcv_saddr || !sk->rcv_saddr) {
142 if((!sk2_reuse) ||
143 (!sk_reuse) ||
144 (state == TCP_LISTEN)) {
145 retval = 1;
146 break;
148 } else if(sk2->rcv_saddr == sk->rcv_saddr) {
149 if((!sk_reuse) ||
150 (!sk2_reuse) ||
151 (state == TCP_LISTEN)) {
152 retval = 1;
153 break;
158 SOCKHASH_UNLOCK();
160 return retval;
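/* In short: two sockets may share a port only if they are bound to
 * different devices or to different specific local addresses, or if
 * both set SO_REUSEADDR and the existing socket is not listening.
 * A minimal userspace sketch of the reuse case (purely illustrative,
 * not part of this file):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *
 * Without SO_REUSEADDR on both sockets the second bind() fails with
 * EADDRINUSE.
 */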
163 static __inline__ int tcp_lport_inuse(int num)
165 struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)];
167 for(; sk != NULL; sk = sk->bind_next) {
168 if(sk->num == num)
169 return 1;
171 return 0;
174 /* Find a "good" local port; this is family independent.
175 * There are several strategies working in unison here to
176 * get the best possible performance. The current socket
177 * load is kept track of; if it is zero there is a strong
178 * likelihood that there is a zero-length chain we will
179 * find with a small amount of searching, else the load is
180 * what we shoot for when the chains all have at least
181 * one entry. The base helps us walk the chains in an
182 * order such that a good chain is found as quickly as possible. -DaveM
184 unsigned short tcp_good_socknum(void)
186 static int start = 0;
187 static int binding_contour = 0;
188 int best = 0;
189 int size = 32767; /* a big num. */
190 int retval = 0, i, end, bc;
192 SOCKHASH_LOCK();
193 if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0])
194 start = sysctl_local_port_range[0];
195 i = tcp_bhashfn(start);
196 end = i + TCP_BHTABLE_SIZE;
197 bc = binding_contour;
198 do {
199 struct sock *sk = tcp_bound_hash[i&(TCP_BHTABLE_SIZE-1)];
200 if(!sk) {
201 /* find the smallest value no smaller than start
202 * that has this hash value.
204 retval = tcp_bhashnext(start-1,i&(TCP_BHTABLE_SIZE-1));
206 /* Check for decreasing load. */
207 if (bc != 0)
208 binding_contour = 0;
209 goto done;
210 } else {
211 int j = 0;
212 do { sk = sk->bind_next; } while (++j < size && sk);
213 if (j < size) {
214 best = i&(TCP_BHTABLE_SIZE-1);
215 size = j;
216 if (bc && size <= bc)
217 goto verify;
220 } while(++i != end);
221 i = best;
223 /* Socket load is increasing, adjust our load average. */
224 binding_contour = size;
225 verify:
226 if (size < binding_contour)
227 binding_contour = size;
229 retval = tcp_bhashnext(start-1,i);
231 best = retval; /* mark the starting point to avoid infinite loops */
232 while(tcp_lport_inuse(retval)) {
233 retval = tcp_bhashnext(retval,i);
234 if (retval > sysctl_local_port_range[1]) /* Upper bound */
235 retval = tcp_bhashnext(sysctl_local_port_range[0],i);
236 if (retval == best) {
237 /* This hash chain is full. No answer. */
238 retval = 0;
239 break;
243 done:
244 start = (retval + 1);
245 SOCKHASH_UNLOCK();
247 return retval;
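/* Summary of the search above: scan every bind-hash chain once starting
 * at the chain for 'start'; an empty chain wins immediately, otherwise
 * the shortest chain found (tracked in binding_contour as the current
 * "load") is walked with tcp_bhashnext() until a port inside
 * sysctl_local_port_range that tcp_lport_inuse() does not find is hit,
 * wrapping around at most once.
 */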
250 static void tcp_v4_hash(struct sock *sk)
252 unsigned char state;
254 SOCKHASH_LOCK();
255 state = sk->state;
256 if(state != TCP_CLOSE || !sk->dead) {
257 struct sock **skp;
259 if(state == TCP_LISTEN)
260 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
261 else
262 skp = &tcp_established_hash[tcp_sk_hashfn(sk)];
264 if((sk->next = *skp) != NULL)
265 (*skp)->pprev = &sk->next;
266 *skp = sk;
267 sk->pprev = skp;
268 tcp_sk_bindify(sk);
270 SOCKHASH_UNLOCK();
273 static void tcp_v4_unhash(struct sock *sk)
275 SOCKHASH_LOCK();
276 if(sk->pprev) {
277 if(sk->next)
278 sk->next->pprev = sk->pprev;
279 *sk->pprev = sk->next;
280 sk->pprev = NULL;
281 tcp_sk_unbindify(sk);
283 SOCKHASH_UNLOCK();
286 static void tcp_v4_rehash(struct sock *sk)
288 unsigned char state;
290 SOCKHASH_LOCK();
291 state = sk->state;
292 if(sk->pprev) {
293 if(sk->next)
294 sk->next->pprev = sk->pprev;
295 *sk->pprev = sk->next;
296 sk->pprev = NULL;
297 tcp_sk_unbindify(sk);
299 if(state != TCP_CLOSE || !sk->dead) {
300 struct sock **skp;
302 if(state == TCP_LISTEN) {
303 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
304 } else {
305 int hash= tcp_sk_hashfn(sk);
306 if(state == TCP_TIME_WAIT)
307 hash += (TCP_HTABLE_SIZE/2);
308 skp = &tcp_established_hash[hash];
311 if((sk->next = *skp) != NULL)
312 (*skp)->pprev = &sk->next;
313 *skp = sk;
314 sk->pprev = skp;
315 tcp_sk_bindify(sk);
317 SOCKHASH_UNLOCK();
320 /* Don't inline this cruft. There are some nice properties to
321 * exploit here. The BSD API does not allow a listening TCP
322 * to specify the remote port nor the remote address for the
323 * connection. So always assume those are both wildcarded
324 * during the search since they can never be otherwise.
326 static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
328 struct sock *sk;
329 struct sock *result = NULL;
330 int score, hiscore;
332 hiscore=0;
333 for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
334 if(sk->num == hnum) {
335 __u32 rcv_saddr = sk->rcv_saddr;
337 score = 1;
338 if(rcv_saddr) {
339 if (rcv_saddr != daddr)
340 continue;
341 score++;
343 if (sk->bound_dev_if) {
344 if (sk->bound_dev_if != dif)
345 continue;
346 score++;
348 if (score == 3)
349 return sk;
350 if (score > hiscore) {
351 hiscore = score;
352 result = sk;
356 return result;
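/* Scoring above: 1 for a matching local port, +1 for a matching bound
 * address, +1 for a matching bound device. A score of 3 is an exact
 * match and returns at once; otherwise the highest-scoring (most
 * specific) wildcard listener is returned.
 */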
359 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
360 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
362 static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
363 u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
365 unsigned short hnum = ntohs(dport);
366 struct sock *sk;
367 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
369 /* Optimize here for direct hit, only listening connections can
370 * have wildcards anyway. It is assumed that this code only
371 * gets called from within NET_BH.
373 for(sk = tcp_established_hash[hash]; sk; sk = sk->next)
374 if(sk->daddr == saddr && /* remote address */
375 sk->dummy_th.dest == sport && /* remote port */
376 sk->num == hnum && /* local port */
377 sk->rcv_saddr == daddr && /* local address */
378 (!sk->bound_dev_if || sk->bound_dev_if == dif))
379 goto hit; /* You sunk my battleship! */
381 /* Must check for a TIME_WAIT'er before going to listener hash. */
382 for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
383 if(sk->daddr == saddr && /* remote address */
384 sk->dummy_th.dest == sport && /* remote port */
385 sk->num == hnum && /* local port */
386 sk->rcv_saddr == daddr && /* local address */
387 (!sk->bound_dev_if || sk->bound_dev_if == dif))
388 goto hit;
390 sk = tcp_v4_lookup_listener(daddr, hnum, dif);
391 hit:
392 return sk;
395 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
397 return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif);
400 #ifdef CONFIG_IP_TRANSPARENT_PROXY
401 #define secondlist(hpnum, sk, fpass) \
402 ({ struct sock *s1; if(!(sk) && (fpass)--) \
403 s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \
404 else \
405 s1 = (sk); \
406 s1; \
409 #define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
410 secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass))
412 #define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
413 secondlist((hpnum),(sk)->bind_next,(fpass))
415 static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
416 unsigned short rnum, unsigned long laddr,
417 struct device *dev, unsigned short pnum,
418 int dif)
420 struct sock *s, *result = NULL;
421 int badness = -1;
422 u32 paddr = 0;
423 unsigned short hnum = ntohs(num);
424 unsigned short hpnum = ntohs(pnum);
425 int firstpass = 1;
427 if(dev && dev->ip_ptr) {
428 struct in_device *idev = dev->ip_ptr;
430 if(idev->ifa_list)
431 paddr = idev->ifa_list->ifa_local;
434 /* This code must run only from NET_BH. */
435 for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
436 s != NULL;
437 s = tcp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
438 if(s->num == hnum || s->num == hpnum) {
439 int score = 0;
440 if(s->dead && (s->state == TCP_CLOSE))
441 continue;
442 if(s->rcv_saddr) {
443 if((s->num != hpnum || s->rcv_saddr != paddr) &&
444 (s->num != hnum || s->rcv_saddr != laddr))
445 continue;
446 score++;
448 if(s->daddr) {
449 if(s->daddr != raddr)
450 continue;
451 score++;
453 if(s->dummy_th.dest) {
454 if(s->dummy_th.dest != rnum)
455 continue;
456 score++;
458 if(s->bound_dev_if) {
459 if(s->bound_dev_if != dif)
460 continue;
461 score++;
463 if(score == 4 && s->num == hnum) {
464 result = s;
465 break;
466 } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
467 result = s;
468 badness = score;
472 return result;
475 #undef secondlist
476 #undef tcp_v4_proxy_loop_init
477 #undef tcp_v4_proxy_loop_next
479 #endif
481 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
483 return secure_tcp_sequence_number(sk->saddr, sk->daddr,
484 skb->h.th->dest,
485 skb->h.th->source);
489 * From tcp.c
493 * Check that a TCP address is unique, don't allow multiple
494 * connects to/from the same address
497 static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum)
499 int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum);
500 struct sock * sk;
502 /* Make sure we are allowed to connect here.
503 * But freeze the hash while we snoop around.
505 SOCKHASH_LOCK();
506 sk = tcp_established_hash[hashent];
507 for (; sk != NULL; sk = sk->next) {
508 if(sk->daddr == daddr && /* remote address */
509 sk->dummy_th.dest == dnum && /* remote port */
510 sk->num == snum && /* local port */
511 sk->saddr == saddr) { /* local address */
512 retval = 0;
513 goto out;
517 /* Must check TIME_WAIT'ers too. */
518 sk = tcp_established_hash[hashent + (TCP_HTABLE_SIZE/2)];
519 for (; sk != NULL; sk = sk->next) {
520 if(sk->daddr == daddr && /* remote address */
521 sk->dummy_th.dest == dnum && /* remote port */
522 sk->num == snum && /* local port */
523 sk->saddr == saddr) { /* local address */
524 retval = 0;
525 goto out;
528 out:
529 SOCKHASH_UNLOCK();
530 return retval;
535 * This will initiate an outgoing connection.
538 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
540 struct sk_buff *buff;
541 int tmp;
542 struct tcphdr *th;
543 struct rtable *rt;
544 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
545 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
547 if (sk->state != TCP_CLOSE)
548 return(-EISCONN);
550 /* Don't allow a double connect. */
551 if (sk->daddr)
552 return -EINVAL;
554 if (addr_len < sizeof(struct sockaddr_in))
555 return(-EINVAL);
557 if (usin->sin_family != AF_INET) {
558 static int complained;
559 if (usin->sin_family)
560 return(-EAFNOSUPPORT);
561 if (!complained++)
562 printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
565 if (sk->dst_cache) {
566 dst_release(sk->dst_cache);
567 sk->dst_cache = NULL;
570 tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr,
571 RT_TOS(sk->ip_tos)|(sk->localroute || 0), sk->bound_dev_if);
572 if (tmp < 0)
573 return tmp;
575 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
576 ip_rt_put(rt);
577 return -ENETUNREACH;
580 if (!tcp_unique_address(rt->rt_src, sk->num, rt->rt_dst,
581 usin->sin_port)) {
582 ip_rt_put(rt);
583 return -EADDRNOTAVAIL;
586 lock_sock(sk);
588 /* Do this early, so there is less state to unwind on failure. */
589 buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL);
590 if (buff == NULL) {
591 release_sock(sk);
592 ip_rt_put(rt);
593 return(-ENOBUFS);
596 sk->dst_cache = &rt->u.dst;
597 sk->daddr = rt->rt_dst;
598 if (!sk->saddr)
599 sk->saddr = rt->rt_src;
600 sk->rcv_saddr = sk->saddr;
602 if (sk->priority == 0)
603 sk->priority = rt->u.dst.priority;
605 sk->dummy_th.dest = usin->sin_port;
607 sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
608 sk->dummy_th.source,
609 usin->sin_port);
611 tp->snd_wnd = 0;
612 tp->snd_wl1 = 0;
613 tp->snd_wl2 = sk->write_seq;
614 tp->snd_una = sk->write_seq;
616 tp->rcv_nxt = 0;
618 sk->err = 0;
620 /* Put in the IP header and routing stuff. */
621 tmp = ip_build_header(buff, sk);
622 if (tmp < 0) {
623 /* Caller has done ip_rt_put(rt) and set sk->dst_cache
624 * to NULL. We must unwind the half built TCP socket
625 * state so that this failure does not create a "stillborn"
626 * sock (ie. future re-tries of connect() would fail).
628 sk->daddr = 0;
629 sk->saddr = sk->rcv_saddr = 0;
630 kfree_skb(buff, FREE_WRITE);
631 release_sock(sk);
632 return(-ENETUNREACH);
635 /* No failure conditions can result past this point. */
637 th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
638 buff->h.th = th;
640 memcpy(th,(void *)&(sk->dummy_th), sizeof(*th));
641 buff->seq = sk->write_seq++;
642 th->seq = htonl(buff->seq);
643 tp->snd_nxt = sk->write_seq;
644 buff->end_seq = sk->write_seq;
645 th->ack = 0;
646 th->syn = 1;
648 sk->mtu = rt->u.dst.pmtu;
649 if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
650 (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
651 rt->rt_flags&RTCF_NOPMTUDISC)) &&
652 rt->u.dst.pmtu > 576)
653 sk->mtu = 576;
655 if(sk->mtu < 64)
656 sk->mtu = 64; /* Sanity limit */
658 if (sk->user_mss)
659 sk->mss = sk->user_mss;
660 else
661 sk->mss = (sk->mtu - sizeof(struct iphdr) -
662 sizeof(struct tcphdr));
664 if (sk->mss < 1) {
665 printk(KERN_DEBUG "initial sk->mss below 1\n");
666 sk->mss = 1; /* Sanity limit */
669 tp->window_clamp = rt->u.dst.window;
670 tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
671 &tp->rcv_wnd,
672 &tp->window_clamp,
673 sysctl_tcp_window_scaling,
674 &tp->rcv_wscale);
675 th->window = htons(tp->rcv_wnd);
677 tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_sack,
678 sysctl_tcp_timestamps,
679 sysctl_tcp_window_scaling,tp->rcv_wscale);
680 buff->csum = 0;
681 th->doff = (sizeof(*th)+ tmp)>>2;
683 tcp_v4_send_check(sk, th, sizeof(struct tcphdr) + tmp, buff);
685 tcp_set_state(sk,TCP_SYN_SENT);
687 /* Socket identity change complete, no longer
688 * in TCP_CLOSE, so rehash.
690 tcp_v4_rehash(sk);
692 tp->rto = rt->u.dst.rtt;
694 tcp_init_xmit_timers(sk);
696 /* Now works the right way instead of a hacked initial setting. */
697 tp->retransmits = 0;
699 skb_queue_tail(&sk->write_queue, buff);
701 tp->packets_out++;
702 buff->when = jiffies;
704 ip_queue_xmit(skb_clone(buff, GFP_KERNEL));
706 /* Timer for repeating the SYN until an answer. */
707 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
708 tcp_statistics.TcpActiveOpens++;
709 tcp_statistics.TcpOutSegs++;
711 release_sock(sk);
712 return(0);
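/* Rough order of events above: resolve the route and reject
 * multicast/broadcast destinations, make sure the 4-tuple is unique,
 * allocate the SYN skb, fill in addresses, ports and the secure initial
 * sequence number, build the IP and TCP headers with the SYN options
 * (mss/timestamps/window scaling), move to TCP_SYN_SENT and rehash,
 * then queue and transmit the SYN and arm the retransmit timer.
 */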
715 static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
717 int retval = -EINVAL;
719 /* Do sanity checking for sendmsg/sendto/send. */
720 if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT))
721 goto out;
722 if (msg->msg_name) {
723 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
725 if (msg->msg_namelen < sizeof(*addr))
726 goto out;
727 if (addr->sin_family && addr->sin_family != AF_INET)
728 goto out;
729 retval = -ENOTCONN;
730 if(sk->state == TCP_CLOSE)
731 goto out;
732 retval = -EISCONN;
733 if (addr->sin_port != sk->dummy_th.dest)
734 goto out;
735 if (addr->sin_addr.s_addr != sk->daddr)
736 goto out;
739 lock_sock(sk);
740 retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov,
741 msg->msg_flags);
743 release_sock(sk);
745 out:
746 return retval;
751 * Do a linear search in the socket open_request list.
752 * This should be replaced with a global hash table.
754 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
755 struct iphdr *iph,
756 struct tcphdr *th,
757 struct open_request **prevp)
759 struct open_request *req, *prev;
760 __u16 rport = th->source;
762 /* Assumption: the socket is not in use,
763 * as we checked the user count in tcp_rcv and we're
764 * running from a soft interrupt.
766 prev = (struct open_request *) (&tp->syn_wait_queue);
767 for (req = prev->dl_next; req; req = req->dl_next) {
768 if (req->af.v4_req.rmt_addr == iph->saddr &&
769 req->af.v4_req.loc_addr == iph->daddr &&
770 req->rmt_port == rport) {
771 *prevp = prev;
772 return req;
774 prev = req;
776 return NULL;
781 * This routine does path mtu discovery as defined in RFC1191.
783 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
785 int new_mtu;
786 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
788 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
789 * sent out by Linux are always <576 bytes, so they should go through
790 * unfragmented).
792 if (sk->state == TCP_LISTEN)
793 return;
795 /* We don't check in the dst entry if pmtu discovery is forbidden
796 * on this route. We just assume that no packet-too-big packets
797 * are sent back when pmtu discovery is not active.
798 * There is a small race when the user changes this flag in the
799 * route, but I think that's acceptable.
801 if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) {
802 new_mtu = sk->dst_cache->pmtu -
803 (ip->ihl<<2) - tp->tcp_header_len;
804 if (new_mtu < sk->mss && new_mtu > 0) {
805 sk->mss = new_mtu;
806 /* Resend the TCP packet because it's
807 * clear that the old packet has been
808 * dropped. This is the new "fast" path mtu
809 * discovery.
811 if (!sk->sock_readers)
812 tcp_simple_retransmit(sk);
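/* The value computed above is the cached route's pmtu minus the IP
 * header (ihl<<2) and the current TCP header length, and becomes the
 * new mss. Retransmitting right away instead of waiting for the
 * retransmit timer is the "fast path mtu discovery" mentioned in the
 * changelog at the top of this file.
 */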
818 * This routine is called by the ICMP module when it gets some
819 * sort of error condition. If err < 0 then the socket should
820 * be closed and the error returned to the user. If err > 0
821 * it's just the icmp type << 8 | icmp code. After adjustment
822 * header points to the first 8 bytes of the tcp header. We need
823 * to find the appropriate port.
826 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
828 struct iphdr *iph = (struct iphdr*)dp;
829 struct tcphdr *th;
830 struct tcp_opt *tp;
831 int type = skb->h.icmph->type;
832 int code = skb->h.icmph->code;
833 struct sock *sk;
834 int opening;
835 #ifdef ICMP_PARANOIA
836 __u32 seq;
837 #endif
839 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
840 icmp_statistics.IcmpInErrors++;
841 return;
844 th = (struct tcphdr*)(dp+(iph->ihl<<2));
846 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
847 if (sk == NULL) {
848 icmp_statistics.IcmpInErrors++;
849 return;
852 tp = &sk->tp_pinfo.af_tcp;
853 #ifdef ICMP_PARANOIA
854 seq = ntohl(th->seq);
855 if (sk->state != TCP_LISTEN &&
856 !between(seq, tp->snd_una, max(tp->snd_una+32768,tp->snd_nxt))) {
857 if (net_ratelimit())
858 printk(KERN_DEBUG "icmp packet outside the tcp window:"
859 " s:%d %u,%u,%u\n",
860 (int)sk->state, seq, tp->snd_una, tp->snd_nxt);
861 return;
863 #endif
865 switch (type) {
866 case ICMP_SOURCE_QUENCH:
867 tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
868 tp->snd_cwnd = tp->snd_ssthresh;
869 tp->high_seq = tp->snd_nxt;
870 return;
871 case ICMP_PARAMETERPROB:
872 sk->err=EPROTO;
873 sk->error_report(sk);
874 break;
875 case ICMP_DEST_UNREACH:
876 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
877 do_pmtu_discovery(sk, iph);
878 return;
880 break;
883 /* If we've already connected we will keep trying
884 * until we time out, or the user gives up.
886 if (code > NR_ICMP_UNREACH)
887 return;
889 opening = 0;
890 switch (sk->state) {
891 struct open_request *req, *prev;
892 case TCP_LISTEN:
893 /* Prevent race conditions with accept() -
894 * ICMP is unreliable.
896 if (sk->sock_readers) {
897 /* XXX: add a counter here to profile this.
898 * If too many ICMPs get dropped on busy
899 * servers this needs to be solved differently.
901 return;
904 if (!th->syn && !th->ack)
905 return;
906 req = tcp_v4_search_req(tp, iph, th, &prev);
907 if (!req)
908 return;
909 #ifdef ICMP_PARANOIA
910 if (seq != req->snt_isn) {
911 if (net_ratelimit())
912 printk(KERN_DEBUG "icmp packet for openreq "
913 "with wrong seq number:%d:%d\n",
914 seq, req->snt_isn);
915 return;
917 #endif
918 if (req->sk) { /* not yet accept()ed */
919 sk = req->sk; /* report error in accept */
920 } else {
921 tcp_synq_unlink(tp, req, prev);
922 req->class->destructor(req);
923 tcp_openreq_free(req);
925 /* FALL THROUGH */
926 case TCP_SYN_SENT:
927 case TCP_SYN_RECV:
928 opening = 1;
929 break;
932 if(icmp_err_convert[code].fatal || opening) {
933 sk->err = icmp_err_convert[code].errno;
934 if (opening) {
935 tcp_statistics.TcpAttemptFails++;
936 if (sk->state != TCP_LISTEN)
937 tcp_set_state(sk,TCP_CLOSE);
938 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
940 } else /* Only an error on timeout */
941 sk->err_soft = icmp_err_convert[code].errno;
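/* Summary of the ICMP handling above: SOURCE_QUENCH shrinks the
 * congestion window via ssthresh, PARAMETERPROB reports EPROTO,
 * FRAG_NEEDED triggers path MTU discovery, and the remaining
 * unreachable codes are fatal only while a connection is being opened
 * (or when icmp_err_convert marks them fatal); otherwise they are just
 * recorded in err_soft.
 */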
944 /* This routine computes an IPv4 TCP checksum. */
945 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
946 struct sk_buff *skb)
948 th->check = 0;
949 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
950 csum_partial((char *)th, th->doff<<2, skb->csum));
954 * This routine will send an RST to the other tcp.
956 * Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
957 * for the reset?
958 * Answer: if a packet caused an RST, it is not for a socket
959 * existing in our system; if it is matched to a socket,
960 * it is just a duplicate segment or a bug in the other side's TCP.
961 * So we build the reply based only on the parameters
962 * that arrived with the segment.
963 * Exception: precedence violation. We do not implement it in any case.
966 static void tcp_v4_send_reset(struct sk_buff *skb)
968 struct tcphdr *th = skb->h.th;
969 struct sk_buff *skb1;
970 struct tcphdr *th1;
972 if (th->rst)
973 return;
975 skb1 = ip_reply(skb, sizeof(struct tcphdr));
976 if (skb1 == NULL)
977 return;
979 skb1->h.th = th1 = (struct tcphdr *)skb_put(skb1, sizeof(struct tcphdr));
980 memset(th1, 0, sizeof(*th1));
982 /* Swap the send and the receive. */
983 th1->dest = th->source;
984 th1->source = th->dest;
985 th1->doff = sizeof(*th1)/4;
986 th1->rst = 1;
988 if (th->ack)
989 th1->seq = th->ack_seq;
990 else {
991 th1->ack = 1;
992 if (!th->syn)
993 th1->ack_seq = th->seq;
994 else
995 th1->ack_seq = htonl(ntohl(th->seq)+1);
998 skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0);
999 th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr,
1000 skb1->nh.iph->daddr, skb1->csum);
1001 /* FIXME: should this carry an options packet? */
1002 ip_queue_xmit(skb1);
1003 tcp_statistics.TcpOutSegs++;
1004 tcp_statistics.TcpOutRsts++;
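/* The reply above follows RFC 793: if the offending segment carried an
 * ACK, the RST uses that ack_seq as its sequence number; otherwise a
 * RST|ACK is sent acknowledging the segment's sequence number (plus one
 * if it was a SYN).
 */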
1007 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1009 * Check whether a received TCP packet might be for one of our
1010 * connections.
1013 int tcp_chkaddr(struct sk_buff *skb)
1015 struct iphdr *iph = skb->nh.iph;
1016 struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1017 struct sock *sk;
1019 sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest, skb->dev->ifindex);
1021 if (!sk)
1022 return 0;
1024 /* 0 means accept all LOCAL addresses here, not all the world... */
1026 if (sk->rcv_saddr == 0)
1027 return 0;
1029 return 1;
1031 #endif
1033 static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
1035 struct sk_buff * skb;
1036 struct tcphdr *th;
1037 int tmp;
1038 int mss;
1040 skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
1041 if (skb == NULL)
1042 return;
1044 if(ip_build_pkt(skb, sk, req->af.v4_req.loc_addr,
1045 req->af.v4_req.rmt_addr, req->af.v4_req.opt) < 0) {
1046 kfree_skb(skb, FREE_WRITE);
1047 return;
1050 mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
1051 if (sk->user_mss)
1052 mss = min(mss, sk->user_mss);
1053 skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr));
1055 /* Don't offer more than they did.
1056 * This way we don't have to memorize who said what.
1057 * FIXME: maybe this should be changed for better performance
1058 * with syncookies.
1060 req->mss = min(mss, req->mss);
1062 if (req->mss < 1) {
1063 printk(KERN_DEBUG "initial req->mss below 1\n");
1064 req->mss = 1;
1067 /* Yuck, make this header setup more efficient... -DaveM */
1068 memset(th, 0, sizeof(struct tcphdr));
1069 th->syn = 1;
1070 th->ack = 1;
1071 th->source = sk->dummy_th.source;
1072 th->dest = req->rmt_port;
1073 skb->seq = req->snt_isn;
1074 skb->end_seq = skb->seq + 1;
1075 th->seq = htonl(skb->seq);
1076 th->ack_seq = htonl(req->rcv_isn + 1);
1077 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1078 __u8 rcv_wscale;
1079 /* Set this up on the first call only */
1080 req->window_clamp = skb->dst->window;
1081 tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
1082 &req->rcv_wnd,
1083 &req->window_clamp,
1084 req->wscale_ok,
1085 &rcv_wscale);
1086 req->rcv_wscale = rcv_wscale;
1088 th->window = htons(req->rcv_wnd);
1090 /* XXX Partial csum of 4 byte quantity is itself! -DaveM
1091 * Yes, but it's a bit harder to special case now. It's
1092 * now computed inside the tcp_v4_send_check() to clean up
1093 * updating the options fields in the mainline send code.
1094 * If someone thinks this is really bad let me know and
1095 * I'll try to do it a different way. -- erics
1098 tmp = tcp_syn_build_options(skb, req->mss, req->sack_ok, req->tstamp_ok,
1099 req->wscale_ok,req->rcv_wscale);
1100 skb->csum = 0;
1101 th->doff = (sizeof(*th) + tmp)>>2;
1102 th->check = tcp_v4_check(th, sizeof(*th) + tmp,
1103 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1104 csum_partial((char *)th, sizeof(*th)+tmp, skb->csum));
1106 ip_queue_xmit(skb);
1107 tcp_statistics.TcpOutSegs++;
1110 static void tcp_v4_or_free(struct open_request *req)
1112 if(!req->sk && req->af.v4_req.opt)
1113 kfree_s(req->af.v4_req.opt,
1114 sizeof(struct ip_options) + req->af.v4_req.opt->optlen);
1117 static inline void syn_flood_warning(struct sk_buff *skb)
1119 static unsigned long warntime;
1121 if (jiffies - warntime > HZ*60) {
1122 warntime = jiffies;
1123 printk(KERN_INFO
1124 "possible SYN flooding on port %d. Sending cookies.\n",
1125 ntohs(skb->h.th->dest));
1129 int sysctl_max_syn_backlog = 1024;
1130 int sysctl_tcp_syn_taildrop = 1;
1132 struct or_calltable or_ipv4 = {
1133 tcp_v4_send_synack,
1134 tcp_v4_or_free,
1135 tcp_v4_send_reset
1138 #ifdef NEW_LISTEN
1139 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1140 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
1141 #else
1142 #define BACKLOG(sk) ((sk)->ack_backlog)
1143 #define BACKLOGMAX(sk) ((sk)->max_ack_backlog)
1144 #endif
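/* With NEW_LISTEN the SYN queue is counted per socket in
 * tp->syn_backlog and limited by the global sysctl_max_syn_backlog;
 * the old semantics reuse the listen() backlog via ack_backlog and
 * max_ack_backlog.
 */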
1146 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
1147 __u32 isn)
1149 struct ip_options *opt = (struct ip_options *) ptr;
1150 struct tcp_opt tp;
1151 struct open_request *req;
1152 struct tcphdr *th = skb->h.th;
1153 __u32 saddr = skb->nh.iph->saddr;
1154 __u32 daddr = skb->nh.iph->daddr;
1155 #ifdef CONFIG_SYN_COOKIES
1156 int want_cookie = 0;
1157 #else
1158 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1159 #endif
1161 /* If the socket is dead, don't accept the connection. */
1162 if (sk->dead)
1163 goto dead;
1165 /* XXX: Check against a global syn pool counter. */
1166 if (BACKLOG(sk) > BACKLOGMAX(sk)) {
1167 #ifdef CONFIG_SYN_COOKIES
1168 if (sysctl_tcp_syncookies) {
1169 syn_flood_warning(skb);
1170 want_cookie = 1;
1171 } else
1172 #endif
1173 if (sysctl_tcp_syn_taildrop) {
1174 struct open_request *req;
1176 req = tcp_synq_unlink_tail(&sk->tp_pinfo.af_tcp);
1177 tcp_openreq_free(req);
1178 tcp_statistics.TcpAttemptFails++;
1179 } else {
1180 goto error;
1182 } else {
1183 if (isn == 0)
1184 isn = tcp_v4_init_sequence(sk, skb);
1185 BACKLOG(sk)++;
1188 req = tcp_openreq_alloc();
1189 if (req == NULL) {
1190 if (!want_cookie) BACKLOG(sk)--;
1191 goto error;
1194 req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
1196 req->rcv_isn = skb->seq;
1197 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1198 tp.in_mss = 536;
1199 tcp_parse_options(th,&tp,want_cookie);
1200 if (tp.saw_tstamp)
1201 req->ts_recent = tp.rcv_tsval;
1202 req->mss = tp.in_mss;
1203 req->tstamp_ok = tp.tstamp_ok;
1204 req->sack_ok = tp.sack_ok;
1205 req->snd_wscale = tp.snd_wscale;
1206 req->wscale_ok = tp.wscale_ok;
1207 req->rmt_port = th->source;
1208 req->af.v4_req.loc_addr = daddr;
1209 req->af.v4_req.rmt_addr = saddr;
1211 /* Note that we ignore the isn passed from the TIME_WAIT
1212 * state here. That's the price we pay for cookies.
1214 if (want_cookie)
1215 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1217 req->snt_isn = isn;
1219 /* IPv4 options */
1220 req->af.v4_req.opt = NULL;
1222 if (opt && opt->optlen) {
1223 int opt_size = sizeof(struct ip_options) + opt->optlen;
1225 req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
1226 if (req->af.v4_req.opt) {
1227 if (ip_options_echo(req->af.v4_req.opt, skb)) {
1228 kfree_s(req->af.v4_req.opt, opt_size);
1229 req->af.v4_req.opt = NULL;
1233 req->class = &or_ipv4;
1234 req->retrans = 0;
1235 req->sk = NULL;
1237 tcp_v4_send_synack(sk, req);
1239 if (want_cookie) {
1240 if (req->af.v4_req.opt)
1241 kfree(req->af.v4_req.opt);
1242 tcp_openreq_free(req);
1243 } else {
1244 req->expires = jiffies + TCP_TIMEOUT_INIT;
1245 tcp_inc_slow_timer(TCP_SLT_SYNACK);
1246 tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
1249 sk->data_ready(sk, 0);
1250 exit:
1251 return 0;
1253 dead:
1254 SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk);
1255 tcp_statistics.TcpAttemptFails++;
1256 return -ENOTCONN;
1257 error:
1258 tcp_statistics.TcpAttemptFails++;
1259 goto exit;
1262 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1263 struct open_request *req,
1264 struct dst_entry *dst)
1266 struct tcp_opt *newtp;
1267 struct sock *newsk;
1268 int snd_mss;
1270 #ifdef NEW_LISTEN
1271 if (sk->ack_backlog > sk->max_ack_backlog)
1272 goto exit; /* head drop */
1273 #endif
1274 newsk = sk_alloc(AF_INET, GFP_ATOMIC);
1275 if (!newsk)
1276 goto exit;
1277 #ifdef NEW_LISTEN
1278 sk->ack_backlog++;
1279 #endif
1280 memcpy(newsk, sk, sizeof(*newsk));
1282 /* Or else we die! -DaveM */
1283 newsk->sklist_next = NULL;
1285 newsk->opt = req->af.v4_req.opt;
1287 skb_queue_head_init(&newsk->write_queue);
1288 skb_queue_head_init(&newsk->receive_queue);
1289 skb_queue_head_init(&newsk->out_of_order_queue);
1290 skb_queue_head_init(&newsk->error_queue);
1292 /* Unused */
1293 newtp = &(newsk->tp_pinfo.af_tcp);
1294 newtp->send_head = NULL;
1295 newtp->retrans_head = NULL;
1297 newtp->pending = 0;
1299 skb_queue_head_init(&newsk->back_log);
1301 newsk->prot->init(newsk);
1303 newtp->snd_cwnd_cnt = 0;
1304 newtp->backoff = 0;
1305 newsk->proc = 0;
1306 newsk->done = 0;
1307 newsk->pair = NULL;
1308 atomic_set(&newsk->wmem_alloc, 0);
1309 atomic_set(&newsk->rmem_alloc, 0);
1310 newsk->localroute = sk->localroute;
1312 newsk->err = 0;
1313 newsk->shutdown = 0;
1314 newsk->ack_backlog = 0;
1316 newtp->fin_seq = req->rcv_isn;
1317 newsk->syn_seq = req->rcv_isn;
1318 newsk->state = TCP_SYN_RECV;
1319 newsk->timeout = 0;
1321 newsk->write_seq = req->snt_isn;
1323 newtp->snd_wnd = ntohs(skb->h.th->window);
1324 newtp->max_window = newtp->snd_wnd;
1325 newtp->snd_wl1 = req->rcv_isn;
1326 newtp->snd_wl2 = newsk->write_seq;
1327 newtp->snd_una = newsk->write_seq++;
1328 newtp->snd_nxt = newsk->write_seq;
1330 newsk->urg_data = 0;
1331 newtp->packets_out = 0;
1332 newtp->retransmits = 0;
1333 newsk->linger=0;
1334 newsk->destroy = 0;
1335 init_timer(&newsk->timer);
1336 newsk->timer.data = (unsigned long) newsk;
1337 newsk->timer.function = &net_timer;
1339 tcp_init_xmit_timers(newsk);
1341 newsk->dummy_th.source = sk->dummy_th.source;
1342 newsk->dummy_th.dest = req->rmt_port;
1343 newsk->sock_readers=0;
1345 newtp->last_ack_sent = newtp->rcv_nxt = req->rcv_isn + 1;
1346 newtp->rcv_wup = req->rcv_isn + 1;
1347 newsk->copied_seq = req->rcv_isn + 1;
1349 newsk->socket = NULL;
1351 newsk->daddr = req->af.v4_req.rmt_addr;
1352 newsk->saddr = req->af.v4_req.loc_addr;
1353 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1355 /* options / mss / route_cache */
1356 if (dst == NULL) {
1357 struct rtable *rt;
1359 if (ip_route_output(&rt,
1360 newsk->opt && newsk->opt->srr ?
1361 newsk->opt->faddr : newsk->daddr,
1362 newsk->saddr, newsk->ip_tos, 0)) {
1363 sk_free(newsk);
1364 return NULL;
1366 dst = &rt->u.dst;
1368 newsk->dst_cache = dst;
1370 snd_mss = dst->pmtu;
1372 /* FIXME: is mtu really the same as snd_mss? */
1373 newsk->mtu = snd_mss;
1374 /* FIXME: where does mtu get used after this? */
1375 /* sanity check */
1376 if (newsk->mtu < 64)
1377 newsk->mtu = 64;
1379 newtp->sack_ok = req->sack_ok;
1380 newtp->tstamp_ok = req->tstamp_ok;
1381 newtp->window_clamp = req->window_clamp;
1382 newtp->rcv_wnd = req->rcv_wnd;
1383 newtp->wscale_ok = req->wscale_ok;
1384 if (newtp->wscale_ok) {
1385 newtp->snd_wscale = req->snd_wscale;
1386 newtp->rcv_wscale = req->rcv_wscale;
1387 } else {
1388 newtp->snd_wscale = newtp->rcv_wscale = 0;
1389 newtp->window_clamp = min(newtp->window_clamp,65535);
1391 if (newtp->tstamp_ok) {
1392 newtp->ts_recent = req->ts_recent;
1393 newtp->ts_recent_stamp = jiffies;
1394 newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define constant! */
1395 newsk->dummy_th.doff += 3;
1396 } else {
1397 newtp->tcp_header_len = sizeof(struct tcphdr);
1400 snd_mss -= sizeof(struct iphdr) + sizeof(struct tcphdr);
1401 if (sk->user_mss)
1402 snd_mss = min(snd_mss, sk->user_mss);
1404 /* Make sure our mtu is adjusted for headers. */
1405 newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len;
1407 tcp_v4_hash(newsk);
1408 add_to_prot_sklist(newsk);
1409 return newsk;
1411 exit:
1412 if (dst)
1413 dst_release(dst);
1414 return NULL;
1417 static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
1419 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1420 struct open_request *req, *prev;
1422 req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev);
1423 if (!req)
1424 return;
1425 /* Sequence number check required by RFC793 */
1426 if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1))
1427 return;
1428 tcp_synq_unlink(tp, req, prev);
1429 req->class->destructor(req);
1430 tcp_openreq_free(req);
1433 /* Check for embryonic sockets (open_requests) We check packets with
1434 * only the SYN bit set against the open_request queue too: This
1435 * increases connection latency a bit, but is required to detect
1436 * retransmitted SYNs.
1438 static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1440 struct tcphdr *th = skb->h.th;
1441 u32 flg = ((u32 *)th)[3];
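/* ((u32 *)th)[3] is the fourth 32-bit word of the TCP header, i.e. data
 * offset, flags and window. Masking it with __constant_htonl(0x00040000)
 * tests the RST bit, and with __constant_htonl(0x00120000) the SYN and
 * ACK bits.
 */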
1443 /* Check for RST */
1444 if (flg & __constant_htonl(0x00040000)) {
1445 tcp_v4_rst_req(sk, skb);
1446 return NULL;
1449 /* Check for SYN|ACK */
1450 if (flg & __constant_htonl(0x00120000)) {
1451 struct open_request *req, *dummy;
1452 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1454 /* Find possible connection requests. */
1455 req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy);
1456 if (req) {
1457 sk = tcp_check_req(sk, skb, req);
1459 #ifdef CONFIG_SYN_COOKIES
1460 else {
1461 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1463 #endif
1465 return sk;
1468 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1470 #ifdef CONFIG_FILTER
1471 if (sk->filter)
1473 if (sk_filter(skb, sk->filter_data, sk->filter))
1474 return -EPERM; /* Toss packet */
1476 #endif /* CONFIG_FILTER */
1478 skb_set_owner_r(skb, sk);
1481 * socket locking is here for SMP purposes as backlog rcv
1482 * is currently called with bh processing disabled.
1484 lock_sock(sk);
1486 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1487 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1488 goto reset;
1489 release_sock(sk);
1490 return 0;
1494 if (sk->state == TCP_LISTEN) {
1495 struct sock *nsk;
1497 nsk = tcp_v4_hnd_req(sk, skb);
1498 if (!nsk)
1499 goto discard;
1500 lock_sock(nsk);
1501 release_sock(sk);
1502 sk = nsk;
1505 if (tcp_rcv_state_process(sk, skb, skb->h.th,
1506 &(IPCB(skb)->opt), skb->len))
1507 goto reset;
1508 release_sock(sk);
1509 return 0;
1511 reset:
1512 tcp_v4_send_reset(skb);
1513 discard:
1514 kfree_skb(skb, FREE_READ);
1515 /* Be careful here. If this function gets more complicated and
1516 * gcc suffers from register pressure on the x86, sk (in %ebx)
1517 * might be destroyed here. This current version compiles correctly,
1518 * but you have been warned.
1520 release_sock(sk);
1521 return 0;
1525 * From tcp_input.c
1528 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1530 struct tcphdr *th;
1531 struct sock *sk;
1533 if (skb->pkt_type!=PACKET_HOST)
1534 goto discard_it;
1536 th = skb->h.th;
1538 /* Pull up the IP header. */
1539 __skb_pull(skb, skb->h.raw - skb->data);
1541 /* Count it even if it's bad */
1542 tcp_statistics.TcpInSegs++;
1544 /* Try to use the device checksum if provided. */
1545 switch (skb->ip_summed) {
1546 case CHECKSUM_NONE:
1547 skb->csum = csum_partial((char *)th, len, 0);
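/* Deliberate fall through: after computing the partial checksum for
 * CHECKSUM_NONE we still verify it exactly as for CHECKSUM_HW;
 * CHECKSUM_UNNECESSARY skips the check entirely.
 */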
1548 case CHECKSUM_HW:
1549 if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
1550 printk(KERN_DEBUG "TCPv4 bad checksum from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, len=%d/%d/%d\n",
1551 NIPQUAD(skb->nh.iph->saddr), ntohs(th->source), NIPQUAD(skb->nh.iph->daddr),
1552 ntohs(th->dest), len, skb->len, ntohs(skb->nh.iph->tot_len));
1553 tcp_statistics.TcpInErrs++;
1554 goto discard_it;
1556 default:
1557 /* CHECKSUM_UNNECESSARY */
1560 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1561 if (IPCB(skb)->redirport)
1562 sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source,
1563 skb->nh.iph->daddr, skb->dev,
1564 IPCB(skb)->redirport, skb->dev->ifindex);
1565 else
1566 #endif
1567 sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source,
1568 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1569 if (!sk)
1570 goto no_tcp_socket;
1571 if(!ipsec_sk_policy(sk,skb))
1572 goto discard_it;
1574 skb->seq = ntohl(th->seq);
1575 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
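/* end_seq is the sequence number just past this segment: the payload
 * (len minus the header length) plus one each for SYN and FIN, which
 * occupy sequence space per RFC 793.
 */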
1576 skb->ack_seq = ntohl(th->ack_seq);
1578 skb->used = 0;
1580 if (!sk->sock_readers)
1581 return tcp_v4_do_rcv(sk, skb);
1583 __skb_queue_tail(&sk->back_log, skb);
1584 return 0;
1586 no_tcp_socket:
1587 tcp_v4_send_reset(skb);
1589 discard_it:
1590 /* Discard frame. */
1591 kfree_skb(skb, FREE_READ);
1592 return 0;
1595 int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb)
1597 return ip_build_header(skb, sk);
1600 int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb)
1602 struct rtable *rt;
1603 struct iphdr *iph;
1604 struct tcphdr *th;
1605 int size;
1606 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
1608 /* Check route */
1610 rt = (struct rtable*)skb->dst;
1612 /* Force route checking if want_rewrite */
1613 if (want_rewrite) {
1614 int tmp;
1615 __u32 old_saddr = rt->rt_src;
1617 /* Query new route */
1618 tmp = ip_route_connect(&rt, rt->rt_dst, 0,
1619 RT_TOS(sk->ip_tos)|(sk->localroute||0),
1620 sk->bound_dev_if);
1622 /* Only useful if different source addrs */
1623 if (tmp == 0 || rt->rt_src != old_saddr ) {
1624 dst_release(skb->dst);
1625 skb->dst = &rt->u.dst;
1626 } else {
1627 want_rewrite = 0;
1628 dst_release(&rt->u.dst);
1630 } else
1631 if (rt->u.dst.obsolete) {
1632 int err;
1633 err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos, rt->key.oif);
1634 if (err) {
1635 sk->err_soft=-err;
1636 sk->error_report(skb->sk);
1637 return -1;
1639 dst_release(skb->dst);
1640 skb->dst = &rt->u.dst;
1643 /* Discard the surplus MAC header. */
1644 skb_pull(skb, skb->nh.raw-skb->data);
1646 iph = skb->nh.iph;
1647 th = skb->h.th;
1648 size = skb->tail - skb->h.raw;
1650 if (want_rewrite) {
1651 __u32 new_saddr = rt->rt_src;
1654 * Ouch!, this should not happen.
1656 if (!sk->saddr || !sk->rcv_saddr) {
1657 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: saddr=%08lX rcv_saddr=%08lX\n",
1658 ntohl(sk->saddr),
1659 ntohl(sk->rcv_saddr));
1660 return 0;
1664 * Maybe we are in an skb chain loop and the socket address has
1665 * not yet been 'damaged'.
1668 if (new_saddr != sk->saddr) {
1669 if (sysctl_ip_dynaddr > 1) {
1670 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1671 NIPQUAD(sk->saddr),
1672 NIPQUAD(new_saddr));
1675 sk->saddr = new_saddr;
1676 sk->rcv_saddr = new_saddr;
1677 /* sk->prot->rehash(sk); */
1678 tcp_v4_rehash(sk);
1681 if (new_saddr != iph->saddr) {
1682 if (sysctl_ip_dynaddr > 1) {
1683 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting iph->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1684 NIPQUAD(iph->saddr),
1685 NIPQUAD(new_saddr));
1688 iph->saddr = new_saddr;
1689 ip_send_check(iph);
1694 return 0;
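/* The ip_dynaddr path above: while still in SYN_SENT, a changed
 * preferred source address on the re-resolved route is written back
 * into the socket and into the queued segment's IP header, the socket
 * is rehashed, and with sysctl_ip_dynaddr > 1 each rewrite is logged.
 */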
1697 static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
1699 return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1700 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1703 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1705 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1707 sin->sin_family = AF_INET;
1708 sin->sin_addr.s_addr = sk->daddr;
1709 sin->sin_port = sk->dummy_th.dest;
1712 struct tcp_func ipv4_specific = {
1713 tcp_v4_build_header,
1714 ip_queue_xmit,
1715 tcp_v4_send_check,
1716 tcp_v4_rebuild_header,
1717 tcp_v4_conn_request,
1718 tcp_v4_syn_recv_sock,
1719 tcp_v4_get_sock,
1720 ip_setsockopt,
1721 ip_getsockopt,
1722 v4_addr2sockaddr,
1723 sizeof(struct sockaddr_in)
1726 static int tcp_v4_init_sock(struct sock *sk)
1728 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1730 skb_queue_head_init(&sk->out_of_order_queue);
1731 tcp_init_xmit_timers(sk);
1733 tp->srtt = 0;
1734 tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
1735 tp->mdev = TCP_TIMEOUT_INIT;
1737 tp->ato = 0;
1738 tp->iat = (HZ/5) << 3;
1740 /* FIXME: tie this to sk->rcvbuf? (May be unnecessary) */
1741 /* tp->rcv_wnd = 8192; */
1742 tp->tstamp_ok = 0;
1743 tp->sack_ok = 0;
1744 tp->wscale_ok = 0;
1745 tp->in_mss = 536;
1746 tp->snd_wscale = 0;
1747 tp->sacks = 0;
1748 tp->saw_tstamp = 0;
1749 tp->syn_backlog = 0;
1752 * See draft-stevens-tcpca-spec-01 for discussion of the
1753 * initialization of these values.
1755 tp->snd_cwnd = 1;
1756 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1758 sk->priority = 1;
1759 sk->state = TCP_CLOSE;
1761 sk->max_ack_backlog = SOMAXCONN;
1763 sk->mtu = 576;
1764 sk->mss = 536;
1766 /* Speed up by setting some standard state for the dummy_th. */
1767 sk->dummy_th.ack=1;
1768 sk->dummy_th.doff=sizeof(struct tcphdr)>>2;
1770 /* Init SYN queue. */
1771 tcp_synq_init(tp);
1773 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1775 return 0;
1778 static int tcp_v4_destroy_sock(struct sock *sk)
1780 struct sk_buff *skb;
1782 tcp_clear_xmit_timers(sk);
1784 if (sk->keepopen)
1785 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1787 /* Clean up the write buffer. */
1788 while((skb = skb_dequeue(&sk->write_queue)) != NULL)
1789 kfree_skb(skb, FREE_WRITE);
1791 /* Clean up our, hopefully empty, out_of_order_queue. */
1792 while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL)
1793 kfree_skb(skb, FREE_READ);
1795 return 0;
1798 struct proto tcp_prot = {
1799 (struct sock *)&tcp_prot, /* sklist_next */
1800 (struct sock *)&tcp_prot, /* sklist_prev */
1801 tcp_close, /* close */
1802 tcp_v4_connect, /* connect */
1803 tcp_accept, /* accept */
1804 NULL, /* retransmit */
1805 tcp_write_wakeup, /* write_wakeup */
1806 tcp_read_wakeup, /* read_wakeup */
1807 tcp_poll, /* poll */
1808 tcp_ioctl, /* ioctl */
1809 tcp_v4_init_sock, /* init */
1810 tcp_v4_destroy_sock, /* destroy */
1811 tcp_shutdown, /* shutdown */
1812 tcp_setsockopt, /* setsockopt */
1813 tcp_getsockopt, /* getsockopt */
1814 tcp_v4_sendmsg, /* sendmsg */
1815 tcp_recvmsg, /* recvmsg */
1816 NULL, /* bind */
1817 tcp_v4_do_rcv, /* backlog_rcv */
1818 tcp_v4_hash, /* hash */
1819 tcp_v4_unhash, /* unhash */
1820 tcp_v4_rehash, /* rehash */
1821 tcp_good_socknum, /* good_socknum */
1822 tcp_v4_verify_bind, /* verify_bind */
1823 128, /* max_header */
1824 0, /* retransmits */
1825 "TCP", /* name */
1826 0, /* inuse */
1827 0 /* highestinuse */