net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63
  64 #include <net/net_namespace.h>
  65 #include <net/icmp.h>
  66 #include <net/inet_hashtables.h>
  67 #include <net/tcp.h>
  68 #include <net/transp_v6.h>
  69 #include <net/ipv6.h>
  70 #include <net/inet_common.h>
  71 #include <net/timewait_sock.h>
  72 #include <net/xfrm.h>
  73 #include <net/netdma.h>
  74
  75 #include <linux/inet.h>
  76 #include <linux/ipv6.h>
  77 #include <linux/stddef.h>
  78 #include <linux/proc_fs.h>
  79 #include <linux/seq_file.h>
  80
  81 #include <linux/crypto.h>
  82 #include <linux/scatterlist.h>
  83
  84 int sysctl_tcp_tw_reuse __read_mostly;
  85 int sysctl_tcp_low_latency __read_mostly;
  86
  87
  88 #ifdef CONFIG_TCP_MD5SIG
  89 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  90                                                    __be32 addr);
  91 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  92                                __be32 daddr, __be32 saddr, struct tcphdr *th);
  93 #else
  94 static inline
  95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  96 {
  97         return NULL;
  98 }
  99 #endif
 100
 101 struct inet_hashinfo tcp_hashinfo;
 102
 103 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 104 {
 105         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 106                                           ip_hdr(skb)->saddr,
 107                                           tcp_hdr(skb)->dest,
 108                                           tcp_hdr(skb)->source);
 109 }
 110
 111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112 {
 113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 114         struct tcp_sock *tp = tcp_sk(sk);
 115
 116         /* With PAWS, it is safe from the viewpoint
 117            of data integrity. Even without PAWS it is safe provided sequence
 118            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 119
 120            Actually, the idea is close to VJ's one, only timestamp cache is
 121            held not per host, but per port pair and TW bucket is used as state
 122            holder.
 123
 124            If TW bucket has been already destroyed we fall back to VJ's scheme
 125            and use initial timestamp retrieved from peer table.
 126          */
 127         if (tcptw->tw_ts_recent_stamp &&
 128             (twp == NULL || (sysctl_tcp_tw_reuse &&
 129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 131                 if (tp->write_seq == 0)
 132                         tp->write_seq = 1;
 133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 135                 sock_hold(sktw);
 136                 return 1;
 137         }
 138
 139         return 0;
 140 }
 141
 142 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 143
 144 /* This will initiate an outgoing connection. */
 145 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 146 {
 147         struct inet_sock *inet = inet_sk(sk);
 148         struct tcp_sock *tp = tcp_sk(sk);
 149         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 150         struct rtable *rt;
 151         __be32 daddr, nexthop;
 152         int tmp;
 153         int err;
 154
 155         if (addr_len < sizeof(struct sockaddr_in))
 156                 return -EINVAL;
 157
 158         if (usin->sin_family != AF_INET)
 159                 return -EAFNOSUPPORT;
 160
 161         nexthop = daddr = usin->sin_addr.s_addr;
 162         if (inet->opt && inet->opt->srr) {
 163                 if (!daddr)
 164                         return -EINVAL;
 165                 nexthop = inet->opt->faddr;
 166         }
 167
 168         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 169                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 170                                IPPROTO_TCP,
 171                                inet->sport, usin->sin_port, sk, 1);
 172         if (tmp < 0) {
 173                 if (tmp == -ENETUNREACH)
 174                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 175                 return tmp;
 176         }
 177
 178         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 179                 ip_rt_put(rt);
 180                 return -ENETUNREACH;
 181         }
 182
 183         if (!inet->opt || !inet->opt->srr)
 184                 daddr = rt->rt_dst;
 185
 186         if (!inet->saddr)
 187                 inet->saddr = rt->rt_src;
 188         inet->rcv_saddr = inet->saddr;
 189
 190         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 191                 /* Reset inherited state */
 192                 tp->rx_opt.ts_recent       = 0;
 193                 tp->rx_opt.ts_recent_stamp = 0;
 194                 tp->write_seq              = 0;
 195         }
 196
 197         if (tcp_death_row.sysctl_tw_recycle &&
 198             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 199                 struct inet_peer *peer = rt_get_peer(rt);
 200                 /*
 201                  * VJ's idea. We save last timestamp seen from
 202                  * the destination in peer table, when entering state
 203                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 204                  * when trying new connection.
 205                  */
 206                 if (peer != NULL &&
 207                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
 208                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 209                         tp->rx_opt.ts_recent = peer->tcp_ts;
 210                 }
 211         }
 212
 213         inet->dport = usin->sin_port;
 214         inet->daddr = daddr;
 215
 216         inet_csk(sk)->icsk_ext_hdr_len = 0;
 217         if (inet->opt)
 218                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 219
 220         tp->rx_opt.mss_clamp = 536;
 221
 222         /* Socket identity is still unknown (sport may be zero).
 223          * However we set state to SYN-SENT and not releasing socket
 224          * lock select source port, enter ourselves into the hash tables and
 225          * complete initialization after this.
 226          */
 227         tcp_set_state(sk, TCP_SYN_SENT);
 228         err = inet_hash_connect(&tcp_death_row, sk);
 229         if (err)
 230                 goto failure;
 231
 232         err = ip_route_newports(&rt, IPPROTO_TCP,
 233                                 inet->sport, inet->dport, sk);
 234         if (err)
 235                 goto failure;
 236
 237         /* OK, now commit destination to socket.  */
 238         sk->sk_gso_type = SKB_GSO_TCPV4;
 239         sk_setup_caps(sk, &rt->u.dst);
 240
 241         if (!tp->write_seq)
 242                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 243                                                            inet->daddr,
 244                                                            inet->sport,
 245                                                            usin->sin_port);
 246
 247         inet->id = tp->write_seq ^ jiffies;
 248
 249         err = tcp_connect(sk);
 250         rt = NULL;
 251         if (err)
 252                 goto failure;
 253
 254         return 0;
 255
 256 failure:
 257         /*
 258          * This unhashes the socket and releases the local port,
 259          * if necessary.
 260          */
 261         tcp_set_state(sk, TCP_CLOSE);
 262         ip_rt_put(rt);
 263         sk->sk_route_caps = 0;
 264         inet->dport = 0;
 265         return err;
 266 }
 267
 268 /*
 269  * This routine does path mtu discovery as defined in RFC1191.
 270  */
 271 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 272 {
 273         struct dst_entry *dst;
 274         struct inet_sock *inet = inet_sk(sk);
 275
 276         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 277          * send out by Linux are always <576bytes so they should go through
 278          * unfragmented).
 279          */
 280         if (sk->sk_state == TCP_LISTEN)
 281                 return;
 282
 283         /* We don't check in the destentry if pmtu discovery is forbidden
 284          * on this route. We just assume that no packet_to_big packets
 285          * are send back when pmtu discovery is not active.
 286          * There is a small race when the user changes this flag in the
 287          * route, but I think that's acceptable.
 288          */
 289         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 290                 return;
 291
 292         dst->ops->update_pmtu(dst, mtu);
 293
 294         /* Something is about to be wrong... Remember soft error
 295          * for the case, if this connection will not able to recover.
 296          */
 297         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 298                 sk->sk_err_soft = EMSGSIZE;
 299
 300         mtu = dst_mtu(dst);
 301
 302         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 303             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 304                 tcp_sync_mss(sk, mtu);
 305
 306                 /* Resend the TCP packet because it's
 307                  * clear that the old packet has been
 308                  * dropped. This is the new "fast" path mtu
 309                  * discovery.
 310                  */
 311                 tcp_simple_retransmit(sk);
 312         } /* else let the usual retransmit timer handle it */
 313 }
 314
 315 /*
 316  * This routine is called by the ICMP module when it gets some
 317  * sort of error condition.  If err < 0 then the socket should
 318  * be closed and the error returned to the user.  If err > 0
 319  * it's just the icmp type << 8 | icmp code.  After adjustment
 320  * header points to the first 8 bytes of the tcp header.  We need
 321  * to find the appropriate port.
 322  *
 323  * The locking strategy used here is very "optimistic". When
 324  * someone else accesses the socket the ICMP is just dropped
 325  * and for some paths there is no check at all.
 326  * A more general error queue to queue errors for later handling
 327  * is probably better.
 328  *
 329  */
 330
 331 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 332 {
 333         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
 334         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 335         struct inet_connection_sock *icsk;
 336         struct tcp_sock *tp;
 337         struct inet_sock *inet;
 338         const int type = icmp_hdr(icmp_skb)->type;
 339         const int code = icmp_hdr(icmp_skb)->code;
 340         struct sock *sk;
 341         struct sk_buff *skb;
 342         __u32 seq;
 343         __u32 remaining;
 344         int err;
 345         struct net *net = dev_net(icmp_skb->dev);
 346
 347         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 348                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 349                 return;
 350         }
 351
 352         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 353                         iph->saddr, th->source, inet_iif(icmp_skb));
 354         if (!sk) {
 355                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 356                 return;
 357         }
 358         if (sk->sk_state == TCP_TIME_WAIT) {
 359                 inet_twsk_put(inet_twsk(sk));
 360                 return;
 361         }
 362
 363         bh_lock_sock(sk);
 364         /* If too many ICMPs get dropped on busy
 365          * servers this needs to be solved differently.
 366          */
 367         if (sock_owned_by_user(sk))
 368                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 369
 370         if (sk->sk_state == TCP_CLOSE)
 371                 goto out;
 372
 373         icsk = inet_csk(sk);
 374         tp = tcp_sk(sk);
 375         seq = ntohl(th->seq);
 376         if (sk->sk_state != TCP_LISTEN &&
 377             !between(seq, tp->snd_una, tp->snd_nxt)) {
 378                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 379                 goto out;
 380         }
 381
 382         switch (type) {
 383         case ICMP_SOURCE_QUENCH:
 384                 /* Just silently ignore these. */
 385                 goto out;
 386         case ICMP_PARAMETERPROB:
 387                 err = EPROTO;
 388                 break;
 389         case ICMP_DEST_UNREACH:
 390                 if (code > NR_ICMP_UNREACH)
 391                         goto out;
 392
 393                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 394                         if (!sock_owned_by_user(sk))
 395                                 do_pmtu_discovery(sk, iph, info);
 396                         goto out;
 397                 }
 398
 399                 err = icmp_err_convert[code].errno;
 400                 /* check if icmp_skb allows revert of backoff
 401                  * (see draft-zimmermann-tcp-lcd) */
 402                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 403                         break;
 404                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 405                     !icsk->icsk_backoff)
 406                         break;
 407
 408                 icsk->icsk_backoff--;
 409                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 410                                          icsk->icsk_backoff;
 411                 tcp_bound_rto(sk);
 412
 413                 skb = tcp_write_queue_head(sk);
 414                 BUG_ON(!skb);
 415
 416                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 417                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 418
 419                 if (remaining) {
 420                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 421                                                   remaining, TCP_RTO_MAX);
 422                 } else if (sock_owned_by_user(sk)) {
 423                         /* RTO revert clocked out retransmission,
 424                          * but socket is locked. Will defer. */
 425                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 426                                                   HZ/20, TCP_RTO_MAX);
 427                 } else {
 428                         /* RTO revert clocked out retransmission.
 429                          * Will retransmit now */
 430                         tcp_retransmit_timer(sk);
 431                 }
 432
 433                 break;
 434         case ICMP_TIME_EXCEEDED:
 435                 err = EHOSTUNREACH;
 436                 break;
 437         default:
 438                 goto out;
 439         }
 440
 441         switch (sk->sk_state) {
 442                 struct request_sock *req, **prev;
 443         case TCP_LISTEN:
 444                 if (sock_owned_by_user(sk))
 445                         goto out;
 446
 447                 req = inet_csk_search_req(sk, &prev, th->dest,
 448                                           iph->daddr, iph->saddr);
 449                 if (!req)
 450                         goto out;
 451
 452                 /* ICMPs are not backlogged, hence we cannot get
 453                    an established socket here.
 454                  */
 455                 WARN_ON(req->sk);
 456
 457                 if (seq != tcp_rsk(req)->snt_isn) {
 458                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 459                         goto out;
 460                 }
 461
 462                 /*
 463                  * Still in SYN_RECV, just remove it silently.
 464                  * There is no good way to pass the error to the newly
 465                  * created socket, and POSIX does not want network
 466                  * errors returned from accept().
 467                  */
 468                 inet_csk_reqsk_queue_drop(sk, req, prev);
 469                 goto out;
 470
 471         case TCP_SYN_SENT:
 472         case TCP_SYN_RECV:  /* Cannot happen.
 473                                It can f.e. if SYNs crossed.
 474                              */
 475                 if (!sock_owned_by_user(sk)) {
 476                         sk->sk_err = err;
 477
 478                         sk->sk_error_report(sk);
 479
 480                         tcp_done(sk);
 481                 } else {
 482                         sk->sk_err_soft = err;
 483                 }
 484                 goto out;
 485         }
 486
 487         /* If we've already connected we will keep trying
 488          * until we time out, or the user gives up.
 489          *
 490          * rfc1122 4.2.3.9 allows to consider as hard errors
 491          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 492          * but it is obsoleted by pmtu discovery).
 493          *
 494          * Note, that in modern internet, where routing is unreliable
 495          * and in each dark corner broken firewalls sit, sending random
 496          * errors ordered by their masters even this two messages finally lose
 497          * their original sense (even Linux sends invalid PORT_UNREACHs)
 498          *
 499          * Now we are in compliance with RFCs.
 500          *                                                      --ANK (980905)
 501          */
 502
 503         inet = inet_sk(sk);
 504         if (!sock_owned_by_user(sk) && inet->recverr) {
 505                 sk->sk_err = err;
 506                 sk->sk_error_report(sk);
 507         } else  { /* Only an error on timeout */
 508                 sk->sk_err_soft = err;
 509         }
 510
 511 out:
 512         bh_unlock_sock(sk);
 513         sock_put(sk);
 514 }
 515
 516 /* This routine computes an IPv4 TCP checksum. */
 517 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 518 {
 519         struct inet_sock *inet = inet_sk(sk);
 520         struct tcphdr *th = tcp_hdr(skb);
 521
 522         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 523                 th->check = ~tcp_v4_check(len, inet->saddr,
 524                                           inet->daddr, 0);
 525                 skb->csum_start = skb_transport_header(skb) - skb->head;
 526                 skb->csum_offset = offsetof(struct tcphdr, check);
 527         } else {
 528                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
 529                                          csum_partial(th,
 530                                                       th->doff << 2,
 531                                                       skb->csum));
 532         }
 533 }
 534
 535 int tcp_v4_gso_send_check(struct sk_buff *skb)
 536 {
 537         const struct iphdr *iph;
 538         struct tcphdr *th;
 539
 540         if (!pskb_may_pull(skb, sizeof(*th)))
 541                 return -EINVAL;
 542
 543         iph = ip_hdr(skb);
 544         th = tcp_hdr(skb);
 545
 546         th->check = 0;
 547         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
 548         skb->csum_start = skb_transport_header(skb) - skb->head;
 549         skb->csum_offset = offsetof(struct tcphdr, check);
 550         skb->ip_summed = CHECKSUM_PARTIAL;
 551         return 0;
 552 }
 553
 554 /*
 555  *      This routine will send an RST to the other tcp.
 556  *
 557  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 558  *                    for reset.
 559  *      Answer: if a packet caused RST, it is not for a socket
 560  *              existing in our system, if it is matched to a socket,
 561  *              it is just duplicate segment or bug in other side's TCP.
 562  *              So that we build reply only basing on parameters
 563  *              arrived with segment.
 564  *      Exception: precedence violation. We do not implement it in any case.
 565  */
 566
 567 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 568 {
 569         struct tcphdr *th = tcp_hdr(skb);
 570         struct {
 571                 struct tcphdr th;
 572 #ifdef CONFIG_TCP_MD5SIG
 573                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 574 #endif
 575         } rep;
 576         struct ip_reply_arg arg;
 577 #ifdef CONFIG_TCP_MD5SIG
 578         struct tcp_md5sig_key *key;
 579 #endif
 580         struct net *net;
 581
 582         /* Never send a reset in response to a reset. */
 583         if (th->rst)
 584                 return;
 585
 586         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 587                 return;
 588
 589         /* Swap the send and the receive. */
 590         memset(&rep, 0, sizeof(rep));
 591         rep.th.dest   = th->source;
 592         rep.th.source = th->dest;
 593         rep.th.doff   = sizeof(struct tcphdr) / 4;
 594         rep.th.rst    = 1;
 595
 596         if (th->ack) {
 597                 rep.th.seq = th->ack_seq;
 598         } else {
 599                 rep.th.ack = 1;
 600                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 601                                        skb->len - (th->doff << 2));
 602         }
 603
 604         memset(&arg, 0, sizeof(arg));
 605         arg.iov[0].iov_base = (unsigned char *)&rep;
 606         arg.iov[0].iov_len  = sizeof(rep.th);
 607
 608 #ifdef CONFIG_TCP_MD5SIG
 609         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 610         if (key) {
 611                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 612                                    (TCPOPT_NOP << 16) |
 613                                    (TCPOPT_MD5SIG << 8) |
 614                                    TCPOLEN_MD5SIG);
 615                 /* Update length and the length the header thinks exists */
 616                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 617                 rep.th.doff = arg.iov[0].iov_len / 4;
 618
 619                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 620                                      key, ip_hdr(skb)->saddr,
 621                                      ip_hdr(skb)->daddr, &rep.th);
 622         }
 623 #endif
 624         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 625                                       ip_hdr(skb)->saddr, /* XXX */
 626                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 627         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 628         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 629
 630         net = dev_net(skb_dst(skb)->dev);
 631         ip_send_reply(net->ipv4.tcp_sock, skb,
 632                       &arg, arg.iov[0].iov_len);
 633
 634         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 635         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 636 }
 637
 638 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 639    outside socket context is ugly, certainly. What can I do?
 640  */
 641
 642 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 643                             u32 win, u32 ts, int oif,
 644                             struct tcp_md5sig_key *key,
 645                             int reply_flags)
 646 {
 647         struct tcphdr *th = tcp_hdr(skb);
 648         struct {
 649                 struct tcphdr th;
 650                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 651 #ifdef CONFIG_TCP_MD5SIG
 652                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 653 #endif
 654                         ];
 655         } rep;
 656         struct ip_reply_arg arg;
 657         struct net *net = dev_net(skb_dst(skb)->dev);
 658
 659         memset(&rep.th, 0, sizeof(struct tcphdr));
 660         memset(&arg, 0, sizeof(arg));
 661
 662         arg.iov[0].iov_base = (unsigned char *)&rep;
 663         arg.iov[0].iov_len  = sizeof(rep.th);
 664         if (ts) {
 665                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 666                                    (TCPOPT_TIMESTAMP << 8) |
 667                                    TCPOLEN_TIMESTAMP);
 668                 rep.opt[1] = htonl(tcp_time_stamp);
 669                 rep.opt[2] = htonl(ts);
 670                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 671         }
 672
 673         /* Swap the send and the receive. */
 674         rep.th.dest    = th->source;
 675         rep.th.source  = th->dest;
 676         rep.th.doff    = arg.iov[0].iov_len / 4;
 677         rep.th.seq     = htonl(seq);
 678         rep.th.ack_seq = htonl(ack);
 679         rep.th.ack     = 1;
 680         rep.th.window  = htons(win);
 681
 682 #ifdef CONFIG_TCP_MD5SIG
 683         if (key) {
 684                 int offset = (ts) ? 3 : 0;
 685
 686                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 687                                           (TCPOPT_NOP << 16) |
 688                                           (TCPOPT_MD5SIG << 8) |
 689                                           TCPOLEN_MD5SIG);
 690                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 691                 rep.th.doff = arg.iov[0].iov_len/4;
 692
 693                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 694                                     key, ip_hdr(skb)->saddr,
 695                                     ip_hdr(skb)->daddr, &rep.th);
 696         }
 697 #endif
 698         arg.flags = reply_flags;
 699         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 700                                       ip_hdr(skb)->saddr, /* XXX */
 701                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 702         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 703         if (oif)
 704                 arg.bound_dev_if = oif;
 705
 706         ip_send_reply(net->ipv4.tcp_sock, skb,
 707                       &arg, arg.iov[0].iov_len);
 708
 709         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 710 }
 711
 712 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 713 {
 714         struct inet_timewait_sock *tw = inet_twsk(sk);
 715         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 716
 717         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 718                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 719                         tcptw->tw_ts_recent,
 720                         tw->tw_bound_dev_if,
 721                         tcp_twsk_md5_key(tcptw),
 722                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 723                         );
 724
 725         inet_twsk_put(tw);
 726 }
 727
 728 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 729                                   struct request_sock *req)
 730 {
 731         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 732                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 733                         req->ts_recent,
 734                         0,
 735                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 736                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 737 }
 738
 739 /*
 740  *      Send a SYN-ACK after having received a SYN.
 741  *      This still operates on a request_sock only, not on a big
 742  *      socket.
 743  */
 744 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 745                                 struct dst_entry *dst)
 746 {
 747         const struct inet_request_sock *ireq = inet_rsk(req);
 748         int err = -1;
 749         struct sk_buff * skb;
 750
 751         /* First, grab a route. */
 752         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 753                 return -1;
 754
 755         skb = tcp_make_synack(sk, dst, req);
 756
 757         if (skb) {
 758                 struct tcphdr *th = tcp_hdr(skb);
 759
 760                 th->check = tcp_v4_check(skb->len,
 761                                          ireq->loc_addr,
 762                                          ireq->rmt_addr,
 763                                          csum_partial(th, skb->len,
 764                                                       skb->csum));
 765
 766                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 767                                             ireq->rmt_addr,
 768                                             ireq->opt);
 769                 err = net_xmit_eval(err);
 770         }
 771
 772         dst_release(dst);
 773         return err;
 774 }
 775
 776 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
 777 {
 778         return __tcp_v4_send_synack(sk, req, NULL);
 779 }
 780
 781 /*
 782  *      IPv4 request_sock destructor.
 783  */
 784 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 785 {
 786         kfree(inet_rsk(req)->opt);
 787 }
 788
 789 #ifdef CONFIG_SYN_COOKIES
 790 static void syn_flood_warning(struct sk_buff *skb)
 791 {
 792         static unsigned long warntime;
 793
 794         if (time_after(jiffies, (warntime + HZ * 60))) {
 795                 warntime = jiffies;
 796                 printk(KERN_INFO
 797                        "possible SYN flooding on port %d. Sending cookies.\n",
 798                        ntohs(tcp_hdr(skb)->dest));
 799         }
 800 }
 801 #endif
 802
 803 /*
 804  * Save and compile IPv4 options into the request_sock if needed.
 805  */
 806 static struct ip_options *tcp_v4_save_options(struct sock *sk,
 807                                               struct sk_buff *skb)
 808 {
 809         struct ip_options *opt = &(IPCB(skb)->opt);
 810         struct ip_options *dopt = NULL;
 811
 812         if (opt && opt->optlen) {
 813                 int opt_size = optlength(opt);
 814                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 815                 if (dopt) {
 816                         if (ip_options_echo(dopt, skb)) {
 817                                 kfree(dopt);
 818                                 dopt = NULL;
 819                         }
 820                 }
 821         }
 822         return dopt;
 823 }
 824
 825 #ifdef CONFIG_TCP_MD5SIG
 826 /*
 827  * RFC2385 MD5 checksumming requires a mapping of
 828  * IP address->MD5 Key.
 829  * We need to maintain these in the sk structure.
 830  */
 831
 832 /* Find the Key structure for an address.  */
 833 static struct tcp_md5sig_key *
 834                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 835 {
 836         struct tcp_sock *tp = tcp_sk(sk);
 837         int i;
 838
 839         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 840                 return NULL;
 841         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 842                 if (tp->md5sig_info->keys4[i].addr == addr)
 843                         return &tp->md5sig_info->keys4[i].base;
 844         }
 845         return NULL;
 846 }
 847
 848 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 849                                          struct sock *addr_sk)
 850 {
 851         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
 852 }
 853
 854 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 855
 856 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 857                                                       struct request_sock *req)
 858 {
 859         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 860 }
 861
 862 /* This can be called on a newly created socket, from other files */
 863 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 864                       u8 *newkey, u8 newkeylen)
 865 {
 866         /* Add Key to the list */
 867         struct tcp_md5sig_key *key;
 868         struct tcp_sock *tp = tcp_sk(sk);
 869         struct tcp4_md5sig_key *keys;
 870
 871         key = tcp_v4_md5_do_lookup(sk, addr);
 872         if (key) {
 873                 /* Pre-existing entry - just update that one. */
 874                 kfree(key->key);
 875                 key->key = newkey;
 876                 key->keylen = newkeylen;
 877         } else {
 878                 struct tcp_md5sig_info *md5sig;
 879
 880                 if (!tp->md5sig_info) {
 881                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 882                                                   GFP_ATOMIC);
 883                         if (!tp->md5sig_info) {
 884                                 kfree(newkey);
 885                                 return -ENOMEM;
 886                         }
 887                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 888                 }
 889                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 890                         kfree(newkey);
 891                         return -ENOMEM;
 892                 }
 893                 md5sig = tp->md5sig_info;
 894
 895                 if (md5sig->alloced4 == md5sig->entries4) {
 896                         keys = kmalloc((sizeof(*keys) *
 897                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 898                         if (!keys) {
 899                                 kfree(newkey);
 900                                 tcp_free_md5sig_pool();
 901                                 return -ENOMEM;
 902                         }
 903
 904                         if (md5sig->entries4)
 905                                 memcpy(keys, md5sig->keys4,
 906                                        sizeof(*keys) * md5sig->entries4);
 907
 908                         /* Free old key list, and reference new one */
 909                         kfree(md5sig->keys4);
 910                         md5sig->keys4 = keys;
 911                         md5sig->alloced4++;
 912                 }
 913                 md5sig->entries4++;
 914                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 915                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 916                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 917         }
 918         return 0;
 919 }
 920
 921 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 922
 923 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 924                                u8 *newkey, u8 newkeylen)
 925 {
 926         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
 927                                  newkey, newkeylen);
 928 }
 929
 930 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 931 {
 932         struct tcp_sock *tp = tcp_sk(sk);
 933         int i;
 934
 935         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 936                 if (tp->md5sig_info->keys4[i].addr == addr) {
 937                         /* Free the key */
 938                         kfree(tp->md5sig_info->keys4[i].base.key);
 939                         tp->md5sig_info->entries4--;
 940
 941                         if (tp->md5sig_info->entries4 == 0) {
 942                                 kfree(tp->md5sig_info->keys4);
 943                                 tp->md5sig_info->keys4 = NULL;
 944                                 tp->md5sig_info->alloced4 = 0;
 945                         } else if (tp->md5sig_info->entries4 != i) {
 946                                 /* Need to do some manipulation */
 947                                 memmove(&tp->md5sig_info->keys4[i],
 948                                         &tp->md5sig_info->keys4[i+1],
 949                                         (tp->md5sig_info->entries4 - i) *
 950                                          sizeof(struct tcp4_md5sig_key));
 951                         }
 952                         tcp_free_md5sig_pool();
 953                         return 0;
 954                 }
 955         }
 956         return -ENOENT;
 957 }
 958
 959 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 960
 961 static void tcp_v4_clear_md5_list(struct sock *sk)
 962 {
 963         struct tcp_sock *tp = tcp_sk(sk);
 964
 965         /* Free each key, then the set of key keys,
 966          * the crypto element, and then decrement our
 967          * hold on the last resort crypto.
 968          */
 969         if (tp->md5sig_info->entries4) {
 970                 int i;
 971                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 972                         kfree(tp->md5sig_info->keys4[i].base.key);
 973                 tp->md5sig_info->entries4 = 0;
 974                 tcp_free_md5sig_pool();
 975         }
 976         if (tp->md5sig_info->keys4) {
 977                 kfree(tp->md5sig_info->keys4);
 978                 tp->md5sig_info->keys4 = NULL;
 979                 tp->md5sig_info->alloced4  = 0;
 980         }
 981 }
 982
 983 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 984                                  int optlen)
 985 {
 986         struct tcp_md5sig cmd;
 987         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 988         u8 *newkey;
 989
 990         if (optlen < sizeof(cmd))
 991                 return -EINVAL;
 992
 993         if (copy_from_user(&cmd, optval, sizeof(cmd)))
 994                 return -EFAULT;
 995
 996         if (sin->sin_family != AF_INET)
 997                 return -EINVAL;
 998
 999         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1000                 if (!tcp_sk(sk)->md5sig_info)
1001                         return -ENOENT;
1002                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1003         }
1004
1005         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1006                 return -EINVAL;
1007
1008         if (!tcp_sk(sk)->md5sig_info) {
1009                 struct tcp_sock *tp = tcp_sk(sk);
1010                 struct tcp_md5sig_info *p;
1011
1012                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1013                 if (!p)
1014                         return -EINVAL;
1015
1016                 tp->md5sig_info = p;
1017                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1018         }
1019
1020         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1021         if (!newkey)
1022                 return -ENOMEM;
1023         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1024                                  newkey, cmd.tcpm_keylen);
1025 }
1026
1027 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1028                                         __be32 daddr, __be32 saddr, int nbytes)
1029 {
1030         struct tcp4_pseudohdr *bp;
1031         struct scatterlist sg;
1032
1033         bp = &hp->md5_blk.ip4;
1034
1035         /*
1036          * 1. the TCP pseudo-header (in the order: source IP address,
1037          * destination IP address, zero-padded protocol number, and
1038          * segment length)
1039          */
1040         bp->saddr = saddr;
1041         bp->daddr = daddr;
1042         bp->pad = 0;
1043         bp->protocol = IPPROTO_TCP;
1044         bp->len = cpu_to_be16(nbytes);
1045
1046         sg_init_one(&sg, bp, sizeof(*bp));
1047         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1048 }
1049
1050 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1051                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1052 {
1053         struct tcp_md5sig_pool *hp;
1054         struct hash_desc *desc;
1055
1056         hp = tcp_get_md5sig_pool();
1057         if (!hp)
1058                 goto clear_hash_noput;
1059         desc = &hp->md5_desc;
1060
1061         if (crypto_hash_init(desc))
1062                 goto clear_hash;
1063         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1064                 goto clear_hash;
1065         if (tcp_md5_hash_header(hp, th))
1066                 goto clear_hash;
1067         if (tcp_md5_hash_key(hp, key))
1068                 goto clear_hash;
1069         if (crypto_hash_final(desc, md5_hash))
1070                 goto clear_hash;
1071
1072         tcp_put_md5sig_pool();
1073         return 0;
1074
1075 clear_hash:
1076         tcp_put_md5sig_pool();
1077 clear_hash_noput:
1078         memset(md5_hash, 0, 16);
1079         return 1;
1080 }
1081
1082 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1083                         struct sock *sk, struct request_sock *req,
1084                         struct sk_buff *skb)
1085 {
1086         struct tcp_md5sig_pool *hp;
1087         struct hash_desc *desc;
1088         struct tcphdr *th = tcp_hdr(skb);
1089         __be32 saddr, daddr;
1090
1091         if (sk) {
1092                 saddr = inet_sk(sk)->saddr;
1093                 daddr = inet_sk(sk)->daddr;
1094         } else if (req) {
1095                 saddr = inet_rsk(req)->loc_addr;
1096                 daddr = inet_rsk(req)->rmt_addr;
1097         } else {
1098                 const struct iphdr *iph = ip_hdr(skb);
1099                 saddr = iph->saddr;
1100                 daddr = iph->daddr;
1101         }
1102
1103         hp = tcp_get_md5sig_pool();
1104         if (!hp)
1105                 goto clear_hash_noput;
1106         desc = &hp->md5_desc;
1107
1108         if (crypto_hash_init(desc))
1109                 goto clear_hash;
1110
1111         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1112                 goto clear_hash;
1113         if (tcp_md5_hash_header(hp, th))
1114                 goto clear_hash;
1115         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1116                 goto clear_hash;
1117         if (tcp_md5_hash_key(hp, key))
1118                 goto clear_hash;
1119         if (crypto_hash_final(desc, md5_hash))
1120                 goto clear_hash;
1121
1122         tcp_put_md5sig_pool();
1123         return 0;
1124
1125 clear_hash:
1126         tcp_put_md5sig_pool();
1127 clear_hash_noput:
1128         memset(md5_hash, 0, 16);
1129         return 1;
1130 }
1131
1132 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1133
1134 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1135 {
1136         /*
1137          * This gets called for each TCP segment that arrives
1138          * so we want to be efficient.
1139          * We have 3 drop cases:
1140          * o No MD5 hash and one expected.
1141          * o MD5 hash and we're not expecting one.
1142          * o MD5 hash and its wrong.
1143          */
1144         __u8 *hash_location = NULL;
1145         struct tcp_md5sig_key *hash_expected;
1146         const struct iphdr *iph = ip_hdr(skb);
1147         struct tcphdr *th = tcp_hdr(skb);
1148         int genhash;
1149         unsigned char newhash[16];
1150
1151         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1152         hash_location = tcp_parse_md5sig_option(th);
1153
1154         /* We've parsed the options - do we have a hash? */
1155         if (!hash_expected && !hash_location)
1156                 return 0;
1157
1158         if (hash_expected && !hash_location) {
1159                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1160                 return 1;
1161         }
1162
1163         if (!hash_expected && hash_location) {
1164                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1165                 return 1;
1166         }
1167
1168         /* Okay, so this is hash_expected and hash_location -
1169          * so we need to calculate the checksum.
1170          */
1171         genhash = tcp_v4_md5_hash_skb(newhash,
1172                                       hash_expected,
1173                                       NULL, NULL, skb);
1174
1175         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1176                 if (net_ratelimit()) {
1177                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1178                                &iph->saddr, ntohs(th->source),
1179                                &iph->daddr, ntohs(th->dest),
1180                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1181                 }
1182                 return 1;
1183         }
1184         return 0;
1185 }
1186
1187 #endif
1188
1189 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1190         .family         =       PF_INET,
1191         .obj_size       =       sizeof(struct tcp_request_sock),
1192         .rtx_syn_ack    =       tcp_v4_send_synack,
1193         .send_ack       =       tcp_v4_reqsk_send_ack,
1194         .destructor     =       tcp_v4_reqsk_destructor,
1195         .send_reset     =       tcp_v4_send_reset,
1196 };
1197
1198 #ifdef CONFIG_TCP_MD5SIG
1199 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1200         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1201         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1202 };
1203 #endif
1204
1205 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1206         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1207         .twsk_unique    = tcp_twsk_unique,
1208         .twsk_destructor= tcp_twsk_destructor,
1209 };
1210
/*
 * Handle an incoming connection request (SYN) on listening socket @sk.
 *
 * Allocates a request_sock, parses the TCP options of @skb, saves the IP
 * options, chooses the initial sequence number (syncookie, value carried
 * in TCP_SKB_CB(skb)->when, or freshly generated), and sends a SYN-ACK.
 * On success the request is added to the listener's request queue; in
 * syncookie mode it is freed after the SYN-ACK is sent instead.
 *
 * Every path returns 0; a dropped request is not reported to the caller.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        /* Non-zero when the request hit a live timewait bucket (see below). */
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
        /*
         * With syncookies compiled out, want_cookie becomes the constant 0
         * so the cookie branches below drop out.
         */
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        /* Start from clean option state, seeded with the listener's user_mss. */
        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        /* In cookie mode, parsed options are kept only if a timestamp was seen. */
        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        /* Addresses are stored from our point of view: local = packet daddr. */
        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        if (!want_cookie)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
                req->cookie_ts = tmp_opt.tstamp_ok;
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                /*
                 * The condition below assigns dst and peer as it goes and
                 * stops at the first false term, so the "else if" branch
                 * may see either of them still NULL.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
                         * It means that we continue to communicate
                         * to destinations, already remembered
                         * to the moment of synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
                                       &saddr, ntohs(tcp_hdr(skb)->source));
                        goto drop_and_release;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        /*
         * __tcp_v4_send_synack() drops the dst reference itself.  In cookie
         * mode the request is freed even when the SYN-ACK went out.
         */
        if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
                goto drop_and_free;

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

drop_and_release:
        dst_release(dst);
drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}
1349
1350
1351 /*
1352  * The three way handshake has completed - we got a valid synack -
1353  * now create the new socket.
1354  */
1355 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1356                                   struct request_sock *req,
1357                                   struct dst_entry *dst)
1358 {
1359         struct inet_request_sock *ireq;
1360         struct inet_sock *newinet;
1361         struct tcp_sock *newtp;
1362         struct sock *newsk;
1363 #ifdef CONFIG_TCP_MD5SIG
1364         struct tcp_md5sig_key *key;
1365 #endif
1366
1367         if (sk_acceptq_is_full(sk))
1368                 goto exit_overflow;
1369
1370         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1371                 goto exit;
1372
1373         newsk = tcp_create_openreq_child(sk, req, skb);
1374         if (!newsk)
1375                 goto exit;
1376
1377         newsk->sk_gso_type = SKB_GSO_TCPV4;
1378         sk_setup_caps(newsk, dst);
1379
1380         newtp                 = tcp_sk(newsk);
1381         newinet               = inet_sk(newsk);
1382         ireq                  = inet_rsk(req);
1383         newinet->daddr        = ireq->rmt_addr;
1384         newinet->rcv_saddr    = ireq->loc_addr;
1385         newinet->saddr        = ireq->loc_addr;
1386         newinet->opt          = ireq->opt;
1387         ireq->opt             = NULL;
1388         newinet->mc_index     = inet_iif(skb);
1389         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1390         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1391         if (newinet->opt)
1392                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1393         newinet->id = newtp->write_seq ^ jiffies;
1394
1395         tcp_mtup_init(newsk);
1396         tcp_sync_mss(newsk, dst_mtu(dst));
1397         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1398         if (tcp_sk(sk)->rx_opt.user_mss &&
1399             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1400                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1401
1402         tcp_initialize_rcv_mss(newsk);
1403
1404 #ifdef CONFIG_TCP_MD5SIG
1405         /* Copy over the MD5 key from the original socket */
1406         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1407                 /*
1408                  * We're using one, so create a matching key
1409                  * on the newsk structure. If we fail to get
1410                  * memory, then we end up not copying the key
1411                  * across. Shucks.
1412                  */
1413                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1414                 if (newkey != NULL)
1415                         tcp_v4_md5_do_add(newsk, newinet->daddr,
1416                                           newkey, key->keylen);
1417                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1418         }
1419 #endif
1420
1421         __inet_hash_nolisten(newsk);
1422         __inet_inherit_port(sk, newsk);
1423
1424         return newsk;
1425
1426 exit_overflow:
1427         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1428 exit:
1429         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1430         dst_release(dst);
1431         return NULL;
1432 }
1433
1434 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1435 {
1436         struct tcphdr *th = tcp_hdr(skb);
1437         const struct iphdr *iph = ip_hdr(skb);
1438         struct sock *nsk;
1439         struct request_sock **prev;
1440         /* Find possible connection requests. */
1441         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1442                                                        iph->saddr, iph->daddr);
1443         if (req)
1444                 return tcp_check_req(sk, skb, req, prev);
1445
1446         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1447                         th->source, iph->daddr, th->dest, inet_iif(skb));
1448
1449         if (nsk) {
1450                 if (nsk->sk_state != TCP_TIME_WAIT) {
1451                         bh_lock_sock(nsk);
1452                         return nsk;
1453                 }
1454                 inet_twsk_put(inet_twsk(nsk));
1455                 return NULL;
1456         }
1457
1458 #ifdef CONFIG_SYN_COOKIES
1459         if (!th->rst && !th->syn && th->ack)
1460                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1461 #endif
1462         return sk;
1463 }
1464
1465 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1466 {
1467         const struct iphdr *iph = ip_hdr(skb);
1468
1469         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1470                 if (!tcp_v4_check(skb->len, iph->saddr,
1471                                   iph->daddr, skb->csum)) {
1472                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1473                         return 0;
1474                 }
1475         }
1476
1477         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1478                                        skb->len, IPPROTO_TCP, 0);
1479
1480         if (skb->len <= 76) {
1481                 return __skb_checksum_complete(skb);
1482         }
1483         return 0;
1484 }
1485
1486
1487 /* The socket must have it's spinlock held when we get
1488  * here.
1489  *
1490  * We have a potential double-lock case here, so even when
1491  * doing backlog processing we use the BH locking scheme.
1492  * This is because we cannot sleep with the original spinlock
1493  * held.
1494  */
1495 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1496 {
1497         struct sock *rsk;
1498 #ifdef CONFIG_TCP_MD5SIG
1499         /*
1500          * We really want to reject the packet as early as possible
1501          * if:
1502          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1503          *  o There is an MD5 option and we're not expecting one
1504          */
1505         if (tcp_v4_inbound_md5_hash(sk, skb))
1506                 goto discard;
1507 #endif
1508
1509         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1510                 TCP_CHECK_TIMER(sk);
1511                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1512                         rsk = sk;
1513                         goto reset;
1514                 }
1515                 TCP_CHECK_TIMER(sk);
1516                 return 0;
1517         }
1518
1519         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1520                 goto csum_err;
1521
1522         if (sk->sk_state == TCP_LISTEN) {
1523                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1524                 if (!nsk)
1525                         goto discard;
1526
1527                 if (nsk != sk) {
1528                         if (tcp_child_process(sk, nsk, skb)) {
1529                                 rsk = nsk;
1530                                 goto reset;
1531                         }
1532                         return 0;
1533                 }
1534         }
1535
1536         TCP_CHECK_TIMER(sk);
1537         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1538                 rsk = sk;
1539                 goto reset;
1540         }
1541         TCP_CHECK_TIMER(sk);
1542         return 0;
1543
1544 reset:
1545         tcp_v4_send_reset(rsk, skb);
1546 discard:
1547         kfree_skb(skb);
1548         /* Be careful here. If this function gets more complicated and
1549          * gcc suffers from register pressure on the x86, sk (in %ebx)
1550          * might be destroyed here. This current version compiles correctly,
1551          * but you have been warned.
1552          */
1553         return 0;
1554
1555 csum_err:
1556         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1557         goto discard;
1558 }
1559
1560 /*
1561  *      From tcp_input.c
1562  */
1563
/*
 * tcp_v4_rcv - entry point for one received IPv4 TCP segment.
 * @skb: buffer holding the segment.
 *
 * Checks the header length and checksum, fills in TCP_SKB_CB(skb), looks
 * the segment up in tcp_hashinfo and then hands it to tcp_v4_do_rcv(), the
 * prequeue or the socket backlog.  Segments that match a TIME_WAIT entry or
 * no socket at all are dealt with at the do_time_wait / no_tcp_socket
 * labels.
 *
 * Returns whatever tcp_v4_do_rcv() returned when it was called directly
 * from here, and 0 on every other path.  Every path that ends at discard_it
 * frees @skb.
 */
1564 int tcp_v4_rcv(struct sk_buff *skb)
1565 {
1566         const struct iphdr *iph;
1567         struct tcphdr *th;
1568         struct sock *sk;
1569         int ret;
1570         struct net *net = dev_net(skb->dev);
1571
             /* Drop anything whose pkt_type is not PACKET_HOST. */
1572         if (skb->pkt_type != PACKET_HOST)
1573                 goto discard_it;
1574
1575         /* Count it even if it's bad */
1576         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1577
             /* The fixed-size TCP header must be pullable before it is read. */
1578         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1579                 goto discard_it;
1580
1581         th = tcp_hdr(skb);
1582
             /* doff is in 32-bit words: it may not be shorter than the fixed
              * header, and the whole header it describes must be pullable. */
1583         if (th->doff < sizeof(struct tcphdr) / 4)
1584                 goto bad_packet;
1585         if (!pskb_may_pull(skb, th->doff * 4))
1586                 goto discard_it;
1587
1588         /* An explanation is required here, I think.
1589          * Packet length and doff are validated by header prediction,
1590          * provided case of th->doff==0 is eliminated.
1591          * So, we defer the checks. */
1592         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1593                 goto bad_packet;
1594
             /* NOTE(review): th is fetched a second time here, presumably
              * because the pulls above may have moved the header - confirm. */
1595         th = tcp_hdr(skb);
1596         iph = ip_hdr(skb);
             /* Fill in the per-skb TCP control block; end_seq also counts
              * SYN and FIN as one sequence number each. */
1597         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1598         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1599                                     skb->len - th->doff * 4);
1600         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1601         TCP_SKB_CB(skb)->when    = 0;
             /* The IP TOS byte is kept in the control block's flags field. */
1602         TCP_SKB_CB(skb)->flags   = iph->tos;
1603         TCP_SKB_CB(skb)->sacked  = 0;
1604
1605         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1606         if (!sk)
1607                 goto no_tcp_socket;
1608
             /* Also re-entered from do_time_wait once a listener has replaced
              * the TIME_WAIT entry in sk. */
1609 process:
1610         if (sk->sk_state == TCP_TIME_WAIT)
1611                 goto do_time_wait;
1612
1613         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1614                 goto discard_and_relse;
1615         nf_reset(skb);
1616
1617         if (sk_filter(sk, skb))
1618                 goto discard_and_relse;
1619
1620         skb->dev = NULL;
1621
             /* With the socket bh-locked: if no user context owns it, process
              * the segment now (directly, or via the prequeue when
              * tcp_prequeue() accepts it); otherwise defer it to the backlog. */
1622         bh_lock_sock_nested(sk);
1623         ret = 0;
1624         if (!sock_owned_by_user(sk)) {
1625 #ifdef CONFIG_NET_DMA
1626                 struct tcp_sock *tp = tcp_sk(sk);
1627                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1628                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1629                 if (tp->ucopy.dma_chan)
1630                         ret = tcp_v4_do_rcv(sk, skb);
1631                 else
1632 #endif
1633                 {
1634                         if (!tcp_prequeue(sk, skb))
1635                                 ret = tcp_v4_do_rcv(sk, skb);
1636                 }
1637         } else
1638                 sk_add_backlog(sk, skb);
1639         bh_unlock_sock(sk);
1640
             /* Release our reference on sk. */
1641         sock_put(sk);
1642
1643         return ret;
1644
             /* No matching socket (also reached for TCP_TW_RST): count a
              * truncated or badly checksummed segment as an error, answer
              * anything else with a reset. */
1645 no_tcp_socket:
1646         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1647                 goto discard_it;
1648
1649         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                     /* bad_packet is also the target of the header checks at
                      * the top of the function, hence a label inside the
                      * branch. */
1650 bad_packet:
1651                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1652         } else {
1653                 tcp_v4_send_reset(NULL, skb);
1654         }
1655
1656 discard_it:
1657         /* Discard frame. */
1658         kfree_skb(skb);
1659         return 0;
1660
1661 discard_and_relse:
1662         sock_put(sk);
1663         goto discard_it;
1664
             /* sk is a TIME_WAIT entry here, so it is released with
              * inet_twsk_put() rather than sock_put(). */
1665 do_time_wait:
1666         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1667                 inet_twsk_put(inet_twsk(sk));
1668                 goto discard_it;
1669         }
1670
1671         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1672                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1673                 inet_twsk_put(inet_twsk(sk));
1674                 goto discard_it;
1675         }
1676         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1677         case TCP_TW_SYN: {
                     /* If a listener exists for this destination, retire the
                      * TIME_WAIT entry and restart processing on the listener. */
1678                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1679                                                         &tcp_hashinfo,
1680                                                         iph->daddr, th->dest,
1681                                                         inet_iif(skb));
1682                 if (sk2) {
1683                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1684                         inet_twsk_put(inet_twsk(sk));
1685                         sk = sk2;
1686                         goto process;
1687                 }
1688                 /* Fall through to ACK */
1689         }
1690         case TCP_TW_ACK:
1691                 tcp_v4_timewait_ack(sk, skb);
1692                 break;
1693         case TCP_TW_RST:
1694                 goto no_tcp_socket;
1695         case TCP_TW_SUCCESS:;
1696         }
1697         goto discard_it;
1698 }
1699
1700 /* VJ's idea. Save last timestamp seen from this destination
1701  * and hold it at least for normal timewait interval to use for duplicate
1702  * segment detection in subsequent connections, before they enter synchronized
1703  * state.
1704  */
1705
1706 int tcp_v4_remember_stamp(struct sock *sk)
1707 {
1708         struct inet_sock *inet = inet_sk(sk);
1709         struct tcp_sock *tp = tcp_sk(sk);
1710         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1711         struct inet_peer *peer = NULL;
1712         int release_it = 0;
1713
1714         if (!rt || rt->rt_dst != inet->daddr) {
1715                 peer = inet_getpeer(inet->daddr, 1);
1716                 release_it = 1;
1717         } else {
1718                 if (!rt->peer)
1719                         rt_bind_peer(rt, 1);
1720                 peer = rt->peer;
1721         }
1722
1723         if (peer) {
1724                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1725                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1726                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1727                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1728                         peer->tcp_ts = tp->rx_opt.ts_recent;
1729                 }
1730                 if (release_it)
1731                         inet_putpeer(peer);
1732                 return 1;
1733         }
1734
1735         return 0;
1736 }
1737
1738 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1739 {
1740         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1741
1742         if (peer) {
1743                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1744
1745                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1746                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1747                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1748                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1749                         peer->tcp_ts       = tcptw->tw_ts_recent;
1750                 }
1751                 inet_putpeer(peer);
1752                 return 1;
1753         }
1754
1755         return 0;
1756 }
1757
/*
 * AF_INET flavour of the inet_connection_sock address-family operations.
 * tcp_v4_init_sock() installs this table as icsk->icsk_af_ops.  The two
 * compat_* hooks are only present when CONFIG_COMPAT is set.
 */
1758 const struct inet_connection_sock_af_ops ipv4_specific = {
1759         .queue_xmit        = ip_queue_xmit,
1760         .send_check        = tcp_v4_send_check,
1761         .rebuild_header    = inet_sk_rebuild_header,
1762         .conn_request      = tcp_v4_conn_request,
1763         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1764         .remember_stamp    = tcp_v4_remember_stamp,
1765         .net_header_len    = sizeof(struct iphdr),
1766         .setsockopt        = ip_setsockopt,
1767         .getsockopt        = ip_getsockopt,
1768         .addr2sockaddr     = inet_csk_addr2sockaddr,
1769         .sockaddr_len      = sizeof(struct sockaddr_in),
1770         .bind_conflict     = inet_csk_bind_conflict,
1771 #ifdef CONFIG_COMPAT
1772         .compat_setsockopt = compat_ip_setsockopt,
1773         .compat_getsockopt = compat_ip_getsockopt,
1774 #endif
1775 };
1776
1777 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 handlers for the TCP MD5 signature hooks.  tcp_v4_init_sock() stores
 * this table in tp->af_specific.
 */
1778 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1779         .md5_lookup             = tcp_v4_md5_lookup,
1780         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1781         .md5_add                = tcp_v4_md5_add_func,
1782         .md5_parse              = tcp_v4_parse_md5_keys,
1783 };
1784 #endif
1785
1786 /* NOTE: A lot of things set to zero explicitly by call to
1787  *       sk_alloc() so need not be done here.
1788  */
/*
 * tcp_v4_init_sock - set the TCP-specific defaults of a new IPv4 TCP socket.
 * @sk: the socket being initialised.
 *
 * Initialises the out-of-order queue, the transmit timers and the prequeue,
 * sets the starting RTO/cwnd/ssthresh/MSS values, installs the IPv4
 * operation tables, takes the default buffer sizes from the tcp_wmem /
 * tcp_rmem sysctls and accounts the socket in tcp_sockets_allocated.
 *
 * Always returns 0.
 */
1789 static int tcp_v4_init_sock(struct sock *sk)
1790 {
1791         struct inet_connection_sock *icsk = inet_csk(sk);
1792         struct tcp_sock *tp = tcp_sk(sk);
1793
1794         skb_queue_head_init(&tp->out_of_order_queue);
1795         tcp_init_xmit_timers(sk);
1796         tcp_prequeue_init(tp);
1797
1798         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1799         tp->mdev = TCP_TIMEOUT_INIT;
1800
1801         /* So many TCP implementations out there (incorrectly) count the
1802          * initial SYN frame in their delayed-ACK and congestion control
1803          * algorithms that we must have the following bandaid to talk
1804          * efficiently to them.  -DaveM
1805          */
1806         tp->snd_cwnd = 2;
1807
1808         /* See draft-stevens-tcpca-spec-01 for discussion of the
1809          * initialization of these values.
1810          */
1811         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1812         tp->snd_cwnd_clamp = ~0;
             /* NOTE(review): 536 looks like the classic default MSS (576-byte
              * datagram minus 40 bytes of headers) - confirm. */
1813         tp->mss_cache = 536;
1814
1815         tp->reordering = sysctl_tcp_reordering;
1816         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1817
1818         sk->sk_state = TCP_CLOSE;
1819
1820         sk->sk_write_space = sk_stream_write_space;
1821         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1822
             /* Hook up the IPv4-specific operation tables defined above. */
1823         icsk->icsk_af_ops = &ipv4_specific;
1824         icsk->icsk_sync_mss = tcp_sync_mss;
1825 #ifdef CONFIG_TCP_MD5SIG
1826         tp->af_specific = &tcp_sock_ipv4_specific;
1827 #endif
1828
             /* Index 1 of the sysctl arrays supplies the initial buffer sizes. */
1829         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1830         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1831
             /* NOTE(review): bottom halves are disabled around the counter
              * update, presumably because the counter is also updated from
              * softirq context - confirm. */
1832         local_bh_disable();
1833         percpu_counter_inc(&tcp_sockets_allocated);
1834         local_bh_enable();
1835
1836         return 0;
1837 }
1838
/*
 * tcp_v4_destroy_sock - release everything a TCP socket still holds.
 * @sk: the socket being destroyed.
 *
 * Stops the transmit timers, cleans up congestion control, purges the
 * write, out-of-order, prequeue (and, with CONFIG_NET_DMA, async wait)
 * queues, frees the MD5 key list when CONFIG_TCP_MD5SIG is set, releases
 * the bound port and the cached sendmsg page, and finally removes the
 * socket from the tcp_sockets_allocated count.
 */
1839 void tcp_v4_destroy_sock(struct sock *sk)
1840 {
1841         struct tcp_sock *tp = tcp_sk(sk);
1842
1843         tcp_clear_xmit_timers(sk);
1844
1845         tcp_cleanup_congestion_control(sk);
1846
1847         /* Clean up the write buffer. */
1848         tcp_write_queue_purge(sk);
1849
1850         /* Cleans up our, hopefully empty, out_of_order_queue. */
1851         __skb_queue_purge(&tp->out_of_order_queue);
1852
1853 #ifdef CONFIG_TCP_MD5SIG
1854         /* Clean up the MD5 key list, if any */
1855         if (tp->md5sig_info) {
1856                 tcp_v4_clear_md5_list(sk);
1857                 kfree(tp->md5sig_info);
1858                 tp->md5sig_info = NULL;
1859         }
1860 #endif
1861
1862 #ifdef CONFIG_NET_DMA
1863         /* Cleans up our sk_async_wait_queue */
1864         __skb_queue_purge(&sk->sk_async_wait_queue);
1865 #endif
1866
1867         /* Clean prequeue, it must be empty really */
1868         __skb_queue_purge(&tp->ucopy.prequeue);
1869
1870         /* Clean up a referenced TCP bind bucket. */
1871         if (inet_csk(sk)->icsk_bind_hash)
1872                 inet_put_port(sk);
1873
1874         /*
1875          * If sendmsg cached page exists, toss it.
1876          */
1877         if (sk->sk_sndmsg_page) {
1878                 __free_page(sk->sk_sndmsg_page);
1879                 sk->sk_sndmsg_page = NULL;
1880         }
1881
             /* Undo the increment done in tcp_v4_init_sock(). */
1882         percpu_counter_dec(&tcp_sockets_allocated);
1883 }
1884
1885 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1886
1887 #ifdef CONFIG_PROC_FS
1888 /* Proc filesystem TCP sock list dumping. */
1889
1890 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1891 {
1892         return hlist_nulls_empty(head) ? NULL :
1893                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1894 }
1895
1896 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1897 {
1898         return !is_a_nulls(tw->tw_node.next) ?
1899                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1900 }
1901
/*
 * listening_get_next - advance the proc iterator over the listening hash.
 * @seq: seq_file whose ->private is the struct tcp_iter_state cursor.
 * @cur: entry returned by the previous call, or NULL to start at bucket 0.
 *
 * Walks the listening sockets bucket by bucket.  When a socket has a
 * non-empty accept queue the cursor switches to TCP_SEQ_STATE_OPENREQ and
 * that socket's syn_table is walked before the socket list is resumed.
 *
 * Returns the next struct sock (state LISTENING) or struct request_sock
 * (state OPENREQ) matching st->family, or NULL once every bucket is done.
 *
 * Locking: when an entry is returned, the lock of bucket st->bucket is still
 * held, and in OPENREQ state the syn_wait_lock of st->syn_wait_sk is
 * read-held too; tcp_seq_stop() releases them.  No lock is held when NULL is
 * returned.
 */
1902 static void *listening_get_next(struct seq_file *seq, void *cur)
1903 {
1904         struct inet_connection_sock *icsk;
1905         struct hlist_nulls_node *node;
1906         struct sock *sk = cur;
1907         struct inet_listen_hashbucket *ilb;
1908         struct tcp_iter_state *st = seq->private;
1909         struct net *net = seq_file_net(seq);
1910
             /* First call: take the lock of bucket 0 and start at its head. */
1911         if (!sk) {
1912                 st->bucket = 0;
1913                 ilb = &tcp_hashinfo.listening_hash[0];
1914                 spin_lock_bh(&ilb->lock);
1915                 sk = sk_nulls_head(&ilb->head);
1916                 goto get_sk;
1917         }
1918         ilb = &tcp_hashinfo.listening_hash[st->bucket];
             /* st->num is the running entry number used by the show callback. */
1919         ++st->num;
1920
1921         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request: continue in the listener's syn_table. */
1922                 struct request_sock *req = cur;
1923
1924                 icsk = inet_csk(st->syn_wait_sk);
1925                 req = req->dl_next;
1926                 while (1) {
1927                         while (req) {
1928                                 if (req->rsk_ops->family == st->family) {
1929                                         cur = req;
1930                                         goto out;
1931                                 }
1932                                 req = req->dl_next;
1933                         }
1934                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1935                                 break;
                             /* Also entered from start_req below, with
                              * st->sbucket reset to 0. */
1936 get_req:
1937                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1938                 }
                     /* syn_table exhausted: resume the socket walk after the
                      * listener and drop its syn_wait_lock. */
1939                 sk        = sk_next(st->syn_wait_sk);
1940                 st->state = TCP_SEQ_STATE_LISTENING;
1941                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1942         } else {
                     /* cur is a socket that was already returned: visit its
                      * pending requests, if any, before moving on. */
1943                 icsk = inet_csk(sk);
1944                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1945                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1946                         goto start_req;
1947                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1948                 sk = sk_next(sk);
1949         }
1950 get_sk:
1951         sk_nulls_for_each_from(sk, node) {
1952                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1953                         cur = sk;
1954                         goto out;
1955                 }
1956                 icsk = inet_csk(sk);
1957                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1958                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
                             /* Reached with syn_wait_lock read-held, either
                              * from here or from the else branch above. */
1959 start_req:
1960                         st->uid         = sock_i_uid(sk);
1961                         st->syn_wait_sk = sk;
1962                         st->state       = TCP_SEQ_STATE_OPENREQ;
1963                         st->sbucket     = 0;
1964                         goto get_req;
1965                 }
1966                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1967         }
             /* Bucket finished: release it and move on to the next one. */
1968         spin_unlock_bh(&ilb->lock);
1969         if (++st->bucket < INET_LHTABLE_SIZE) {
1970                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1971                 spin_lock_bh(&ilb->lock);
1972                 sk = sk_nulls_head(&ilb->head);
1973                 goto get_sk;
1974         }
1975         cur = NULL;
1976 out:
1977         return cur;
1978 }
1979
1980 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1981 {
1982         void *rc = listening_get_next(seq, NULL);
1983
1984         while (rc && *pos) {
1985                 rc = listening_get_next(seq, rc);
1986                 --*pos;
1987         }
1988         return rc;
1989 }
1990
1991 static inline int empty_bucket(struct tcp_iter_state *st)
1992 {
1993         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1994                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1995 }
1996
/*
 * established_get_first - find the first entry of the established hash.
 * @seq: seq_file whose ->private is the struct tcp_iter_state cursor.
 *
 * Scans the ehash buckets from 0 upwards.  In each non-empty bucket the
 * regular chain is searched first and then the timewait chain, looking for
 * an entry of family st->family in this seq_file's network namespace.
 *
 * Returns a struct sock (st->state is left as the caller set it) or a
 * struct inet_timewait_sock (st->state == TCP_SEQ_STATE_TIME_WAIT), or NULL
 * when nothing matches.
 *
 * Locking: when an entry is returned the lock of bucket st->bucket is still
 * held; no lock is held when NULL is returned.
 */
1997 static void *established_get_first(struct seq_file *seq)
1998 {
1999         struct tcp_iter_state *st = seq->private;
2000         struct net *net = seq_file_net(seq);
2001         void *rc = NULL;
2002
2003         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2004                 struct sock *sk;
2005                 struct hlist_nulls_node *node;
2006                 struct inet_timewait_sock *tw;
2007                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2008
2009                 /* Lockless fast path for the common case of empty buckets */
2010                 if (empty_bucket(st))
2011                         continue;
2012
2013                 spin_lock_bh(lock);
2014                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2015                         if (sk->sk_family != st->family ||
2016                             !net_eq(sock_net(sk), net)) {
2017                                 continue;
2018                         }
2019                         rc = sk;
2020                         goto out;
2021                 }
                     /* Nothing on the regular chain: try the timewait chain of
                      * the same bucket, still under the same lock. */
2022                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2023                 inet_twsk_for_each(tw, node,
2024                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2025                         if (tw->tw_family != st->family ||
2026                             !net_eq(twsk_net(tw), net)) {
2027                                 continue;
2028                         }
2029                         rc = tw;
2030                         goto out;
2031                 }
                     /* No match in this bucket: unlock and reset the state
                      * before looking at the next one. */
2032                 spin_unlock_bh(lock);
2033                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2034         }
2035 out:
2036         return rc;
2037 }
2038
/*
 * established_get_next - advance the proc iterator over the established hash.
 * @seq: seq_file whose ->private is the struct tcp_iter_state cursor.
 * @cur: entry returned by the previous call; a struct inet_timewait_sock
 *       when st->state is TCP_SEQ_STATE_TIME_WAIT, a struct sock otherwise.
 *
 * Continues on the current chain; after the regular chain of a bucket comes
 * its timewait chain, and after that the next non-empty bucket.
 *
 * Returns the next matching entry, or NULL after the last bucket.
 *
 * Locking: entered with the lock of bucket st->bucket held.  That lock is
 * released when the bucket is left and the next bucket's lock is taken, so
 * a returned entry is always covered by the lock of st->bucket.  No lock is
 * held when NULL is returned.
 */
2039 static void *established_get_next(struct seq_file *seq, void *cur)
2040 {
2041         struct sock *sk = cur;
2042         struct inet_timewait_sock *tw;
2043         struct hlist_nulls_node *node;
2044         struct tcp_iter_state *st = seq->private;
2045         struct net *net = seq_file_net(seq);
2046
             /* st->num is the running entry number used by the show callback. */
2047         ++st->num;
2048
2049         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2050                 tw = cur;
2051                 tw = tw_next(tw);
                     /* Also entered from below with tw set to the head of the
                      * bucket's timewait chain. */
2052 get_tw:
2053                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2054                         tw = tw_next(tw);
2055                 }
2056                 if (tw) {
2057                         cur = tw;
2058                         goto out;
2059                 }
                     /* Timewait chain exhausted: this bucket is finished. */
2060                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2061                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2062
2063                 /* Look for next non empty bucket */
2064                 while (++st->bucket < tcp_hashinfo.ehash_size &&
2065                                 empty_bucket(st))
2066                         ;
2067                 if (st->bucket >= tcp_hashinfo.ehash_size)
2068                         return NULL;
2069
2070                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2071                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2072         } else
2073                 sk = sk_nulls_next(sk);
2074
2075         sk_nulls_for_each_from(sk, node) {
2076                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2077                         goto found;
2078         }
2079
             /* Regular chain exhausted: continue with the timewait chain. */
2080         st->state = TCP_SEQ_STATE_TIME_WAIT;
2081         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2082         goto get_tw;
2083 found:
2084         cur = sk;
2085 out:
2086         return cur;
2087 }
2088
2089 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2090 {
2091         void *rc = established_get_first(seq);
2092
2093         while (rc && pos) {
2094                 rc = established_get_next(seq, rc);
2095                 --pos;
2096         }
2097         return rc;
2098 }
2099
2100 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2101 {
2102         void *rc;
2103         struct tcp_iter_state *st = seq->private;
2104
2105         st->state = TCP_SEQ_STATE_LISTENING;
2106         rc        = listening_get_idx(seq, &pos);
2107
2108         if (!rc) {
2109                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2110                 rc        = established_get_idx(seq, pos);
2111         }
2112
2113         return rc;
2114 }
2115
2116 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2117 {
2118         struct tcp_iter_state *st = seq->private;
2119         st->state = TCP_SEQ_STATE_LISTENING;
2120         st->num = 0;
2121         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2122 }
2123
2124 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2125 {
2126         void *rc = NULL;
2127         struct tcp_iter_state *st;
2128
2129         if (v == SEQ_START_TOKEN) {
2130                 rc = tcp_get_idx(seq, 0);
2131                 goto out;
2132         }
2133         st = seq->private;
2134
2135         switch (st->state) {
2136         case TCP_SEQ_STATE_OPENREQ:
2137         case TCP_SEQ_STATE_LISTENING:
2138                 rc = listening_get_next(seq, v);
2139                 if (!rc) {
2140                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2141                         rc        = established_get_first(seq);
2142                 }
2143                 break;
2144         case TCP_SEQ_STATE_ESTABLISHED:
2145         case TCP_SEQ_STATE_TIME_WAIT:
2146                 rc = established_get_next(seq, v);
2147                 break;
2148         }
2149 out:
2150         ++*pos;
2151         return rc;
2152 }
2153
/*
 * tcp_seq_stop - seq_file ->stop callback: release the locks the iterator
 * still holds.
 * @seq: seq_file whose ->private is the struct tcp_iter_state cursor.
 * @v:   entry the walk stopped on (may be NULL or SEQ_START_TOKEN).
 *
 * Which lock is dropped is decided by st->state, mirroring what the
 * listening and established get_first/get_next helpers leave held when they
 * return an entry.
 */
2154 static void tcp_seq_stop(struct seq_file *seq, void *v)
2155 {
2156         struct tcp_iter_state *st = seq->private;
2157
2158         switch (st->state) {
2159         case TCP_SEQ_STATE_OPENREQ:
                     /* Stopped on a request: drop the listener's syn_wait_lock,
                      * then fall through to drop the bucket lock as well. */
2160                 if (v) {
2161                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2162                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2163                 }
2164         case TCP_SEQ_STATE_LISTENING:
                     /* The header token is returned without any lock taken. */
2165                 if (v != SEQ_START_TOKEN)
2166                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2167                 break;
2168         case TCP_SEQ_STATE_TIME_WAIT:
2169         case TCP_SEQ_STATE_ESTABLISHED:
                     /* A NULL entry means the walk ended with no lock held. */
2170                 if (v)
2171                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2172                 break;
2173         }
2174 }
2175
2176 static int tcp_seq_open(struct inode *inode, struct file *file)
2177 {
2178         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2179         struct tcp_iter_state *s;
2180         int err;
2181
2182         err = seq_open_net(inode, file, &afinfo->seq_ops,
2183                           sizeof(struct tcp_iter_state));
2184         if (err < 0)
2185                 return err;
2186
2187         s = ((struct seq_file *)file->private_data)->private;
2188         s->family               = afinfo->family;
2189         return 0;
2190 }
2191
2192 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2193 {
2194         int rc = 0;
2195         struct proc_dir_entry *p;
2196
2197         afinfo->seq_fops.open           = tcp_seq_open;
2198         afinfo->seq_fops.read           = seq_read;
2199         afinfo->seq_fops.llseek         = seq_lseek;
2200         afinfo->seq_fops.release        = seq_release_net;
2201
2202         afinfo->seq_ops.start           = tcp_seq_start;
2203         afinfo->seq_ops.next            = tcp_seq_next;
2204         afinfo->seq_ops.stop            = tcp_seq_stop;
2205
2206         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2207                              &afinfo->seq_fops, afinfo);
2208         if (!p)
2209                 rc = -ENOMEM;
2210         return rc;
2211 }
2212
/*
 * tcp_proc_unregister - remove the proc entry named afinfo->name from @net.
 * Counterpart of tcp_proc_register().
 */
2213 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2214 {
2215         proc_net_remove(net, afinfo->name);
2216 }
2217
/*
 * get_openreq4 - print one pending connection request as a proc line.
 * @sk:  the listening socket the request belongs to.
 * @req: the request being printed.
 * @f:   seq_file to print into.
 * @i:   entry number shown in the first column.
 * @uid: uid printed for the entry.
 * @len: receives, through the trailing %n, the number of characters
 *       produced by this format.
 *
 * The state column is always TCP_SYN_RECV; columns that do not apply to a
 * request are printed as constants, as noted beside each argument.
 */
2218 static void get_openreq4(struct sock *sk, struct request_sock *req,
2219                          struct seq_file *f, int i, int uid, int *len)
2220 {
2221         const struct inet_request_sock *ireq = inet_rsk(req);
             /* Jiffies left until the request expires. */
2222         int ttd = req->expires - jiffies;
2223
2224         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2225                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2226                 i,
2227                 ireq->loc_addr,
2228                 ntohs(inet_sk(sk)->sport),
2229                 ireq->rmt_addr,
2230                 ntohs(ireq->rmt_port),
2231                 TCP_SYN_RECV,
2232                 0, 0, /* could print option size, but that is af dependent. */
2233                 1,    /* timers active (only the expire timer) */
2234                 jiffies_to_clock_t(ttd),
2235                 req->retrans,
2236                 uid,
2237                 0,  /* non standard timer */
2238                 0, /* open_requests have no inode */
2239                 atomic_read(&sk->sk_refcnt),
2240                 req,
2241                 len);
2242 }
2243
/*
 * get_tcp4_sock - print one full TCP socket as a proc line.
 * @sk:  the socket being printed.
 * @f:   seq_file to print into.
 * @i:   entry number shown in the first column.
 * @len: receives, through the trailing %n, the number of characters
 *       produced by this format.
 *
 * The timer column is 1 for a pending retransmit timer, 4 for a pending
 * zero-window probe timer, 2 when sk->sk_timer is pending and 0 otherwise.
 */
2244 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2245 {
2246         int timer_active;
2247         unsigned long timer_expires;
2248         struct tcp_sock *tp = tcp_sk(sk);
2249         const struct inet_connection_sock *icsk = inet_csk(sk);
2250         struct inet_sock *inet = inet_sk(sk);
2251         __be32 dest = inet->daddr;
2252         __be32 src = inet->rcv_saddr;
2253         __u16 destp = ntohs(inet->dport);
2254         __u16 srcp = ntohs(inet->sport);
2255
             /* Pick the timer to report and the time it expires. */
2256         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2257                 timer_active    = 1;
2258                 timer_expires   = icsk->icsk_timeout;
2259         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2260                 timer_active    = 4;
2261                 timer_expires   = icsk->icsk_timeout;
2262         } else if (timer_pending(&sk->sk_timer)) {
2263                 timer_active    = 2;
2264                 timer_expires   = sk->sk_timer.expires;
2265         } else {
                     /* No timer: "now", so the remaining time prints as 0. */
2266                 timer_active    = 0;
2267                 timer_expires = jiffies;
2268         }
2269
             /* The rx_queue column shows the accept backlog for a listener
              * and rcv_nxt - copied_seq otherwise; the last column is -1
              * while tcp_in_initial_slowstart() holds, else snd_ssthresh. */
2270         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2271                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2272                 i, src, srcp, dest, destp, sk->sk_state,
2273                 tp->write_seq - tp->snd_una,
2274                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2275                                              (tp->rcv_nxt - tp->copied_seq),
2276                 timer_active,
2277                 jiffies_to_clock_t(timer_expires - jiffies),
2278                 icsk->icsk_retransmits,
2279                 sock_i_uid(sk),
2280                 icsk->icsk_probes_out,
2281                 sock_i_ino(sk),
2282                 atomic_read(&sk->sk_refcnt), sk,
2283                 jiffies_to_clock_t(icsk->icsk_rto),
2284                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2285                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2286                 tp->snd_cwnd,
2287                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2288                 len);
2289 }
2290
2291 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2292                                struct seq_file *f, int i, int *len)
2293 {
2294         __be32 dest, src;
2295         __u16 destp, srcp;
2296         int ttd = tw->tw_ttd - jiffies;
2297
2298         if (ttd < 0)
2299                 ttd = 0;
2300
2301         dest  = tw->tw_daddr;
2302         src   = tw->tw_rcv_saddr;
2303         destp = ntohs(tw->tw_dport);
2304         srcp  = ntohs(tw->tw_sport);
2305
2306         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2307                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2308                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2309                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2310                 atomic_read(&tw->tw_refcnt), tw, len);
2311 }
2312
2313 #define TMPSZ 150
2314
2315 static int tcp4_seq_show(struct seq_file *seq, void *v)
2316 {
2317         struct tcp_iter_state *st;
2318         int len;
2319
2320         if (v == SEQ_START_TOKEN) {
2321                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2322                            "  sl  local_address rem_address   st tx_queue "
2323                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2324                            "inode");
2325                 goto out;
2326         }
2327         st = seq->private;
2328
2329         switch (st->state) {
2330         case TCP_SEQ_STATE_LISTENING:
2331         case TCP_SEQ_STATE_ESTABLISHED:
2332                 get_tcp4_sock(v, seq, st->num, &len);
2333                 break;
2334         case TCP_SEQ_STATE_OPENREQ:
2335                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2336                 break;
2337         case TCP_SEQ_STATE_TIME_WAIT:
2338                 get_timewait4_sock(v, seq, st->num, &len);
2339                 break;
2340         }
2341         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2342 out:
2343         return 0;
2344 }
2345
/*
 * Description of the IPv4 TCP proc file: entry name "tcp", family AF_INET,
 * and tcp4_seq_show() as the ->show callback.  The remaining seq_fops and
 * seq_ops members are filled in by tcp_proc_register().
 */
2346 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2347         .name           = "tcp",
2348         .family         = AF_INET,
2349         .seq_fops       = {
2350                 .owner          = THIS_MODULE,
2351         },
2352         .seq_ops        = {
2353                 .show           = tcp4_seq_show,
2354         },
2355 };
2356
/*
 * Per-namespace init hook: registers the IPv4 TCP proc file in @net.
 * Returns the result of tcp_proc_register().
 */
2357 static int tcp4_proc_init_net(struct net *net)
2358 {
2359         return tcp_proc_register(net, &tcp4_seq_afinfo);
2360 }
2361
/* Per-namespace exit hook: removes the IPv4 TCP proc file from @net. */
2362 static void tcp4_proc_exit_net(struct net *net)
2363 {
2364         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2365 }
2366
/*
 * Per-namespace operations that create and remove the IPv4 TCP proc file;
 * registered by tcp4_proc_init() and unregistered by tcp4_proc_exit().
 */
2367 static struct pernet_operations tcp4_net_ops = {
2368         .init = tcp4_proc_init_net,
2369         .exit = tcp4_proc_exit_net,
2370 };
2371
/*
 * tcp4_proc_init - register the per-namespace proc operations for IPv4 TCP.
 * Returns the result of register_pernet_subsys().
 */
2372 int __init tcp4_proc_init(void)
2373 {
2374         return register_pernet_subsys(&tcp4_net_ops);
2375 }
2376
/* tcp4_proc_exit - undo tcp4_proc_init(). */
2377 void tcp4_proc_exit(void)
2378 {
2379         unregister_pernet_subsys(&tcp4_net_ops);
2380 }
2381 #endif /* CONFIG_PROC_FS */
2382
2383 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2384 {
2385         struct iphdr *iph = skb_gro_network_header(skb);
2386
2387         switch (skb->ip_summed) {
2388         case CHECKSUM_COMPLETE:
2389                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2390                                   skb->csum)) {
2391                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2392                         break;
2393                 }
2394
2395                 /* fall through */
2396         case CHECKSUM_NONE:
2397                 NAPI_GRO_CB(skb)->flush = 1;
2398                 return NULL;
2399         }
2400
2401         return tcp_gro_receive(head, skb);
2402 }
2403 EXPORT_SYMBOL(tcp4_gro_receive);
2404
/*
 * tcp4_gro_complete - IPv4 front end of tcp_gro_complete().
 * @skb: the packet being completed.
 *
 * Stores the complemented pseudo-header checksum (computed over the length
 * from the transport header to the end of @skb, with a zero base) in
 * th->check and sets the GSO type to SKB_GSO_TCPV4.
 *
 * Returns the result of tcp_gro_complete().
 */
2405 int tcp4_gro_complete(struct sk_buff *skb)
2406 {
2407         struct iphdr *iph = ip_hdr(skb);
2408         struct tcphdr *th = tcp_hdr(skb);
2409
2410         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2411                                   iph->saddr, iph->daddr, 0);
2412         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2413
2414         return tcp_gro_complete(skb);
2415 }
2416 EXPORT_SYMBOL(tcp4_gro_complete);
2417
/*
 * Protocol descriptor (struct proto) named "TCP".  It maps the socket
 * operations to their TCP implementations - tcp_v4_init_sock() and
 * tcp_v4_destroy_sock() from this file serve as ->init and ->destroy - and
 * points at the TCP-wide accounting variables, the tcp_mem/wmem/rmem
 * sysctls and tcp_hashinfo.
 */
2418 struct proto tcp_prot = {
2419         .name                   = "TCP",
2420         .owner                  = THIS_MODULE,
2421         .close                  = tcp_close,
2422         .connect                = tcp_v4_connect,
2423         .disconnect             = tcp_disconnect,
2424         .accept                 = inet_csk_accept,
2425         .ioctl                  = tcp_ioctl,
2426         .init                   = tcp_v4_init_sock,
2427         .destroy                = tcp_v4_destroy_sock,
2428         .shutdown               = tcp_shutdown,
2429         .setsockopt             = tcp_setsockopt,
2430         .getsockopt             = tcp_getsockopt,
2431         .recvmsg                = tcp_recvmsg,
2432         .backlog_rcv            = tcp_v4_do_rcv,
2433         .hash                   = inet_hash,
2434         .unhash                 = inet_unhash,
2435         .get_port               = inet_csk_get_port,
2436         .enter_memory_pressure  = tcp_enter_memory_pressure,
2437         .sockets_allocated      = &tcp_sockets_allocated,
2438         .orphan_count           = &tcp_orphan_count,
2439         .memory_allocated       = &tcp_memory_allocated,
2440         .memory_pressure        = &tcp_memory_pressure,
2441         .sysctl_mem             = sysctl_tcp_mem,
2442         .sysctl_wmem            = sysctl_tcp_wmem,
2443         .sysctl_rmem            = sysctl_tcp_rmem,
2444         .max_header             = MAX_TCP_HEADER,
2445         .obj_size               = sizeof(struct tcp_sock),
2446         .slab_flags             = SLAB_DESTROY_BY_RCU,
2447         .twsk_prot              = &tcp_timewait_sock_ops,
2448         .rsk_prot               = &tcp_request_sock_ops,
2449         .h.hashinfo             = &tcp_hashinfo,
2450 #ifdef CONFIG_COMPAT
2451         .compat_setsockopt      = compat_tcp_setsockopt,
2452         .compat_getsockopt      = compat_tcp_getsockopt,
2453 #endif
2454 };
2455
2456
/*
 * Per-namespace init: creates the namespace's TCP control socket
 * (net->ipv4.tcp_sock, a PF_INET/SOCK_RAW/IPPROTO_TCP socket).
 * Returns the result of inet_ctl_sock_create().
 */
2457 static int __net_init tcp_sk_init(struct net *net)
2458 {
2459         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2460                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2461 }
2462
/*
 * Per-namespace exit: destroys the namespace's TCP control socket and calls
 * inet_twsk_purge() for this namespace's AF_INET entries in tcp_hashinfo.
 */
2463 static void __net_exit tcp_sk_exit(struct net *net)
2464 {
2465         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2466         inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2467 }
2468
/*
 * Per-namespace operations managing the TCP control socket; registered by
 * tcp_v4_init().
 */
2469 static struct pernet_operations __net_initdata tcp_sk_ops = {
2470        .init = tcp_sk_init,
2471        .exit = tcp_sk_exit,
2472 };
2473
/*
 * tcp_v4_init - boot-time initialisation of IPv4 TCP.
 *
 * Initialises tcp_hashinfo and registers tcp_sk_ops; a registration failure
 * is fatal and ends in panic().
 */
2474 void __init tcp_v4_init(void)
2475 {
2476         inet_hashinfo_init(&tcp_hashinfo);
2477         if (register_pernet_subsys(&tcp_sk_ops))
2478                 panic("Failed to create the TCP control socket.\n");
2479 }
2480
/*
 * Symbols from this file made available to other kernel code.  The proc
 * helpers are exported only when CONFIG_PROC_FS is set, matching the
 * conditional around their definitions.
 */
2481 EXPORT_SYMBOL(ipv4_specific);
2482 EXPORT_SYMBOL(tcp_hashinfo);
2483 EXPORT_SYMBOL(tcp_prot);
2484 EXPORT_SYMBOL(tcp_v4_conn_request);
2485 EXPORT_SYMBOL(tcp_v4_connect);
2486 EXPORT_SYMBOL(tcp_v4_do_rcv);
2487 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2488 EXPORT_SYMBOL(tcp_v4_send_check);
2489 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2490
2491 #ifdef CONFIG_PROC_FS
2492 EXPORT_SYMBOL(tcp_proc_register);
2493 EXPORT_SYMBOL(tcp_proc_unregister);
2494 #endif
2495 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2496