net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
   49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/busy_poll.h>
  77
  78 #include <linux/inet.h>
  79 #include <linux/ipv6.h>
  80 #include <linux/stddef.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/seq_file.h>
  83
  84 #include <crypto/hash.h>
  85 #include <linux/scatterlist.h>
  86
   87 int sysctl_tcp_low_latency __read_mostly;
      /* NOTE(review): sysctl_tcp_low_latency is not referenced in the part of
       * the file reviewed here; presumably it backs a sysctl knob - confirm.
       */
   88
   89 #ifdef CONFIG_TCP_MD5SIG
      /* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() call
       * this to fill in the MD5 signature option of the reply headers they
       * build by hand.
       */
   90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
   91                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
   92 #endif
   93
      /* Socket lookup state handed to __inet_lookup_established() and
       * __inet_lookup_listener() by the code below.
       */
   94 struct inet_hashinfo tcp_hashinfo;
   95 EXPORT_SYMBOL(tcp_hashinfo);
  96
   97 static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
   98 {
              /* Compute an initial sequence number from the addresses and
               * ports of the segment in @skb.  They are passed destination
               * first, i.e. from the point of view of the receiver of @skb.
               * @tsoff is handed through to secure_tcp_sequence_number()
               * untouched (presumably an output for a timestamp offset -
               * confirm against that helper).
               */
   99         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
  100                                           ip_hdr(skb)->saddr,
  101                                           tcp_hdr(skb)->dest,
  102                                           tcp_hdr(skb)->source, tsoff);
  103 }
 104
  105 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
  106 {
  107         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
  108         struct tcp_sock *tp = tcp_sk(sk);
  109
              /* Decide whether @sk may take over from the TIME-WAIT socket
               * @sktw.
               *
               * Returns 1 when reuse is allowed.  In that case @sk's write_seq
               * and timestamp state have been seeded from the TIME-WAIT bucket
               * and a reference on @sktw has been taken with sock_hold().
               * Returns 0, leaving @sk untouched, otherwise.
               *
               * The bucket must have a recorded timestamp.  A NULL @twp then
               * accepts it unconditionally; with a non-NULL @twp the
               * tcp_tw_reuse sysctl must be set and the recorded stamp must be
               * more than one second old.
               */
  110         /* With PAWS, it is safe from the viewpoint
  111            of data integrity. Even without PAWS it is safe provided sequence
  112            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
  113
  114            Actually, the idea is close to VJ's one, only timestamp cache is
  115            held not per host, but per port pair and TW bucket is used as state
  116            holder.
  117
  118            If TW bucket has been already destroyed we fall back to VJ's scheme
  119            and use initial timestamp retrieved from peer table.
  120          */
  121         if (tcptw->tw_ts_recent_stamp &&
  122             (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
  123                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                      /* Start the new sequence space beyond what the old
                       * connection last sent (the 65535 + 2 gap is presumably
                       * one maximal unscaled window plus slack - confirm).
                       */
  124                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                      /* tcp_v4_connect() treats write_seq == 0 as "no sequence
                       * number chosen yet", so never hand it a zero here.
                       */
  125                 if (tp->write_seq == 0)
  126                         tp->write_seq = 1;
  127                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
  128                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
  129                 sock_hold(sktw);
  130                 return 1;
  131         }
  132
  133         return 0;
  134 }
  135 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 136
  137 /* This will initiate an outgoing connection.
       *
       * @sk:       socket to connect; the lockdep_sock_is_held() annotation below
       *            expects the caller to hold the socket lock
       * @uaddr:    destination, interpreted as a struct sockaddr_in
       * @addr_len: length of @uaddr, must be at least sizeof(struct sockaddr_in)
       *
       * Looks up a route to the destination, fills in the socket's source and
       * destination identity, moves the socket to TCP_SYN_SENT, binds/hashes it
       * with inet_hash_connect(), picks an initial sequence number if none is
       * set yet and finally calls tcp_connect().
       *
       * Returns 0 on success or a negative errno:
       *   -EINVAL        @addr_len too short, or a source-route option is set
       *                  and the destination address is zero
       *   -EAFNOSUPPORT  address family is not AF_INET
       *   -ENETUNREACH   the route is multicast or broadcast
       *   otherwise      whatever ip_route_connect(), inet_hash_connect(),
       *                  ip_route_newports() or tcp_connect() reported
       *
       * On the "failure" path the socket is put back into TCP_CLOSE, the route
       * reference (if still owned here) is dropped and the destination port is
       * cleared.
       */
  138 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
  139 {
  140         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
  141         struct inet_sock *inet = inet_sk(sk);
  142         struct tcp_sock *tp = tcp_sk(sk);
  143         __be16 orig_sport, orig_dport;
  144         __be32 daddr, nexthop;
  145         struct flowi4 *fl4;
  146         struct rtable *rt;
  147         int err;
  148         struct ip_options_rcu *inet_opt;
  149         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
  150
  151         if (addr_len < sizeof(struct sockaddr_in))
  152                 return -EINVAL;
  153
  154         if (usin->sin_family != AF_INET)
  155                 return -EAFNOSUPPORT;
  156
  157         nexthop = daddr = usin->sin_addr.s_addr;
  158         inet_opt = rcu_dereference_protected(inet->inet_opt,
  159                                              lockdep_sock_is_held(sk));
              /* With a source-route option (opt.srr) the route lookup targets
               * opt.faddr instead of the final destination.
               */
  160         if (inet_opt && inet_opt->opt.srr) {
  161                 if (!daddr)
  162                         return -EINVAL;
  163                 nexthop = inet_opt->opt.faddr;
  164         }
  165
              /* Remember the ports used for this first lookup; they are handed
               * to ip_route_newports() once the final ports are known.
               */
  166         orig_sport = inet->inet_sport;
  167         orig_dport = usin->sin_port;
  168         fl4 = &inet->cork.fl.u.ip4;
  169         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
  170                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
  171                               IPPROTO_TCP,
  172                               orig_sport, orig_dport, sk);
  173         if (IS_ERR(rt)) {
  174                 err = PTR_ERR(rt);
  175                 if (err == -ENETUNREACH)
  176                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
  177                 return err;
  178         }
  179
  180         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
  181                 ip_rt_put(rt);
  182                 return -ENETUNREACH;
  183         }
  184
              /* Without source routing, adopt the destination the route lookup
               * left in the flow.
               */
  185         if (!inet_opt || !inet_opt->opt.srr)
  186                 daddr = fl4->daddr;
  187
              /* An unbound socket takes the source address the lookup chose. */
  188         if (!inet->inet_saddr)
  189                 inet->inet_saddr = fl4->saddr;
  190         sk_rcv_saddr_set(sk, inet->inet_saddr);
  191
  192         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
  193                 /* Reset inherited state */
  194                 tp->rx_opt.ts_recent       = 0;
  195                 tp->rx_opt.ts_recent_stamp = 0;
  196                 if (likely(!tp->repair))
  197                         tp->write_seq      = 0;
  198         }
  199
  200         if (tcp_death_row->sysctl_tw_recycle &&
  201             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
  202                 tcp_fetch_timewait_stamp(sk, &rt->dst);
  203
  204         inet->inet_dport = usin->sin_port;
  205         sk_daddr_set(sk, daddr);
  206
              /* Extra header length is the IP options length, if any. */
  207         inet_csk(sk)->icsk_ext_hdr_len = 0;
  208         if (inet_opt)
  209                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
  210
  211         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
  212
  213         /* Socket identity is still unknown (sport may be zero).
  214          * However we set state to SYN-SENT and not releasing socket
  215          * lock select source port, enter ourselves into the hash tables and
  216          * complete initialization after this.
  217          */
  218         tcp_set_state(sk, TCP_SYN_SENT);
  219         err = inet_hash_connect(tcp_death_row, sk);
  220         if (err)
  221                 goto failure;
  222
  223         sk_set_txhash(sk);
  224
              /* Redo the route lookup now that the final ports are known. */
  225         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
  226                                inet->inet_sport, inet->inet_dport, sk);
  227         if (IS_ERR(rt)) {
  228                 err = PTR_ERR(rt);
                      /* Nothing left to release; the failure path passes this
                       * NULL to ip_rt_put().
                       */
  229                 rt = NULL;
  230                 goto failure;
  231         }
  232         /* OK, now commit destination to socket.  */
  233         sk->sk_gso_type = SKB_GSO_TCPV4;
  234         sk_setup_caps(sk, &rt->dst);
  235
              /* Keep a write_seq that was already set (see tcp_twsk_unique()) or
               * that belongs to a socket in repair mode; otherwise pick one now.
               */
  236         if (!tp->write_seq && likely(!tp->repair))
  237                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
  238                                                            inet->inet_daddr,
  239                                                            inet->inet_sport,
  240                                                            usin->sin_port,
  241                                                            &tp->tsoffset);
  242
  243         inet->inet_id = tp->write_seq ^ jiffies;
  244
  245         err = tcp_connect(sk);
  246
              /* Do not drop the route on the failure path from here on
               * (presumably sk_setup_caps() above attached it to the socket -
               * confirm).
               */
  247         rt = NULL;
  248         if (err)
  249                 goto failure;
  250
  251         return 0;
  252
  253 failure:
  254         /*
  255          * This unhashes the socket and releases the local port,
  256          * if necessary.
  257          */
  258         tcp_set_state(sk, TCP_CLOSE);
  259         ip_rt_put(rt);
  260         sk->sk_route_caps = 0;
  261         inet->inet_dport = 0;
  262         return err;
  263 }
  264 EXPORT_SYMBOL(tcp_v4_connect);
 265
  266 /*
  267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  268  * It can be called through tcp_release_cb() if socket was owned by user
  269  * at the time tcp_v4_err() was called to handle ICMP message.
       *
       * The reported MTU is read from tcp_sk(sk)->mtu_info, which tcp_v4_err()
       * stores before calling or deferring this routine.  Nothing is done when
       * inet_csk_update_pmtu() returns no route.
  270  */
  271 void tcp_v4_mtu_reduced(struct sock *sk)
  272 {
  273         struct dst_entry *dst;
  274         struct inet_sock *inet = inet_sk(sk);
  275         u32 mtu = tcp_sk(sk)->mtu_info;
  276
  277         dst = inet_csk_update_pmtu(sk, mtu);
  278         if (!dst)
  279                 return;
  280
  281         /* Something is about to be wrong... Remember soft error
  282          * for the case, if this connection will not be able to recover.
  283          */
  284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
  285                 sk->sk_err_soft = EMSGSIZE;
  286
              /* From here on work with the route's MTU, not the reported one. */
  287         mtu = dst_mtu(dst);
  288
              /* Only shrink: act when PMTU discovery is enabled and accepted for
               * this socket and the MSS was computed for a larger MTU.
               */
  289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
  290             ip_sk_accept_pmtu(sk) &&
  291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
  292                 tcp_sync_mss(sk, mtu);
  293
  294                 /* Resend the TCP packet because it's
  295                  * clear that the old packet has been
  296                  * dropped. This is the new "fast" path mtu
  297                  * discovery.
  298                  */
  299                 tcp_simple_retransmit(sk);
  300         } /* else let the usual retransmit timer handle it */
  301 }
  302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 303
  304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
  305 {
              /* Hand the ICMP redirect in @skb to the redirect handler of the
               * route that __sk_dst_check() returns for @sk.  Nothing happens
               * when the socket has no such route.
               */
  306         struct dst_entry *dst = __sk_dst_check(sk, 0);
  307
  308         if (dst)
  309                 dst->ops->redirect(dst, sk, skb);
  310 }
 311
 312
  313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets
       *
       * @sk:    the request socket (converted with inet_reqsk())
       * @seq:   sequence number taken from the TCP header quoted in the ICMP
       *         message
       * @abort: true when the error should remove the pending request
       *
       * An ICMP whose sequence number is not the ISN we sent for this request
       * is only counted as out-of-window.  In every case the reference on the
       * request is dropped with reqsk_put(); tcp_v4_err() relies on that and
       * returns right after calling this function.
       */
  314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
  315 {
  316         struct request_sock *req = inet_reqsk(sk);
  317         struct net *net = sock_net(sk);
  318
  319         /* ICMPs are not backlogged, hence we cannot get
  320          * an established socket here.
  321          */
  322         if (seq != tcp_rsk(req)->snt_isn) {
  323                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
  324         } else if (abort) {
  325                 /*
  326                  * Still in SYN_RECV, just remove it silently.
  327                  * There is no good way to pass the error to the newly
  328                  * created socket, and POSIX does not want network
  329                  * errors returned from accept().
  330                  */
  331                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
  332                 tcp_listendrop(req->rsk_listener);
  333         }
  334         reqsk_put(req);
  335 }
  336 EXPORT_SYMBOL(tcp_req_err);
 337
  338 /*
  339  * This routine is called by the ICMP module when it gets some
  340  * sort of error condition.  If err < 0 then the socket should
  341  * be closed and the error returned to the user.  If err > 0
  342  * it's just the icmp type << 8 | icmp code.  After adjustment
  343  * header points to the first 8 bytes of the tcp header.  We need
  344  * to find the appropriate port.
  345  *
  346  * The locking strategy used here is very "optimistic". When
  347  * someone else accesses the socket the ICMP is just dropped
  348  * and for some paths there is no check at all.
  349  * A more general error queue to queue errors for later handling
  350  * is probably better.
  351  *
       * @icmp_skb: the ICMP message; icmp_skb->data is read as the IP header,
       *            followed by the TCP header, of the segment the error is about
       * @info:     extra ICMP data; used as the new MTU for ICMP_FRAG_NEEDED
       *
       * The socket found by the lookup is released on every exit path: with
       * inet_twsk_put() for TIME_WAIT, inside tcp_req_err() for request
       * sockets, and with sock_put() at the "out" label otherwise.
  352  */
  353
  354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  355 {
  356         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
              /* The TCP header starts right after the IP header (ihl is in
               * 32-bit words).
               */
  357         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
  358         struct inet_connection_sock *icsk;
  359         struct tcp_sock *tp;
  360         struct inet_sock *inet;
  361         const int type = icmp_hdr(icmp_skb)->type;
  362         const int code = icmp_hdr(icmp_skb)->code;
  363         struct sock *sk;
  364         struct sk_buff *skb;
  365         struct request_sock *fastopen;
  366         __u32 seq, snd_una;
  367         __u32 remaining;
  368         int err;
  369         struct net *net = dev_net(icmp_skb->dev);
  370
  371         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
  372                                        th->dest, iph->saddr, ntohs(th->source),
  373                                        inet_iif(icmp_skb));
  374         if (!sk) {
  375                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
  376                 return;
  377         }
  378         if (sk->sk_state == TCP_TIME_WAIT) {
  379                 inet_twsk_put(inet_twsk(sk));
  380                 return;
  381         }
  382         seq = ntohl(th->seq);
              /* Request sockets are handled separately; the last argument is
               * the "abort" flag, i.e. which ICMP types/codes remove the request.
               */
  383         if (sk->sk_state == TCP_NEW_SYN_RECV)
  384                 return tcp_req_err(sk, seq,
  385                                   type == ICMP_PARAMETERPROB ||
  386                                   type == ICMP_TIME_EXCEEDED ||
  387                                   (type == ICMP_DEST_UNREACH &&
  388                                    (code == ICMP_NET_UNREACH ||
  389                                     code == ICMP_HOST_UNREACH)));
  390
  391         bh_lock_sock(sk);
  392         /* If too many ICMPs get dropped on busy
  393          * servers this needs to be solved differently.
  394          * We do take care of PMTU discovery (RFC1191) special case :
  395          * we can receive locally generated ICMP messages while socket is held.
               *
               * Note that this block only updates a counter; processing goes on
               * and the individual paths below test sock_owned_by_user() again
               * where they need to.
  396          */
  397         if (sock_owned_by_user(sk)) {
  398                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
  399                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
  400         }
  401         if (sk->sk_state == TCP_CLOSE)
  402                 goto out;
  403
  404         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
  405                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
  406                 goto out;
  407         }
  408
  409         icsk = inet_csk(sk);
  410         tp = tcp_sk(sk);
  411         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
  412         fastopen = tp->fastopen_rsk;
  413         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
              /* Except for listeners, the quoted sequence number must lie
               * between the first unacknowledged byte and snd_nxt.
               */
  414         if (sk->sk_state != TCP_LISTEN &&
  415             !between(seq, snd_una, tp->snd_nxt)) {
  416                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
  417                 goto out;
  418         }
  419
              /* First switch: handle the types that need no error reporting and
               * translate the rest into an errno in "err".
               */
  420         switch (type) {
  421         case ICMP_REDIRECT:
  422                 do_redirect(icmp_skb, sk);
  423                 goto out;
  424         case ICMP_SOURCE_QUENCH:
  425                 /* Just silently ignore these. */
  426                 goto out;
  427         case ICMP_PARAMETERPROB:
  428                 err = EPROTO;
  429                 break;
  430         case ICMP_DEST_UNREACH:
  431                 if (code > NR_ICMP_UNREACH)
  432                         goto out;
  433
  434                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
  435                         /* We are not interested in TCP_LISTEN and open_requests
  436                          * (SYN-ACKs sent out by Linux are always <576bytes so
  437                          * they should go through unfragmented).
  438                          */
  439                         if (sk->sk_state == TCP_LISTEN)
  440                                 goto out;
  441
  442                         tp->mtu_info = info;
  443                         if (!sock_owned_by_user(sk)) {
  444                                 tcp_v4_mtu_reduced(sk);
  445                         } else {
                                      /* Socket is busy: flag the work as deferred.
                                       * A reference is taken only when the flag
                                       * was not already set (presumably dropped
                                       * by whoever runs the deferred work -
                                       * confirm against tcp_release_cb()).
                                       */
  446                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
  447                                         sock_hold(sk);
  448                         }
  449                         goto out;
  450                 }
  451
  452                 err = icmp_err_convert[code].errno;
  453                 /* check if icmp_skb allows revert of backoff
  454                  * (see draft-zimmermann-tcp-lcd) */
  455                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
  456                         break;
  457                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
  458                     !icsk->icsk_backoff || fastopen)
  459                         break;
  460
  461                 if (sock_owned_by_user(sk))
  462                         break;
  463
                      /* Take back one backoff step and recompute the RTO from
                       * the smoothed RTT (or the initial timeout if there is no
                       * RTT sample yet), then re-apply the remaining backoff.
                       */
  464                 icsk->icsk_backoff--;
  465                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
  466                                                TCP_TIMEOUT_INIT;
  467                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
  468
  469                 skb = tcp_write_queue_head(sk);
  470                 BUG_ON(!skb);
  471
                      /* Time left on the new RTO, measured from the timestamp
                       * of the head of the write queue; clamped so it cannot
                       * go below zero.
                       */
  472                 remaining = icsk->icsk_rto -
  473                             min(icsk->icsk_rto,
  474                                 tcp_time_stamp - tcp_skb_timestamp(skb));
  475
  476                 if (remaining) {
  477                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
  478                                                   remaining, TCP_RTO_MAX);
  479                 } else {
  480                         /* RTO revert clocked out retransmission.
  481                          * Will retransmit now */
  482                         tcp_retransmit_timer(sk);
  483                 }
  484
  485                 break;
  486         case ICMP_TIME_EXCEEDED:
  487                 err = EHOSTUNREACH;
  488                 break;
  489         default:
  490                 goto out;
  491         }
  492
              /* Second switch: while the connection is still being set up the
               * error is fatal (or recorded as a soft error if the socket is
               * owned by the user).
               */
  493         switch (sk->sk_state) {
  494         case TCP_SYN_SENT:
  495         case TCP_SYN_RECV:
  496                 /* Only in fast or simultaneous open. If a fast open socket
  497                  * is already accepted it is treated as a connected one below.
  498                  */
  499                 if (fastopen && !fastopen->sk)
  500                         break;
  501
  502                 if (!sock_owned_by_user(sk)) {
  503                         sk->sk_err = err;
  504
  505                         sk->sk_error_report(sk);
  506
  507                         tcp_done(sk);
  508                 } else {
  509                         sk->sk_err_soft = err;
  510                 }
  511                 goto out;
  512         }
  513
  514         /* If we've already connected we will keep trying
  515          * until we time out, or the user gives up.
  516          *
  517          * rfc1122 4.2.3.9 allows to consider as hard errors
  518          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
  519          * but it is obsoleted by pmtu discovery).
  520          *
  521          * Note, that in modern internet, where routing is unreliable
  522          * and in each dark corner broken firewalls sit, sending random
  523          * errors ordered by their masters even this two messages finally lose
  524          * their original sense (even Linux sends invalid PORT_UNREACHs)
  525          *
  526          * Now we are in compliance with RFCs.
  527          *                                                      --ANK (980905)
  528          */
  529
  530         inet = inet_sk(sk);
  531         if (!sock_owned_by_user(sk) && inet->recverr) {
  532                 sk->sk_err = err;
  533                 sk->sk_error_report(sk);
  534         } else  { /* Only an error on timeout */
  535                 sk->sk_err_soft = err;
  536         }
  537
  538 out:
  539         bh_unlock_sock(sk);
  540         sock_put(sk);
  541 }
 542
  543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
  544 {
              /* Fill in the checksum field of the TCP header in @skb for the
               * address pair @saddr/@daddr.
               */
  545         struct tcphdr *th = tcp_hdr(skb);
  546
  547         if (skb->ip_summed == CHECKSUM_PARTIAL) {
                      /* Store only the inverted pseudo-header sum and record
                       * where the checksum starts and where its result goes
                       * (presumably so it can be completed later, e.g. by the
                       * device - confirm).
                       */
  548                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
  549                 skb->csum_start = skb_transport_header(skb) - skb->head;
  550                 skb->csum_offset = offsetof(struct tcphdr, check);
  551         } else {
                      /* Full checksum: sum the TCP header (doff is in 32-bit
                       * words) on top of skb->csum, which presumably already
                       * covers the payload - confirm.
                       */
  552                 th->check = tcp_v4_check(skb->len, saddr, daddr,
  553                                          csum_partial(th,
  554                                                       th->doff << 2,
  555                                                       skb->csum));
  556         }
  557 }
 558
  559 /* This routine computes an IPv4 TCP checksum.
       *
       * It is a thin wrapper around __tcp_v4_send_check() that takes the source
       * and destination addresses from @sk (inet_saddr / inet_daddr) and writes
       * the checksum into the TCP header of @skb.
       */
  560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
  561 {
  562         const struct inet_sock *inet = inet_sk(sk);
  563
  564         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
  565 }
  566 EXPORT_SYMBOL(tcp_v4_send_check);
 567
  568 /*
  569  *      This routine will send an RST to the other tcp.
  570  *
  571  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
  572  *                    for reset.
  573  *      Answer: if a packet caused RST, it is not for a socket
  574  *              existing in our system, if it is matched to a socket,
  575  *              it is just duplicate segment or bug in other side's TCP.
  576  *              So that we build reply only basing on parameters
  577  *              arrived with segment.
  578  *      Exception: precedence violation. We do not implement it in any case.
       *
       *      @sk may be NULL (no socket matched @skb) and may be something
       *      other than a full socket; the code below checks both before
       *      using it.  @skb is the segment being answered.
  579  */
  580
  581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
  582 {
  583         const struct tcphdr *th = tcp_hdr(skb);
              /* Reply buffer: a bare TCP header, plus room for an MD5 signature
               * option when MD5 support is compiled in.
               */
  584         struct {
  585                 struct tcphdr th;
  586 #ifdef CONFIG_TCP_MD5SIG
  587                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
  588 #endif
  589         } rep;
  590         struct ip_reply_arg arg;
  591 #ifdef CONFIG_TCP_MD5SIG
  592         struct tcp_md5sig_key *key = NULL;
  593         const __u8 *hash_location = NULL;
  594         unsigned char newhash[16];
  595         int genhash;
  596         struct sock *sk1 = NULL;
  597 #endif
  598         struct net *net;
  599
  600         /* Never send a reset in response to a reset. */
  601         if (th->rst)
  602                 return;
  603
  604         /* If sk not NULL, it means we did a successful lookup and incoming
  605          * route had to be correct. prequeue might have dropped our dst.
  606          */
  607         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
  608                 return;
  609
  610         /* Swap the send and the receive. */
  611         memset(&rep, 0, sizeof(rep));
  612         rep.th.dest   = th->source;
  613         rep.th.source = th->dest;
  614         rep.th.doff   = sizeof(struct tcphdr) / 4;
  615         rep.th.rst    = 1;
  616
              /* If the offending segment carried an ACK, the RST takes its
               * sequence number from that ACK.  Otherwise the RST itself carries
               * an ACK covering everything the segment occupied in sequence
               * space: SYN and FIN flags plus the payload length.
               */
  617         if (th->ack) {
  618                 rep.th.seq = th->ack_seq;
  619         } else {
  620                 rep.th.ack = 1;
  621                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
  622                                        skb->len - (th->doff << 2));
  623         }
  624
  625         memset(&arg, 0, sizeof(arg));
  626         arg.iov[0].iov_base = (unsigned char *)&rep;
              /* Only the bare header for now; grown below if an MD5 option is
               * appended.
               */
  627         arg.iov[0].iov_len  = sizeof(rep.th);
  628
  629         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
  630 #ifdef CONFIG_TCP_MD5SIG
  631         rcu_read_lock();
  632         hash_location = tcp_parse_md5sig_option(th);
  633         if (sk && sk_fullsock(sk)) {
  634                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
  635                                         &ip_hdr(skb)->saddr, AF_INET);
  636         } else if (hash_location) {
  637                 /*
  638                  * active side is lost. Try to find listening socket through
  639                  * source port, and then find md5 key through listening socket.
  640                  * We do not lose security here:
  641                  * Incoming packet is checked with md5 hash with finding key,
  642                  * no RST generated if md5 hash doesn't match.
                       *
                       * NOTE(review): the local port handed to the listener
                       * lookup is ntohs(th->source), not the segment's
                       * destination port.  That matches the wording above, but
                       * confirm it is intended.
  643                  */
  644                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
  645                                              ip_hdr(skb)->saddr,
  646                                              th->source, ip_hdr(skb)->daddr,
  647                                              ntohs(th->source), inet_iif(skb));
  648                 /* don't send rst if it can't find key */
  649                 if (!sk1)
  650                         goto out;
  651
  652                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
  653                                         &ip_hdr(skb)->saddr, AF_INET);
  654                 if (!key)
  655                         goto out;
  656
  657
                      /* Verify the signature of the incoming segment before
                       * answering it.
                       */
  658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
  659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
  660                         goto out;
  661
  662         }
  663
  664         if (key) {
  665                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
  666                                    (TCPOPT_NOP << 16) |
  667                                    (TCPOPT_MD5SIG << 8) |
  668                                    TCPOLEN_MD5SIG);
  669                 /* Update length and the length the header thinks exists */
  670                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
  671                 rep.th.doff = arg.iov[0].iov_len / 4;
  672
  673                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
  674                                      key, ip_hdr(skb)->saddr,
  675                                      ip_hdr(skb)->daddr, &rep.th);
  676         }
  677 #endif
              /* Pseudo-header sum for the reply: addresses swapped relative to
               * the incoming packet.
               */
  678         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
  679                                       ip_hdr(skb)->saddr, /* XXX */
  680                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
              /* csumoffset is expressed in 16-bit units, hence the division. */
  681         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
  682         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
  683
  684         /* When socket is gone, all binding information is lost.
  685          * routing might fail in this case. No choice here, if we choose to force
  686          * input interface, we will misroute in case of asymmetric route.
  687          */
  688         if (sk)
  689                 arg.bound_dev_if = sk->sk_bound_dev_if;
  690
              /* sk is not necessarily a full socket here, so the read above is
               * only valid if a timewait socket keeps its bound device at the
               * same offset; enforce that at build time.
               */
  691         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
  692                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
  693
  694         arg.tos = ip_hdr(skb)->tos;
  695         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
              /* The reply goes out through this CPU's entry of net->ipv4.tcp_sk;
               * bottom halves stay disabled while it is used and while the
               * counters are bumped.
               */
  696         local_bh_disable();
  697         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
  698                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
  699                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
  700                               &arg, arg.iov[0].iov_len);
  701
  702         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
  703         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
  704         local_bh_enable();
  705
  706 #ifdef CONFIG_TCP_MD5SIG
  707 out:
  708         rcu_read_unlock();
  709 #endif
  710 }
 711
  712 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
  713    outside socket context is ugly, certainly. What can I do?
  714  */
  715
      /* Build and send a bare ACK in reply to @skb.
       *
       * @sk:          socket the reply is sent on behalf of; its network
       *               namespace is used, and it is passed to sock_net_uid()
       *               only if it is a full socket
       * @skb:         segment being answered; ports and addresses are swapped
       * @seq, @ack:   sequence and acknowledgment numbers (host byte order)
       * @win:         window value to advertise (host byte order)
       * @tsval:       timestamp value for the timestamp option
       * @tsecr:       timestamp echo; the timestamp option is added only when
       *               this is non-zero
       * @oif:         if non-zero, used as arg.bound_dev_if for the reply
       * @key:         if non-NULL and CONFIG_TCP_MD5SIG is set, an MD5
       *               signature option is appended
       * @reply_flags: stored in arg.flags
       * @tos:         stored in arg.tos
       */
  716 static void tcp_v4_send_ack(const struct sock *sk,
  717                             struct sk_buff *skb, u32 seq, u32 ack,
  718                             u32 win, u32 tsval, u32 tsecr, int oif,
  719                             struct tcp_md5sig_key *key,
  720                             int reply_flags, u8 tos)
  721 {
  722         const struct tcphdr *th = tcp_hdr(skb);
              /* Reply buffer: TCP header plus room for a timestamp option and,
               * when compiled in, an MD5 signature option.
               */
  723         struct {
  724                 struct tcphdr th;
  725                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
  726 #ifdef CONFIG_TCP_MD5SIG
  727                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
  728 #endif
  729                         ];
  730         } rep;
  731         struct net *net = sock_net(sk);
  732         struct ip_reply_arg arg;
  733
              /* Only the header is zeroed; option words are written explicitly
               * below as iov_len grows to include them (the MD5 digest words are
               * expected to be written by tcp_v4_md5_hash_hdr() - confirm).
               */
  734         memset(&rep.th, 0, sizeof(struct tcphdr));
  735         memset(&arg, 0, sizeof(arg));
  736
  737         arg.iov[0].iov_base = (unsigned char *)&rep;
  738         arg.iov[0].iov_len  = sizeof(rep.th);
  739         if (tsecr) {
  740                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
  741                                    (TCPOPT_TIMESTAMP << 8) |
  742                                    TCPOLEN_TIMESTAMP);
  743                 rep.opt[1] = htonl(tsval);
  744                 rep.opt[2] = htonl(tsecr);
  745                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
  746         }
  747
  748         /* Swap the send and the receive. */
  749         rep.th.dest    = th->source;
  750         rep.th.source  = th->dest;
  751         rep.th.doff    = arg.iov[0].iov_len / 4;
  752         rep.th.seq     = htonl(seq);
  753         rep.th.ack_seq = htonl(ack);
  754         rep.th.ack     = 1;
  755         rep.th.window  = htons(win);
  756
  757 #ifdef CONFIG_TCP_MD5SIG
  758         if (key) {
                      /* The MD5 option goes after the three timestamp words
                       * when those are present.
                       */
  759                 int offset = (tsecr) ? 3 : 0;
  760
  761                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
  762                                           (TCPOPT_NOP << 16) |
  763                                           (TCPOPT_MD5SIG << 8) |
  764                                           TCPOLEN_MD5SIG);
  765                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
  766                 rep.th.doff = arg.iov[0].iov_len/4;
  767
  768                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
  769                                     key, ip_hdr(skb)->saddr,
  770                                     ip_hdr(skb)->daddr, &rep.th);
  771         }
  772 #endif
  773         arg.flags = reply_flags;
              /* Pseudo-header sum for the reply: addresses swapped relative to
               * the incoming packet.
               */
  774         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
  775                                       ip_hdr(skb)->saddr, /* XXX */
  776                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
              /* csumoffset is expressed in 16-bit units, hence the division. */
  777         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
  778         if (oif)
  779                 arg.bound_dev_if = oif;
  780         arg.tos = tos;
  781         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
              /* The reply goes out through this CPU's entry of net->ipv4.tcp_sk;
               * bottom halves stay disabled while it is used and while the
               * counter is bumped.
               */
  782         local_bh_disable();
  783         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
  784                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
  785                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
  786                               &arg, arg.iov[0].iov_len);
  787
  788         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
  789         local_bh_enable();
  790 }
 791
 792 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 793 {
 794         struct inet_timewait_sock *tw = inet_twsk(sk);
 795         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 796
 797         tcp_v4_send_ack(sk, skb,
 798                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 799                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 800                         tcp_time_stamp + tcptw->tw_ts_offset,
 801                         tcptw->tw_ts_recent,
 802                         tw->tw_bound_dev_if,
 803                         tcp_twsk_md5_key(tcptw),
 804                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 805                         tw->tw_tos
 806                         );
 807
 808         inet_twsk_put(tw);
 809 }
 810
 811 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 812                                   struct request_sock *req)
 813 {
 814         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 815          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 816          */
 817         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 818                                              tcp_sk(sk)->snd_nxt;
 819
 820         /* RFC 7323 2.3
 821          * The window field (SEG.WND) of every outgoing segment, with the
 822          * exception of <SYN> segments, MUST be right-shifted by
 823          * Rcv.Wind.Shift bits:
 824          */
 825         tcp_v4_send_ack(sk, skb, seq,
 826                         tcp_rsk(req)->rcv_nxt,
 827                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 828                         tcp_time_stamp + tcp_rsk(req)->ts_off,
 829                         req->ts_recent,
 830                         0,
 831                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 832                                           AF_INET),
 833                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 834                         ip_hdr(skb)->tos);
 835 }
 836
 837 /*
 838  *      Send a SYN-ACK after having received a SYN.
 839  *      This still operates on a request_sock only, not on a big
 840  *      socket.
 841  */
 842 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 843                               struct flowi *fl,
 844                               struct request_sock *req,
 845                               struct tcp_fastopen_cookie *foc,
 846                               enum tcp_synack_type synack_type)
 847 {
 848         const struct inet_request_sock *ireq = inet_rsk(req);
 849         struct flowi4 fl4;
 850         int err = -1;
 851         struct sk_buff *skb;
 852
 853         /* First, grab a route. */
 854         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 855                 return -1;
 856
 857         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 858
 859         if (skb) {
 860                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 861
 862                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 863                                             ireq->ir_rmt_addr,
 864                                             ireq->opt);
 865                 err = net_xmit_eval(err);
 866         }
 867
 868         return err;
 869 }
 870
 871 /*
 872  *      IPv4 request_sock destructor.
 873  */
 874 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 875 {
 876         kfree(inet_rsk(req)->opt);
 877 }
 878
 879 #ifdef CONFIG_TCP_MD5SIG
 880 /*
 881  * RFC2385 MD5 checksumming requires a mapping of
 882  * IP address->MD5 Key.
 883  * We need to maintain these in the sk structure.
 884  */
 885
 886 /* Find the Key structure for an address.  */
 887 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 888                                          const union tcp_md5_addr *addr,
 889                                          int family)
 890 {
 891         const struct tcp_sock *tp = tcp_sk(sk);
 892         struct tcp_md5sig_key *key;
 893         unsigned int size = sizeof(struct in_addr);
 894         const struct tcp_md5sig_info *md5sig;
 895
 896         /* caller either holds rcu_read_lock() or socket lock */
 897         md5sig = rcu_dereference_check(tp->md5sig_info,
 898                                        lockdep_sock_is_held(sk));
 899         if (!md5sig)
 900                 return NULL;
 901 #if IS_ENABLED(CONFIG_IPV6)
 902         if (family == AF_INET6)
 903                 size = sizeof(struct in6_addr);
 904 #endif
 905         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 906                 if (key->family != family)
 907                         continue;
 908                 if (!memcmp(&key->addr, addr, size))
 909                         return key;
 910         }
 911         return NULL;
 912 }
 913 EXPORT_SYMBOL(tcp_md5_do_lookup);
 914
 915 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 916                                          const struct sock *addr_sk)
 917 {
 918         const union tcp_md5_addr *addr;
 919
 920         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 921         return tcp_md5_do_lookup(sk, addr, AF_INET);
 922 }
 923 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 924
 925 /* This can be called on a newly created socket, from other files */
 926 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 927                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 928 {
 929         /* Add Key to the list */
 930         struct tcp_md5sig_key *key;
 931         struct tcp_sock *tp = tcp_sk(sk);
 932         struct tcp_md5sig_info *md5sig;
 933
 934         key = tcp_md5_do_lookup(sk, addr, family);
 935         if (key) {
 936                 /* Pre-existing entry - just update that one. */
 937                 memcpy(key->key, newkey, newkeylen);
 938                 key->keylen = newkeylen;
 939                 return 0;
 940         }
 941
 942         md5sig = rcu_dereference_protected(tp->md5sig_info,
 943                                            lockdep_sock_is_held(sk));
 944         if (!md5sig) {
 945                 md5sig = kmalloc(sizeof(*md5sig), gfp);
 946                 if (!md5sig)
 947                         return -ENOMEM;
 948
 949                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 950                 INIT_HLIST_HEAD(&md5sig->head);
 951                 rcu_assign_pointer(tp->md5sig_info, md5sig);
 952         }
 953
 954         key = sock_kmalloc(sk, sizeof(*key), gfp);
 955         if (!key)
 956                 return -ENOMEM;
 957         if (!tcp_alloc_md5sig_pool()) {
 958                 sock_kfree_s(sk, key, sizeof(*key));
 959                 return -ENOMEM;
 960         }
 961
 962         memcpy(key->key, newkey, newkeylen);
 963         key->keylen = newkeylen;
 964         key->family = family;
 965         memcpy(&key->addr, addr,
 966                (family == AF_INET6) ? sizeof(struct in6_addr) :
 967                                       sizeof(struct in_addr));
 968         hlist_add_head_rcu(&key->node, &md5sig->head);
 969         return 0;
 970 }
 971 EXPORT_SYMBOL(tcp_md5_do_add);
 972
 973 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 974 {
 975         struct tcp_md5sig_key *key;
 976
 977         key = tcp_md5_do_lookup(sk, addr, family);
 978         if (!key)
 979                 return -ENOENT;
 980         hlist_del_rcu(&key->node);
 981         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 982         kfree_rcu(key, rcu);
 983         return 0;
 984 }
 985 EXPORT_SYMBOL(tcp_md5_do_del);
 986
 987 static void tcp_clear_md5_list(struct sock *sk)
 988 {
 989         struct tcp_sock *tp = tcp_sk(sk);
 990         struct tcp_md5sig_key *key;
 991         struct hlist_node *n;
 992         struct tcp_md5sig_info *md5sig;
 993
 994         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
 995
 996         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
 997                 hlist_del_rcu(&key->node);
 998                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 999                 kfree_rcu(key, rcu);
1000         }
1001 }
1002
1003 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1004                                  int optlen)
1005 {
1006         struct tcp_md5sig cmd;
1007         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1008
1009         if (optlen < sizeof(cmd))
1010                 return -EINVAL;
1011
1012         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1013                 return -EFAULT;
1014
1015         if (sin->sin_family != AF_INET)
1016                 return -EINVAL;
1017
1018         if (!cmd.tcpm_keylen)
1019                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1020                                       AF_INET);
1021
1022         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1023                 return -EINVAL;
1024
1025         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1026                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1027                               GFP_KERNEL);
1028 }
1029
1030 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1031                                    __be32 daddr, __be32 saddr,
1032                                    const struct tcphdr *th, int nbytes)
1033 {
1034         struct tcp4_pseudohdr *bp;
1035         struct scatterlist sg;
1036         struct tcphdr *_th;
1037
1038         bp = hp->scratch;
1039         bp->saddr = saddr;
1040         bp->daddr = daddr;
1041         bp->pad = 0;
1042         bp->protocol = IPPROTO_TCP;
1043         bp->len = cpu_to_be16(nbytes);
1044
1045         _th = (struct tcphdr *)(bp + 1);
1046         memcpy(_th, th, sizeof(*th));
1047         _th->check = 0;
1048
1049         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1050         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1051                                 sizeof(*bp) + sizeof(*th));
1052         return crypto_ahash_update(hp->md5_req);
1053 }
1054
1055 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1056                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1057 {
             /* Hash the IPv4 pseudo-header, the TCP header @th and then @key,
              * writing the digest to @md5_hash. No payload is hashed; the
              * length fed to the pseudo-header is the header length from
              * th->doff.
              *
              * Returns 0 on success. On any failure the first 16 bytes of
              * @md5_hash are zeroed and 1 is returned.
              */
1058         struct tcp_md5sig_pool *hp;
1059         struct ahash_request *req;
1060
1061         hp = tcp_get_md5sig_pool();
1062         if (!hp)
1063                 goto clear_hash_noput;
1064         req = hp->md5_req;
1065
1066         if (crypto_ahash_init(req))
1067                 goto clear_hash;
1068         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1069                 goto clear_hash;
1070         if (tcp_md5_hash_key(hp, key))
1071                 goto clear_hash;
1072         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1073         if (crypto_ahash_final(req))
1074                 goto clear_hash;
1075
1076         tcp_put_md5sig_pool();
1077         return 0;
1078
     /* Failure after the pool was obtained: give it back first */
1079 clear_hash:
1080         tcp_put_md5sig_pool();
     /* Failure before the pool was obtained: nothing to give back */
1081 clear_hash_noput:
1082         memset(md5_hash, 0, 16);
1083         return 1;
1084 }
1085
1086 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1087                         const struct sock *sk,
1088                         const struct sk_buff *skb)
1089 {
             /* Hash the IPv4 pseudo-header, the TCP header of @skb, the data
              * that follows the TCP header and then @key, writing the digest
              * to @md5_hash.
              *
              * Addresses come from @sk when it is non-NULL, otherwise from the
              * IP header of @skb.
              *
              * Returns 0 on success. On any failure the first 16 bytes of
              * @md5_hash are zeroed and 1 is returned.
              */
1090         struct tcp_md5sig_pool *hp;
1091         struct ahash_request *req;
1092         const struct tcphdr *th = tcp_hdr(skb);
1093         __be32 saddr, daddr;
1094
1095         if (sk) { /* valid for establish/request sockets */
1096                 saddr = sk->sk_rcv_saddr;
1097                 daddr = sk->sk_daddr;
1098         } else {
1099                 const struct iphdr *iph = ip_hdr(skb);
1100                 saddr = iph->saddr;
1101                 daddr = iph->daddr;
1102         }
1103
1104         hp = tcp_get_md5sig_pool();
1105         if (!hp)
1106                 goto clear_hash_noput;
1107         req = hp->md5_req;
1108
1109         if (crypto_ahash_init(req))
1110                 goto clear_hash;
1111
             /* Pseudo-header length is the whole segment (skb->len) */
1112         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1113                 goto clear_hash;
             /* Data starts th->doff 32-bit words into the segment */
1114         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1115                 goto clear_hash;
1116         if (tcp_md5_hash_key(hp, key))
1117                 goto clear_hash;
1118         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1119         if (crypto_ahash_final(req))
1120                 goto clear_hash;
1121
1122         tcp_put_md5sig_pool();
1123         return 0;
1124
     /* Failure after the pool was obtained: give it back first */
1125 clear_hash:
1126         tcp_put_md5sig_pool();
     /* Failure before the pool was obtained: nothing to give back */
1127 clear_hash_noput:
1128         memset(md5_hash, 0, 16);
1129         return 1;
1130 }
1131 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1132
1133 #endif
1134
1135 /* Called with rcu_read_lock()
      *
      * Check the TCP-MD5 option of the incoming segment @skb against the key
      * that @sk holds for the segment's source address.
      *
      * Returns true in the three mismatch cases listed in the body (each one
      * also bumps a LINUX_MIB_TCPMD5* counter), false when the segment is
      * acceptable. Without CONFIG_TCP_MD5SIG it always returns false.
      */
1136 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1137                                     const struct sk_buff *skb)
1138 {
1139 #ifdef CONFIG_TCP_MD5SIG
1140         /*
1141          * This gets called for each TCP segment that arrives
1142          * so we want to be efficient.
1143          * We have 3 drop cases:
1144          * o No MD5 hash and one expected.
1145          * o MD5 hash and we're not expecting one.
1146          * o MD5 hash and it's wrong.
1147          */
1148         const __u8 *hash_location = NULL;
1149         struct tcp_md5sig_key *hash_expected;
1150         const struct iphdr *iph = ip_hdr(skb);
1151         const struct tcphdr *th = tcp_hdr(skb);
1152         int genhash;
1153         unsigned char newhash[16];
1154
             /* Keys are indexed by peer address: the sender of this segment */
1155         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1156                                           AF_INET);
1157         hash_location = tcp_parse_md5sig_option(th);
1158
1159         /* We've parsed the options - do we have a hash? */
1160         if (!hash_expected && !hash_location)
1161                 return false;
1162
1163         if (hash_expected && !hash_location) {
1164                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1165                 return true;
1166         }
1167
1168         if (!hash_expected && hash_location) {
1169                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1170                 return true;
1171         }
1172
1173         /* Okay, so this is hash_expected and hash_location -
1174          * so we need to calculate the checksum.
1175          */
             /* sk is passed as NULL so addresses are taken from the IP header */
1176         genhash = tcp_v4_md5_hash_skb(newhash,
1177                                       hash_expected,
1178                                       NULL, skb);
1179
             /* Reject on computation failure as well as on digest mismatch */
1180         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1181                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1182                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1183                                      &iph->saddr, ntohs(th->source),
1184                                      &iph->daddr, ntohs(th->dest),
1185                                      genhash ? " tcp_v4_calc_md5_hash failed"
1186                                      : "");
1187                 return true;
1188         }
1189         return false;
1190 #endif
             /* Reached only when CONFIG_TCP_MD5SIG is not set */
1191         return false;
1192 }
1193
1194 static void tcp_v4_init_req(struct request_sock *req,
1195                             const struct sock *sk_listener,
1196                             struct sk_buff *skb)
1197 {
1198         struct inet_request_sock *ireq = inet_rsk(req);
1199
1200         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1201         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1202         ireq->opt = tcp_v4_save_options(skb);
1203 }
1204
1205 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1206                                           struct flowi *fl,
1207                                           const struct request_sock *req,
1208                                           bool *strict)
1209 {
1210         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1211
1212         if (strict) {
1213                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1214                         *strict = true;
1215                 else
1216                         *strict = false;
1217         }
1218
1219         return dst;
1220 }
1221
1222 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
             /* Generic request_sock callbacks used for TCP over IPv4; passed
              * to tcp_conn_request() by tcp_v4_conn_request().
              */
1223         .family         =       PF_INET,
1224         .obj_size       =       sizeof(struct tcp_request_sock),
1225         .rtx_syn_ack    =       tcp_rtx_synack,
1226         .send_ack       =       tcp_v4_reqsk_send_ack,
1227         .destructor     =       tcp_v4_reqsk_destructor,
1228         .send_reset     =       tcp_v4_send_reset,
1229         .syn_ack_timeout =      tcp_syn_ack_timeout,
1230 };
1231
1232 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
             /* IPv4-specific TCP request callbacks; passed to
              * tcp_conn_request() together with tcp_request_sock_ops.
              * The MD5 and syncookie hooks exist only when the matching
              * config option is enabled.
              */
1233         .mss_clamp      =       TCP_MSS_DEFAULT,
1234 #ifdef CONFIG_TCP_MD5SIG
1235         .req_md5_lookup =       tcp_v4_md5_lookup,
1236         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1237 #endif
1238         .init_req       =       tcp_v4_init_req,
1239 #ifdef CONFIG_SYN_COOKIES
1240         .cookie_init_seq =      cookie_v4_init_sequence,
1241 #endif
1242         .route_req      =       tcp_v4_route_req,
1243         .init_seq       =       tcp_v4_init_sequence,
1244         .send_synack    =       tcp_v4_send_synack,
1245 };
1246
1247 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1248 {
1249         /* Never answer to SYNs send to broadcast or multicast */
1250         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1251                 goto drop;
1252
1253         return tcp_conn_request(&tcp_request_sock_ops,
1254                                 &tcp_request_sock_ipv4_ops, sk, skb);
1255
1256 drop:
1257         tcp_listendrop(sk);
1258         return 0;
1259 }
1260 EXPORT_SYMBOL(tcp_v4_conn_request);
1261
1262
1263 /*
1264  * The three way handshake has completed - we got a valid synack -
1265  * now create the new socket.
      *
      * Creates the child socket for @req with tcp_create_openreq_child(),
      * copies addressing and the saved IP options from the request, sets up
      * capabilities and MSS from the route, copies a matching MD5 key from the
      * listener (CONFIG_TCP_MD5SIG), inherits the listener's port and hands
      * the child to inet_ehash_nolisten().
      *
      * @dst may be NULL, in which case a route is looked up here.
      * *@own_req receives the result of inet_ehash_nolisten().
      *
      * Returns the new socket, or NULL after accounting a listen drop when the
      * accept queue is full, the child cannot be created, no route is found
      * or the port cannot be inherited.
1266  */
1267 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1268                                   struct request_sock *req,
1269                                   struct dst_entry *dst,
1270                                   struct request_sock *req_unhash,
1271                                   bool *own_req)
1272 {
1273         struct inet_request_sock *ireq;
1274         struct inet_sock *newinet;
1275         struct tcp_sock *newtp;
1276         struct sock *newsk;
1277 #ifdef CONFIG_TCP_MD5SIG
1278         struct tcp_md5sig_key *key;
1279 #endif
1280         struct ip_options_rcu *inet_opt;
1281
1282         if (sk_acceptq_is_full(sk))
1283                 goto exit_overflow;
1284
1285         newsk = tcp_create_openreq_child(sk, req, skb);
1286         if (!newsk)
1287                 goto exit_nonewsk;
1288
1289         newsk->sk_gso_type = SKB_GSO_TCPV4;
1290         inet_sk_rx_dst_set(newsk, skb);
1291
1292         newtp                 = tcp_sk(newsk);
1293         newinet               = inet_sk(newsk);
1294         ireq                  = inet_rsk(req);
1295         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1296         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1297         newsk->sk_bound_dev_if = ireq->ir_iif;
1298         newinet->inet_saddr           = ireq->ir_loc_addr;
             /* The saved IP options move from the request to the child;
              * clearing ireq->opt keeps the request destructor from freeing
              * them.
              */
1299         inet_opt              = ireq->opt;
1300         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1301         ireq->opt             = NULL;
1302         newinet->mc_index     = inet_iif(skb);
1303         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1304         newinet->rcv_tos      = ip_hdr(skb)->tos;
1305         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1306         if (inet_opt)
1307                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1308         newinet->inet_id = newtp->write_seq ^ jiffies;
1309
1310         if (!dst) {
1311                 dst = inet_csk_route_child_sock(sk, newsk, req);
1312                 if (!dst)
1313                         goto put_and_exit;
1314         } else {
1315                 /* syncookie case : see end of cookie_v4_check() */
1316         }
1317         sk_setup_caps(newsk, dst);
1318
1319         tcp_ca_openreq_child(newsk, dst);
1320
1321         tcp_sync_mss(newsk, dst_mtu(dst));
1322         newtp->advmss = dst_metric_advmss(dst);
             /* A smaller user-configured MSS on the listener wins */
1323         if (tcp_sk(sk)->rx_opt.user_mss &&
1324             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1325                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1326
1327         tcp_initialize_rcv_mss(newsk);
1328
1329 #ifdef CONFIG_TCP_MD5SIG
1330         /* Copy over the MD5 key from the original socket */
1331         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1332                                 AF_INET);
1333         if (key) {
1334                 /*
1335                  * We're using one, so create a matching key
1336                  * on the newsk structure. If we fail to get
1337                  * memory, then we end up not copying the key
1338                  * across. Shucks.
1339                  */
1340                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1341                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1342                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1343         }
1344 #endif
1345
1346         if (__inet_inherit_port(sk, newsk) < 0)
1347                 goto put_and_exit;
1348         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1349         if (*own_req)
1350                 tcp_move_syn(newtp, req);
1351
1352         return newsk;
1353
     /* Error exits: each label falls through to the ones below it */
1354 exit_overflow:
1355         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1356 exit_nonewsk:
1357         dst_release(dst);
1358 exit:
1359         tcp_listendrop(sk);
1360         return NULL;
     /* The child already exists: tear it down, then count the drop */
1361 put_and_exit:
1362         inet_csk_prepare_forced_close(newsk);
1363         tcp_done(newsk);
1364         goto exit;
1365 }
1366 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1367
/* With CONFIG_SYN_COOKIES, pass any segment that does not carry SYN to
 * cookie_v4_check() and return its result; otherwise return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        if (!tcp_hdr(skb)->syn)
                return cookie_v4_check(sk, skb);
#endif
        return sk;
}
1378
1379 /* The socket must have its spinlock held when we get
1380  * here, unless it is a TCP_LISTEN socket.
1381  *
1382  * We have a potential double-lock case here, so even when
1383  * doing backlog processing we use the BH locking scheme.
1384  * This is because we cannot sleep with the original spinlock
1385  * held.
      *
      * Dispatch on sk->sk_state: established sockets go straight to
      * tcp_rcv_established(); listeners first run the syncookie check, which
      * may hand back a child socket; everything else goes through
      * tcp_rcv_state_process(). A non-zero result from the state machine or
      * from tcp_child_process() triggers a reset, after which @skb is freed.
      * Every path returns 0.
1386  */
1387 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1388 {
1389         struct sock *rsk;
1390
1391         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1392                 struct dst_entry *dst = sk->sk_rx_dst;
1393
1394                 sock_rps_save_rxhash(sk, skb);
1395                 sk_mark_napi_id(sk, skb);
1396                 if (dst) {
                             /* Drop the cached rx dst when the packet came in
                              * on another interface or ->check() rejects it.
                              */
1397                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1398                             !dst->ops->check(dst, 0)) {
1399                                 dst_release(dst);
1400                                 sk->sk_rx_dst = NULL;
1401                         }
1402                 }
1403                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1404                 return 0;
1405         }
1406
1407         if (tcp_checksum_complete(skb))
1408                 goto csum_err;
1409
1410         if (sk->sk_state == TCP_LISTEN) {
1411                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1412
1413                 if (!nsk)
1414                         goto discard;
                     /* A different socket came back: process the segment on
                      * that child instead of the listener.
                      */
1415                 if (nsk != sk) {
1416                         sock_rps_save_rxhash(nsk, skb);
1417                         sk_mark_napi_id(nsk, skb);
1418                         if (tcp_child_process(sk, nsk, skb)) {
1419                                 rsk = nsk;
1420                                 goto reset;
1421                         }
1422                         return 0;
1423                 }
1424         } else
1425                 sock_rps_save_rxhash(sk, skb);
1426
1427         if (tcp_rcv_state_process(sk, skb)) {
1428                 rsk = sk;
1429                 goto reset;
1430         }
1431         return 0;
1432
     /* rsk is the socket on whose behalf the reset is sent */
1433 reset:
1434         tcp_v4_send_reset(rsk, skb);
1435 discard:
1436         kfree_skb(skb);
1437         /* Be careful here. If this function gets more complicated and
1438          * gcc suffers from register pressure on the x86, sk (in %ebx)
1439          * might be destroyed here. This current version compiles correctly,
1440          * but you have been warned.
1441          */
1442         return 0;
1443
1444 csum_err:
1445         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1446         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1447         goto discard;
1448 }
1449 EXPORT_SYMBOL(tcp_v4_do_rcv);
1450
1451 void tcp_v4_early_demux(struct sk_buff *skb)
1452 {
1453         const struct iphdr *iph;
1454         const struct tcphdr *th;
1455         struct sock *sk;
1456
1457         if (skb->pkt_type != PACKET_HOST)
1458                 return;
1459
1460         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1461                 return;
1462
1463         iph = ip_hdr(skb);
1464         th = tcp_hdr(skb);
1465
1466         if (th->doff < sizeof(struct tcphdr) / 4)
1467                 return;
1468
1469         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1470                                        iph->saddr, th->source,
1471                                        iph->daddr, ntohs(th->dest),
1472                                        skb->skb_iif);
1473         if (sk) {
1474                 skb->sk = sk;
1475                 skb->destructor = sock_edemux;
1476                 if (sk_fullsock(sk)) {
1477                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1478
1479                         if (dst)
1480                                 dst = dst_check(dst, 0);
1481                         if (dst &&
1482                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1483                                 skb_dst_set_noref(skb, dst);
1484                 }
1485         }
1486 }
1487
1488 /* Packet is added to VJ-style prequeue for processing in process
1489  * context, if a reader task is waiting. Apparently, this exciting
1490  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1491  * failed somewhere. Latency? Burstiness? Well, at least now we will
1492  * see, why it failed. 8)8)                               --ANK
1493  *
      * Returns false when @skb was not queued: sysctl_tcp_low_latency is set,
      * no task is recorded in tp->ucopy.task, or the skb carries no payload
      * while the prequeue is empty. Returns true once @skb has been put on
      * the prequeue (and possibly already processed by the flush below).
1494  */
1495 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1496 {
1497         struct tcp_sock *tp = tcp_sk(sk);
1498
1499         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1500                 return false;
1501
1502         if (skb->len <= tcp_hdrlen(skb) &&
1503             skb_queue_len(&tp->ucopy.prequeue) == 0)
1504                 return false;
1505
1506         /* Before escaping RCU protected region, we need to take care of skb
1507          * dst. Prequeue is only enabled for established sockets.
1508          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1509          * Instead of doing full sk_rx_dst validity here, let's perform
1510          * an optimistic check.
1511          */
1512         if (likely(sk->sk_rx_dst))
1513                 skb_dst_drop(skb);
1514         else
1515                 skb_dst_force_safe(skb);
1516
1517         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1518         tp->ucopy.memory += skb->truesize;
             /* Prequeue holds 32 or more skbs, or prequeue memory plus
              * receive memory exceeds sk_rcvbuf: process everything queued
              * so far right here.
              */
1519         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1520             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1521                 struct sk_buff *skb1;
1522
1523                 BUG_ON(sock_owned_by_user(sk));
1524                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1525                                 skb_queue_len(&tp->ucopy.prequeue));
1526
1527                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1528                         sk_backlog_rcv(sk, skb1);
1529
1530                 tp->ucopy.memory = 0;
1531         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
                     /* First skb on the prequeue: wake the sleeper and arm
                      * the delayed-ACK timer unless an ACK is already
                      * scheduled.
                      */
1532                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1533                                            POLLIN | POLLRDNORM | POLLRDBAND);
1534                 if (!inet_csk_ack_scheduled(sk))
1535                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1536                                                   (3 * tcp_rto_min(sk)) / 4,
1537                                                   TCP_RTO_MAX);
1538         }
1539         return true;
1540 }
1541 EXPORT_SYMBOL(tcp_prequeue);
1542
/*
 * Queue skb on sk's backlog through sk_add_backlog().
 *
 * The limit passed down is sk_rcvbuf + sk_sndbuf plus 64KB of headroom.
 *
 * Returns false when the skb was queued.  Returns true when
 * sk_add_backlog() refused it; on that path the socket has already been
 * unlocked with bh_unlock_sock() and LINUX_MIB_TCPBACKLOGDROP has been
 * incremented before returning.
 */
1543 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1544 {
1545         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1546
1547         /* Only socket owner can try to collapse/prune rx queues
1548          * to reduce memory overhead, so add a little headroom here.
1549          * Few sockets backlog are possibly concurrently non empty.
1550          */
1551         limit += 64*1024;
1552
1553         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1554          * we can fix skb->truesize to its real value to avoid future drops.
1555          * This is valid because skb is not yet charged to the socket.
1556          * It has been noticed pure SACK packets were sometimes dropped
1557          * (if cooked by drivers without copybreak feature).
1558          */
1559         if (!skb->data_len)
1560                 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
1561
1562         if (unlikely(sk_add_backlog(sk, skb, limit))) {
                     /* Backlog refused the skb: unlock here and report the drop. */
1563                 bh_unlock_sock(sk);
1564                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1565                 return true;
1566         }
1567         return false;
1568 }
1569 EXPORT_SYMBOL(tcp_add_backlog);
1570
/*
 * Run the socket filter on skb via sk_filter_trim_cap(), passing the TCP
 * header length (th->doff * 4) as the cap argument.
 *
 * When the filter returns 0, the number of bytes by which skb->len shrank
 * is subtracted from TCP_SKB_CB(skb)->end_seq.  Any non-zero result of
 * sk_filter_trim_cap() is returned unchanged and end_seq is left alone.
 */
1571 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1572 {
1573         struct tcphdr *th = (struct tcphdr *)skb->data;
             /* Length before filtering; turned into "bytes trimmed" below. */
1574         unsigned int eaten = skb->len;
1575         int err;
1576
1577         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1578         if (!err) {
1579                 eaten -= skb->len;
1580                 TCP_SKB_CB(skb)->end_seq -= eaten;
1581         }
1582         return err;
1583 }
1584 EXPORT_SYMBOL(tcp_filter);
1585
1586 /*
1587  *      From tcp_input.c
1588  */
1589
/*
 * Receive one TCP segment over IPv4.
 *
 * Checks the TCP header length and checksum, fills in TCP_SKB_CB(skb),
 * looks up the matching socket and then either processes the segment
 * directly, puts it on the prequeue, or adds it to the socket backlog.
 * Request (TCP_NEW_SYN_RECV) and TCP_TIME_WAIT sockets get their own
 * handling below.
 *
 * Returns the result of tcp_v4_do_rcv() when that is called from here,
 * and 0 on every other path.  On the drop paths the skb is freed with
 * kfree_skb().
 */
1590 int tcp_v4_rcv(struct sk_buff *skb)
1591 {
1592         struct net *net = dev_net(skb->dev);
1593         const struct iphdr *iph;
1594         const struct tcphdr *th;
1595         bool refcounted;
1596         struct sock *sk;
1597         int ret;
1598
1599         if (skb->pkt_type != PACKET_HOST)
1600                 goto discard_it;
1601
1602         /* Count it even if it's bad */
1603         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1604
1605         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1606                 goto discard_it;
1607
1608         th = (const struct tcphdr *)skb->data;
1609
             /* A data offset shorter than the fixed header is a bad packet. */
1610         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1611                 goto bad_packet;
1612         if (!pskb_may_pull(skb, th->doff * 4))
1613                 goto discard_it;
1614
1615         /* An explanation is required here, I think.
1616          * Packet length and doff are validated by header prediction,
1617          * provided case of th->doff==0 is eliminated.
1618          * So, we defer the checks. */
1619
1620         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1621                 goto csum_error;
1622
1623         th = (const struct tcphdr *)skb->data;
1624         iph = ip_hdr(skb);
1625         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1626          * barrier() makes sure compiler wont play fool^Waliasing games.
1627          */
1628         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1629                 sizeof(struct inet_skb_parm));
1630         barrier();
1631
             /* Host-order sequence numbers; SYN and FIN each add one to end_seq. */
1632         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1633         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1634                                     skb->len - th->doff * 4);
1635         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1636         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1637         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1638         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1639         TCP_SKB_CB(skb)->sacked  = 0;
1640
1641 lookup:
1642         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1643                                th->dest, &refcounted);
1644         if (!sk)
1645                 goto no_tcp_socket;
1646
1647 process:
1648         if (sk->sk_state == TCP_TIME_WAIT)
1649                 goto do_time_wait;
1650
1651         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1652                 struct request_sock *req = inet_reqsk(sk);
1653                 struct sock *nsk;
1654
                     /* The lookup returned a request sock; continue on its listener. */
1655                 sk = req->rsk_listener;
1656                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1657                         sk_drops_add(sk, skb);
1658                         reqsk_put(req);
1659                         goto discard_it;
1660                 }
                     /* Listener is no longer listening: drop the request and look up again. */
1661                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1662                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1663                         goto lookup;
1664                 }
1665                 /* We own a reference on the listener, increase it again
1666                  * as we might lose it too soon.
1667                  */
1668                 sock_hold(sk);
1669                 refcounted = true;
1670                 nsk = tcp_check_req(sk, skb, req, false);
1671                 if (!nsk) {
1672                         reqsk_put(req);
1673                         goto discard_and_relse;
1674                 }
1675                 if (nsk == sk) {
                             /* tcp_check_req() handed back the listener: fall
                              * through and process the skb on it below.
                              */
1676                         reqsk_put(req);
1677                 } else if (tcp_child_process(sk, nsk, skb)) {
1678                         tcp_v4_send_reset(nsk, skb);
1679                         goto discard_and_relse;
1680                 } else {
                             /* Child took the skb: release the listener
                              * reference taken with sock_hold() above.
                              */
1681                         sock_put(sk);
1682                         return 0;
1683                 }
1684         }
1685         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1686                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1687                 goto discard_and_relse;
1688         }
1689
1690         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1691                 goto discard_and_relse;
1692
1693         if (tcp_v4_inbound_md5_hash(sk, skb))
1694                 goto discard_and_relse;
1695
1696         nf_reset(skb);
1697
1698         if (tcp_filter(sk, skb))
1699                 goto discard_and_relse;
             /* Re-read the header pointers now that the socket filter has run. */
1700         th = (const struct tcphdr *)skb->data;
1701         iph = ip_hdr(skb);
1702
1703         skb->dev = NULL;
1704
             /* Listeners go straight to tcp_v4_do_rcv(); the socket lock is
              * not taken on this path.
              */
1705         if (sk->sk_state == TCP_LISTEN) {
1706                 ret = tcp_v4_do_rcv(sk, skb);
1707                 goto put_and_return;
1708         }
1709
1710         sk_incoming_cpu_update(sk);
1711
1712         bh_lock_sock_nested(sk);
1713         tcp_segs_in(tcp_sk(sk), skb);
1714         ret = 0;
             /* Socket not owned by user: try the prequeue, otherwise process
              * now.  Socket owned by user: defer the skb to the backlog.
              */
1715         if (!sock_owned_by_user(sk)) {
1716                 if (!tcp_prequeue(sk, skb))
1717                         ret = tcp_v4_do_rcv(sk, skb);
1718         } else if (tcp_add_backlog(sk, skb)) {
                     /* tcp_add_backlog() already unlocked the socket on failure. */
1719                 goto discard_and_relse;
1720         }
1721         bh_unlock_sock(sk);
1722
1723 put_and_return:
1724         if (refcounted)
1725                 sock_put(sk);
1726
1727         return ret;
1728
1729 no_tcp_socket:
1730         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1731                 goto discard_it;
1732
             /* No socket: count checksum/format errors, otherwise answer with
              * a reset.  csum_error and bad_packet are also entered by goto
              * from the checks above.
              */
1733         if (tcp_checksum_complete(skb)) {
1734 csum_error:
1735                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1736 bad_packet:
1737                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1738         } else {
1739                 tcp_v4_send_reset(NULL, skb);
1740         }
1741
1742 discard_it:
1743         /* Discard frame. */
1744         kfree_skb(skb);
1745         return 0;
1746
1747 discard_and_relse:
1748         sk_drops_add(sk, skb);
1749         if (refcounted)
1750                 sock_put(sk);
1751         goto discard_it;
1752
1753 do_time_wait:
             /* sk is a timewait socket from here on (accessed via inet_twsk()). */
1754         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1755                 inet_twsk_put(inet_twsk(sk));
1756                 goto discard_it;
1757         }
1758
1759         if (tcp_checksum_complete(skb)) {
1760                 inet_twsk_put(inet_twsk(sk));
1761                 goto csum_error;
1762         }
1763         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1764         case TCP_TW_SYN: {
1765                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1766                                                         &tcp_hashinfo, skb,
1767                                                         __tcp_hdrlen(th),
1768                                                         iph->saddr, th->source,
1769                                                         iph->daddr, th->dest,
1770                                                         inet_iif(skb));
                     /* A listener exists: retire the timewait socket and
                      * restart processing on the listener, with refcounted
                      * cleared so no sock_put() is done for it later.
                      */
1771                 if (sk2) {
1772                         inet_twsk_deschedule_put(inet_twsk(sk));
1773                         sk = sk2;
1774                         refcounted = false;
1775                         goto process;
1776                 }
1777                 /* Fall through to ACK */
1778         }
1779         case TCP_TW_ACK:
1780                 tcp_v4_timewait_ack(sk, skb);
1781                 break;
1782         case TCP_TW_RST:
1783                 tcp_v4_send_reset(sk, skb);
1784                 inet_twsk_deschedule_put(inet_twsk(sk));
1785                 goto discard_it;
1786         case TCP_TW_SUCCESS:;
1787         }
1788         goto discard_it;
1789 }
1790
/*
 * Timewait socket operations for TCP: object size plus the uniqueness and
 * destructor callbacks.  Referenced from tcp_prot.twsk_prot.
 */
1791 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1792         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1793         .twsk_unique    = tcp_twsk_unique,
1794         .twsk_destructor= tcp_twsk_destructor,
1795 };
1796
/*
 * Remember the dst entry of skb on the socket.
 *
 * If skb has a dst and dst_hold_safe() returns non-zero for it, the dst is
 * stored in sk->sk_rx_dst and skb->skb_iif is stored in rx_dst_ifindex.
 * Otherwise the socket is left untouched.
 */
1797 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1798 {
1799         struct dst_entry *dst = skb_dst(skb);
1800
1801         if (dst && dst_hold_safe(dst)) {
1802                 sk->sk_rx_dst = dst;
1803                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1804         }
1805 }
1806 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1807
/*
 * IPv4 flavour of the address-family specific connection socket
 * operations.  Installed as icsk->icsk_af_ops by tcp_v4_init_sock().
 * The compat_* handlers are only present with CONFIG_COMPAT.
 */
1808 const struct inet_connection_sock_af_ops ipv4_specific = {
1809         .queue_xmit        = ip_queue_xmit,
1810         .send_check        = tcp_v4_send_check,
1811         .rebuild_header    = inet_sk_rebuild_header,
1812         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1813         .conn_request      = tcp_v4_conn_request,
1814         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1815         .net_header_len    = sizeof(struct iphdr),
1816         .setsockopt        = ip_setsockopt,
1817         .getsockopt        = ip_getsockopt,
1818         .addr2sockaddr     = inet_csk_addr2sockaddr,
1819         .sockaddr_len      = sizeof(struct sockaddr_in),
1820 #ifdef CONFIG_COMPAT
1821         .compat_setsockopt = compat_ip_setsockopt,
1822         .compat_getsockopt = compat_ip_getsockopt,
1823 #endif
1824         .mtu_reduced       = tcp_v4_mtu_reduced,
1825 };
1826 EXPORT_SYMBOL(ipv4_specific);
1827
1828 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5 signature callbacks (key lookup, hash calculation, key
 * parsing).  Installed as tcp_sk(sk)->af_specific by tcp_v4_init_sock();
 * only built when CONFIG_TCP_MD5SIG is set.
 */
1829 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1830         .md5_lookup             = tcp_v4_md5_lookup,
1831         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1832         .md5_parse              = tcp_v4_parse_md5_keys,
1833 };
1834 #endif
1835
1836 /* NOTE: A lot of things set to zero explicitly by call to
1837  *       sk_alloc() so need not be done here.
1838  */
/*
 * Socket init hook for IPv4 TCP (tcp_prot.init).
 *
 * Runs the common tcp_init_sock() and then points the socket at the IPv4
 * specific operation tables.  Always returns 0.
 */
1839 static int tcp_v4_init_sock(struct sock *sk)
1840 {
1841         struct inet_connection_sock *icsk = inet_csk(sk);
1842
1843         tcp_init_sock(sk);
1844
1845         icsk->icsk_af_ops = &ipv4_specific;
1846
1847 #ifdef CONFIG_TCP_MD5SIG
1848         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1849 #endif
1850
1851         return 0;
1852 }
1853
/*
 * Socket destroy hook (tcp_prot.destroy).
 *
 * Tears down the TCP state attached to sk: transmit timers, congestion
 * control state, the write, out-of-order and prequeue queues, the MD5 key
 * list (CONFIG_TCP_MD5SIG), the bind bucket reference, any pending
 * fastopen request and the saved SYN.  Finally the allocated-sockets
 * counter is decremented.
 */
1854 void tcp_v4_destroy_sock(struct sock *sk)
1855 {
1856         struct tcp_sock *tp = tcp_sk(sk);
1857
1858         tcp_clear_xmit_timers(sk);
1859
1860         tcp_cleanup_congestion_control(sk);
1861
1862         /* Clean up the write buffer. */
1863         tcp_write_queue_purge(sk);
1864
1865         /* Cleans up our, hopefully empty, out_of_order_queue. */
1866         skb_rbtree_purge(&tp->out_of_order_queue);
1867
1868 #ifdef CONFIG_TCP_MD5SIG
1869         /* Clean up the MD5 key list, if any */
1870         if (tp->md5sig_info) {
1871                 tcp_clear_md5_list(sk);
1872                 kfree_rcu(tp->md5sig_info, rcu);
1873                 tp->md5sig_info = NULL;
1874         }
1875 #endif
1876
1877         /* Clean prequeue, it must be empty really */
1878         __skb_queue_purge(&tp->ucopy.prequeue);
1879
1880         /* Clean up a referenced TCP bind bucket. */
1881         if (inet_csk(sk)->icsk_bind_hash)
1882                 inet_put_port(sk);
1883
             /* A fastopen request sock still attached at this point is a bug. */
1884         BUG_ON(tp->fastopen_rsk);
1885
1886         /* If socket is aborted during connect operation */
1887         tcp_free_fastopen_req(tp);
1888         tcp_saved_syn_free(tp);
1889
1890         sk_sockets_allocated_dec(sk);
1891 }
1892 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1893
1894 #ifdef CONFIG_PROC_FS
1895 /* Proc filesystem TCP sock list dumping. */
1896
1897 /*
1898  * Get the next listener socket following cur.  If cur is NULL, get the
1899  * first socket starting from the bucket given in st->bucket; when st->bucket
1900  * is zero the very first socket in the hash table is returned.
1901  */
/*
 * Locking: a socket is returned with the lock of its listening bucket
 * still held.  With cur == NULL the lock of st->bucket is taken here.
 * With cur != NULL no lock is taken, yet the bucket lock is released once
 * the chain is exhausted, so the caller is expected to still hold it from
 * the call that returned cur.  NULL is returned with no lock held.
 *
 * Only sockets in the seq_file's netns whose family equals st->family are
 * returned.  st->num and st->offset are advanced when stepping past cur.
 */
1902 static void *listening_get_next(struct seq_file *seq, void *cur)
1903 {
1904         struct tcp_iter_state *st = seq->private;
1905         struct net *net = seq_file_net(seq);
1906         struct inet_listen_hashbucket *ilb;
1907         struct sock *sk = cur;
1908
1909         if (!sk) {
1910 get_head:
                     /* Start of a bucket: lock it and begin at its head. */
1911                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1912                 spin_lock(&ilb->lock);
1913                 sk = sk_head(&ilb->head);
1914                 st->offset = 0;
1915                 goto get_sk;
1916         }
1917         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1918         ++st->num;
1919         ++st->offset;
1920
1921         sk = sk_next(sk);
1922 get_sk:
1923         sk_for_each_from(sk) {
1924                 if (!net_eq(sock_net(sk), net))
1925                         continue;
1926                 if (sk->sk_family == st->family)
1927                         return sk;
1928         }
             /* Chain exhausted: drop this bucket's lock and try the next one. */
1929         spin_unlock(&ilb->lock);
1930         st->offset = 0;
1931         if (++st->bucket < INET_LHTABLE_SIZE)
1932                 goto get_head;
1933         return NULL;
1934 }
1935
/*
 * Return the listening socket at index *pos, counting from the first
 * bucket of the listening hash.
 *
 * *pos is decremented once per socket stepped over, so when the listening
 * hash runs out first (NULL is returned) *pos holds the remaining count.
 */
1936 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1937 {
1938         struct tcp_iter_state *st = seq->private;
1939         void *rc;
1940
1941         st->bucket = 0;
1942         st->offset = 0;
1943         rc = listening_get_next(seq, NULL);
1944
1945         while (rc && *pos) {
1946                 rc = listening_get_next(seq, rc);
1947                 --*pos;
1948         }
1949         return rc;
1950 }
1951
/* True when the established-hash chain of bucket st->bucket has no entries. */
1952 static inline bool empty_bucket(const struct tcp_iter_state *st)
1953 {
1954         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1955 }
1956
1957 /*
1958  * Get first established socket starting from bucket given in st->bucket.
1959  * If st->bucket is zero, the very first socket in the hash is returned.
1960  */
/*
 * Scans buckets from st->bucket up to ehash_mask and returns the first
 * socket whose family equals st->family and whose netns matches the
 * seq_file.  st->offset is reset to 0 and st->bucket is left at the bucket
 * of the returned socket.
 *
 * Locking: when a socket is returned, the lock of its bucket is still held
 * (the "goto out" skips the unlock).  NULL is returned with no lock held.
 */
1961 static void *established_get_first(struct seq_file *seq)
1962 {
1963         struct tcp_iter_state *st = seq->private;
1964         struct net *net = seq_file_net(seq);
1965         void *rc = NULL;
1966
1967         st->offset = 0;
1968         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1969                 struct sock *sk;
1970                 struct hlist_nulls_node *node;
1971                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1972
1973                 /* Lockless fast path for the common case of empty buckets */
1974                 if (empty_bucket(st))
1975                         continue;
1976
1977                 spin_lock_bh(lock);
1978                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1979                         if (sk->sk_family != st->family ||
1980                             !net_eq(sock_net(sk), net)) {
1981                                 continue;
1982                         }
1983                         rc = sk;
                             /* Leave with the bucket lock held. */
1984                         goto out;
1985                 }
1986                 spin_unlock_bh(lock);
1987         }
1988 out:
1989         return rc;
1990 }
1991
/*
 * Return the next matching established-hash socket after cur.
 *
 * The rest of cur's chain is searched first.  When it is exhausted, the
 * lock of bucket st->bucket is released (it is not taken here, so the
 * caller is expected to hold it), st->bucket is advanced and the search
 * continues through established_get_first().  st->num and st->offset are
 * incremented on entry.
 */
1992 static void *established_get_next(struct seq_file *seq, void *cur)
1993 {
1994         struct sock *sk = cur;
1995         struct hlist_nulls_node *node;
1996         struct tcp_iter_state *st = seq->private;
1997         struct net *net = seq_file_net(seq);
1998
1999         ++st->num;
2000         ++st->offset;
2001
2002         sk = sk_nulls_next(sk);
2003
2004         sk_nulls_for_each_from(sk, node) {
2005                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2006                         return sk;
2007         }
2008
2009         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2010         ++st->bucket;
2011         return established_get_first(seq);
2012 }
2013
/*
 * Return the established-hash socket at index pos, counting matching
 * sockets from bucket 0, or NULL if there are fewer than pos + 1 of them.
 */
2014 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2015 {
2016         struct tcp_iter_state *st = seq->private;
2017         void *rc;
2018
2019         st->bucket = 0;
2020         rc = established_get_first(seq);
2021
2022         while (rc && pos) {
2023                 rc = established_get_next(seq, rc);
2024                 --pos;
2025         }
2026         return rc;
2027 }
2028
/*
 * Return the socket at overall index pos, listening sockets first and
 * established-hash sockets after them.  st->state records which of the two
 * tables the returned socket came from.
 */
2029 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2030 {
2031         void *rc;
2032         struct tcp_iter_state *st = seq->private;
2033
2034         st->state = TCP_SEQ_STATE_LISTENING;
             /* pos is passed by address: listening_get_idx() decrements it for
              * each listener it steps over, leaving the index to use in the
              * established hash.
              */
2035         rc        = listening_get_idx(seq, &pos);
2036
2037         if (!rc) {
2038                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2039                 rc        = established_get_idx(seq, pos);
2040         }
2041
2042         return rc;
2043 }
2044
/*
 * Try to get back to the position saved in the iterator state
 * (st->state, st->bucket, st->offset) by restarting at the head of
 * st->bucket and stepping forward "offset" times.
 *
 * If the listening hash runs out while seeking, the search continues in
 * the established hash from bucket 0.  st->num is restored to its entry
 * value afterwards because the *_get_next() helpers increment it.
 * Returns the socket found, or NULL.
 */
2045 static void *tcp_seek_last_pos(struct seq_file *seq)
2046 {
2047         struct tcp_iter_state *st = seq->private;
2048         int offset = st->offset;
2049         int orig_num = st->num;
2050         void *rc = NULL;
2051
2052         switch (st->state) {
2053         case TCP_SEQ_STATE_LISTENING:
2054                 if (st->bucket >= INET_LHTABLE_SIZE)
2055                         break;
2056                 st->state = TCP_SEQ_STATE_LISTENING;
2057                 rc = listening_get_next(seq, NULL);
2058                 while (offset-- && rc)
2059                         rc = listening_get_next(seq, rc);
2060                 if (rc)
2061                         break;
2062                 st->bucket = 0;
2063                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2064                 /* Fallthrough */
2065         case TCP_SEQ_STATE_ESTABLISHED:
2066                 if (st->bucket > tcp_hashinfo.ehash_mask)
2067                         break;
2068                 rc = established_get_first(seq);
2069                 while (offset-- && rc)
2070                         rc = established_get_next(seq, rc);
2071         }
2072
2073         st->num = orig_num;
2074
2075         return rc;
2076 }
2077
/*
 * seq_file ->start handler.
 *
 * When *pos is non-zero and equals the position recorded in st->last_pos,
 * tcp_seek_last_pos() is tried first.  If that does not apply or finds
 * nothing, the iterator state is reset and the walk restarts from the
 * beginning: SEQ_START_TOKEN for *pos == 0, otherwise the socket at index
 * *pos - 1.  st->last_pos is updated to *pos before returning.
 */
2078 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2079 {
2080         struct tcp_iter_state *st = seq->private;
2081         void *rc;
2082
2083         if (*pos && *pos == st->last_pos) {
2084                 rc = tcp_seek_last_pos(seq);
2085                 if (rc)
2086                         goto out;
2087         }
2088
2089         st->state = TCP_SEQ_STATE_LISTENING;
2090         st->num = 0;
2091         st->bucket = 0;
2092         st->offset = 0;
2093         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2094
2095 out:
2096         st->last_pos = *pos;
2097         return rc;
2098 }
2099
/*
 * seq_file ->next handler.
 *
 * After SEQ_START_TOKEN the first socket overall is returned.  Otherwise
 * the walk advances within the table named by st->state, switching from
 * the listening hash to bucket 0 of the established hash when the former
 * is exhausted.  *pos is incremented and mirrored into st->last_pos on
 * every call.
 */
2100 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2101 {
2102         struct tcp_iter_state *st = seq->private;
2103         void *rc = NULL;
2104
2105         if (v == SEQ_START_TOKEN) {
2106                 rc = tcp_get_idx(seq, 0);
2107                 goto out;
2108         }
2109
2110         switch (st->state) {
2111         case TCP_SEQ_STATE_LISTENING:
2112                 rc = listening_get_next(seq, v);
2113                 if (!rc) {
2114                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2115                         st->bucket = 0;
2116                         st->offset = 0;
2117                         rc        = established_get_first(seq);
2118                 }
2119                 break;
2120         case TCP_SEQ_STATE_ESTABLISHED:
2121                 rc = established_get_next(seq, v);
2122                 break;
2123         }
2124 out:
2125         ++*pos;
2126         st->last_pos = *pos;
2127         return rc;
2128 }
2129
/*
 * seq_file ->stop handler: release the bucket lock for the current state.
 *
 * In the listening state the listening bucket st->bucket is unlocked
 * unless v is SEQ_START_TOKEN.  In the established state the ehash bucket
 * lock is released only when v is non-NULL.
 */
2130 static void tcp_seq_stop(struct seq_file *seq, void *v)
2131 {
2132         struct tcp_iter_state *st = seq->private;
2133
2134         switch (st->state) {
2135         case TCP_SEQ_STATE_LISTENING:
2136                 if (v != SEQ_START_TOKEN)
2137                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2138                 break;
2139         case TCP_SEQ_STATE_ESTABLISHED:
2140                 if (v)
2141                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2142                 break;
2143         }
2144 }
2145
/*
 * open() handler for the TCP proc files.
 *
 * Opens the seq_file through seq_open_net() with the seq_ops of the
 * tcp_seq_afinfo stored as proc entry data, requesting a private area of
 * sizeof(struct tcp_iter_state).  The iterator's family is then taken from
 * afinfo and last_pos is reset to 0.
 *
 * Returns 0 on success or the negative error from seq_open_net().
 */
2146 int tcp_seq_open(struct inode *inode, struct file *file)
2147 {
2148         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2149         struct tcp_iter_state *s;
2150         int err;
2151
2152         err = seq_open_net(inode, file, &afinfo->seq_ops,
2153                           sizeof(struct tcp_iter_state));
2154         if (err < 0)
2155                 return err;
2156
2157         s = ((struct seq_file *)file->private_data)->private;
2158         s->family               = afinfo->family;
2159         s->last_pos             = 0;
2160         return 0;
2161 }
2162 EXPORT_SYMBOL(tcp_seq_open);
2163
/*
 * Register the proc entry described by afinfo in net->proc_net.
 *
 * The start/next/stop members of afinfo->seq_ops are filled in with the
 * shared TCP iterator handlers; the remaining members are left as the
 * caller set them.  The entry is created read-only (S_IRUGO) with afinfo
 * as its data.
 *
 * Returns 0 on success, -ENOMEM if proc_create_data() fails.
 */
2164 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2165 {
2166         int rc = 0;
2167         struct proc_dir_entry *p;
2168
2169         afinfo->seq_ops.start           = tcp_seq_start;
2170         afinfo->seq_ops.next            = tcp_seq_next;
2171         afinfo->seq_ops.stop            = tcp_seq_stop;
2172
2173         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2174                              afinfo->seq_fops, afinfo);
2175         if (!p)
2176                 rc = -ENOMEM;
2177         return rc;
2178 }
2179 EXPORT_SYMBOL(tcp_proc_register);
2180
/* Remove the proc entry named afinfo->name from net->proc_net. */
2181 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2182 {
2183         remove_proc_entry(afinfo->name, net->proc_net);
2184 }
2185 EXPORT_SYMBOL(tcp_proc_unregister);
2186
/*
 * Print one line for a request sock (entry number i) into f.
 *
 * The state column is always TCP_SYN_RECV, both queue columns are 0, the
 * timer column is 1 and its value is the time left on req->rsk_timer.  The
 * uid is that of the listener (req->rsk_listener).  No trailing newline is
 * written here.
 */
2187 static void get_openreq4(const struct request_sock *req,
2188                          struct seq_file *f, int i)
2189 {
2190         const struct inet_request_sock *ireq = inet_rsk(req);
2191         long delta = req->rsk_timer.expires - jiffies;
2192
2193         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2194                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2195                 i,
2196                 ireq->ir_loc_addr,
2197                 ireq->ir_num,
2198                 ireq->ir_rmt_addr,
2199                 ntohs(ireq->ir_rmt_port),
2200                 TCP_SYN_RECV,
2201                 0, 0, /* could print option size, but that is af dependent. */
2202                 1,    /* timers active (only the expire timer) */
2203                 jiffies_delta_to_clock_t(delta),
2204                 req->num_timeout,
2205                 from_kuid_munged(seq_user_ns(f),
2206                                  sock_i_uid(req->rsk_listener)),
2207                 0,  /* non standard timer */
2208                 0, /* open_requests have no inode */
2209                 0,
2210                 req);
2211 }
2212
/*
 * Print one line for a full TCP socket (entry number i) into f.
 *
 * Timer column values written below:
 *   1 - retransmit, reordering timeout or loss probe timer pending
 *   4 - zero window probe timer pending
 *   2 - sk->sk_timer pending
 *   0 - none of the above
 *
 * rx_queue is sk_ack_backlog for listeners and rcv_nxt - copied_seq
 * (clamped at 0) otherwise.  The last column is fastopenq->max_qlen for
 * listeners; for other sockets it is -1 while in initial slow start and
 * snd_ssthresh afterwards.  No trailing newline is written here.
 */
2213 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2214 {
2215         int timer_active;
2216         unsigned long timer_expires;
2217         const struct tcp_sock *tp = tcp_sk(sk);
2218         const struct inet_connection_sock *icsk = inet_csk(sk);
2219         const struct inet_sock *inet = inet_sk(sk);
2220         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2221         __be32 dest = inet->inet_daddr;
2222         __be32 src = inet->inet_rcv_saddr;
2223         __u16 destp = ntohs(inet->inet_dport);
2224         __u16 srcp = ntohs(inet->inet_sport);
2225         int rx_queue;
2226         int state;
2227
2228         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2229             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2230             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2231                 timer_active    = 1;
2232                 timer_expires   = icsk->icsk_timeout;
2233         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2234                 timer_active    = 4;
2235                 timer_expires   = icsk->icsk_timeout;
2236         } else if (timer_pending(&sk->sk_timer)) {
2237                 timer_active    = 2;
2238                 timer_expires   = sk->sk_timer.expires;
2239         } else {
                     /* No timer: use "now" so the printed delta is zero-based. */
2240                 timer_active    = 0;
2241                 timer_expires = jiffies;
2242         }
2243
2244         state = sk_state_load(sk);
2245         if (state == TCP_LISTEN)
2246                 rx_queue = sk->sk_ack_backlog;
2247         else
2248                 /* Because we don't lock the socket,
2249                  * we might find a transient negative value.
2250                  */
2251                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2252
2253         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2254                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2255                 i, src, srcp, dest, destp, state,
2256                 tp->write_seq - tp->snd_una,
2257                 rx_queue,
2258                 timer_active,
2259                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2260                 icsk->icsk_retransmits,
2261                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2262                 icsk->icsk_probes_out,
2263                 sock_i_ino(sk),
2264                 atomic_read(&sk->sk_refcnt), sk,
2265                 jiffies_to_clock_t(icsk->icsk_rto),
2266                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2267                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2268                 tp->snd_cwnd,
2269                 state == TCP_LISTEN ?
2270                     fastopenq->max_qlen :
2271                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2272 }
2273
/*
 * Print one line for a timewait socket (entry number i) into f.
 *
 * The state column is tw->tw_substate, the timer column is 3 with the time
 * left on tw->tw_timer, and the queue, retransmit, uid, timeout and inode
 * columns are all 0.  No trailing newline is written here.
 */
2274 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2275                                struct seq_file *f, int i)
2276 {
2277         long delta = tw->tw_timer.expires - jiffies;
2278         __be32 dest, src;
2279         __u16 destp, srcp;
2280
2281         dest  = tw->tw_daddr;
2282         src   = tw->tw_rcv_saddr;
2283         destp = ntohs(tw->tw_dport);
2284         srcp  = ntohs(tw->tw_sport);
2285
2286         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2287                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2288                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2289                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2290                 atomic_read(&tw->tw_refcnt), tw);
2291 }
2292
/* Line width used by tcp4_seq_show(): seq_setwidth() is given TMPSZ - 1. */
2293 #define TMPSZ 150
2294
/*
 * seq_file ->show handler for the IPv4 TCP proc file.
 *
 * Emits the column header for SEQ_START_TOKEN; otherwise dispatches on
 * sk->sk_state to the timewait, request sock or full socket printer, using
 * st->num as the entry number.  Every line is finished by seq_pad() with
 * '\n'.  Always returns 0.
 */
2295 static int tcp4_seq_show(struct seq_file *seq, void *v)
2296 {
2297         struct tcp_iter_state *st;
2298         struct sock *sk = v;
2299
2300         seq_setwidth(seq, TMPSZ - 1);
2301         if (v == SEQ_START_TOKEN) {
2302                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2303                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2304                            "inode");
2305                 goto out;
2306         }
2307         st = seq->private;
2308
2309         if (sk->sk_state == TCP_TIME_WAIT)
2310                 get_timewait4_sock(v, seq, st->num);
2311         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2312                 get_openreq4(v, seq, st->num);
2313         else
2314                 get_tcp4_sock(v, seq, st->num);
2315 out:
2316         seq_pad(seq, '\n');
2317         return 0;
2318 }
2319
/*
 * File operations for the TCP proc file: tcp_seq_open() on open, the
 * generic seq_file read/llseek helpers, and seq_release_net() on release.
 */
2320 static const struct file_operations tcp_afinfo_seq_fops = {
2321         .owner   = THIS_MODULE,
2322         .open    = tcp_seq_open,
2323         .read    = seq_read,
2324         .llseek  = seq_lseek,
2325         .release = seq_release_net
2326 };
2327
/*
 * Description of the IPv4 proc entry: named "tcp", family AF_INET.  Only
 * .show is set in seq_ops here; start/next/stop are filled in by
 * tcp_proc_register().
 */
2328 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2329         .name           = "tcp",
2330         .family         = AF_INET,
2331         .seq_fops       = &tcp_afinfo_seq_fops,
2332         .seq_ops        = {
2333                 .show           = tcp4_seq_show,
2334         },
2335 };
2336
/* Per-netns init: register the IPv4 "tcp" proc entry for net. */
2337 static int __net_init tcp4_proc_init_net(struct net *net)
2338 {
2339         return tcp_proc_register(net, &tcp4_seq_afinfo);
2340 }
2341
/* Per-netns exit: remove the IPv4 "tcp" proc entry of net. */
2342 static void __net_exit tcp4_proc_exit_net(struct net *net)
2343 {
2344         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2345 }
2346
/* Pernet hooks that create and remove the IPv4 "tcp" proc entry. */
2347 static struct pernet_operations tcp4_net_ops = {
2348         .init = tcp4_proc_init_net,
2349         .exit = tcp4_proc_exit_net,
2350 };
2351
/*
 * Register tcp4_net_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys().
 */
2352 int __init tcp4_proc_init(void)
2353 {
2354         return register_pernet_subsys(&tcp4_net_ops);
2355 }
2356
/* Unregister the pernet subsystem registered by tcp4_proc_init(). */
2357 void tcp4_proc_exit(void)
2358 {
2359         unregister_pernet_subsys(&tcp4_net_ops);
2360 }
2361 #endif /* CONFIG_PROC_FS */
2362
/*
 * Protocol descriptor for TCP over IPv4.
 *
 * Wires the generic socket operations to their TCP implementations;
 * .init, .destroy and .backlog_rcv point at the IPv4 specific functions
 * from this file.  It also references the shared TCP memory accounting
 * variables and sysctl arrays, the timewait and request sock operation
 * tables, and tcp_hashinfo.
 */
2363 struct proto tcp_prot = {
2364         .name                   = "TCP",
2365         .owner                  = THIS_MODULE,
2366         .close                  = tcp_close,
2367         .connect                = tcp_v4_connect,
2368         .disconnect             = tcp_disconnect,
2369         .accept                 = inet_csk_accept,
2370         .ioctl                  = tcp_ioctl,
2371         .init                   = tcp_v4_init_sock,
2372         .destroy                = tcp_v4_destroy_sock,
2373         .shutdown               = tcp_shutdown,
2374         .setsockopt             = tcp_setsockopt,
2375         .getsockopt             = tcp_getsockopt,
2376         .keepalive              = tcp_set_keepalive,
2377         .recvmsg                = tcp_recvmsg,
2378         .sendmsg                = tcp_sendmsg,
2379         .sendpage               = tcp_sendpage,
2380         .backlog_rcv            = tcp_v4_do_rcv,
2381         .release_cb             = tcp_release_cb,
2382         .hash                   = inet_hash,
2383         .unhash                 = inet_unhash,
2384         .get_port               = inet_csk_get_port,
2385         .enter_memory_pressure  = tcp_enter_memory_pressure,
2386         .stream_memory_free     = tcp_stream_memory_free,
2387         .sockets_allocated      = &tcp_sockets_allocated,
2388         .orphan_count           = &tcp_orphan_count,
2389         .memory_allocated       = &tcp_memory_allocated,
2390         .memory_pressure        = &tcp_memory_pressure,
2391         .sysctl_mem             = sysctl_tcp_mem,
2392         .sysctl_wmem            = sysctl_tcp_wmem,
2393         .sysctl_rmem            = sysctl_tcp_rmem,
2394         .max_header             = MAX_TCP_HEADER,
2395         .obj_size               = sizeof(struct tcp_sock),
2396         .slab_flags             = SLAB_DESTROY_BY_RCU,
2397         .twsk_prot              = &tcp_timewait_sock_ops,
2398         .rsk_prot               = &tcp_request_sock_ops,
2399         .h.hashinfo             = &tcp_hashinfo,
2400         .no_autobind            = true,
2401 #ifdef CONFIG_COMPAT
2402         .compat_setsockopt      = compat_tcp_setsockopt,
2403         .compat_getsockopt      = compat_tcp_getsockopt,
2404 #endif
2405         .diag_destroy           = tcp_abort,
2406 };
2407 EXPORT_SYMBOL(tcp_prot);
2408
/*
 * Per-netns exit: destroy the control socket stored for every possible CPU
 * in net->ipv4.tcp_sk, then free the per-cpu array itself.  Also used by
 * tcp_sk_init() to unwind after a failure.
 */
2409 static void __net_exit tcp_sk_exit(struct net *net)
2410 {
2411         int cpu;
2412
2413         for_each_possible_cpu(cpu)
2414                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2415         free_percpu(net->ipv4.tcp_sk);
2416 }
2417
/*
 * Per-netns init for TCP.
 *
 * Allocates the per-cpu array net->ipv4.tcp_sk and creates one raw
 * IPPROTO_TCP control socket per possible CPU, then sets the default
 * values of the per-netns TCP sysctls and the timewait death row.
 *
 * Returns 0 on success, -ENOMEM if the per-cpu allocation fails, or the
 * error from inet_ctl_sock_create() after undoing the work through
 * tcp_sk_exit().
 */
2418 static int __net_init tcp_sk_init(struct net *net)
2419 {
2420         int res, cpu, cnt;
2421
2422         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2423         if (!net->ipv4.tcp_sk)
2424                 return -ENOMEM;
2425
2426         for_each_possible_cpu(cpu) {
2427                 struct sock *sk;
2428
2429                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2430                                            IPPROTO_TCP, net);
2431                 if (res)
2432                         goto fail;
2433                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2434                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2435         }
2436
2437         net->ipv4.sysctl_tcp_ecn = 2;
2438         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2439
2440         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2441         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2442         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2443
2444         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2445         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2446         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2447
2448         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2449         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2450         net->ipv4.sysctl_tcp_syncookies = 1;
2451         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2452         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2453         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2454         net->ipv4.sysctl_tcp_orphan_retries = 0;
2455         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2456         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2457         net->ipv4.sysctl_tcp_tw_reuse = 0;
2458
             /* cnt is the number of established-hash buckets; the timewait
              * bucket and SYN backlog defaults below are derived from it.
              */
2459         cnt = tcp_hashinfo.ehash_mask + 1;
2460         net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
2461         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2462         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2463
2464         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2465
2466         return 0;
2467 fail:
2468         tcp_sk_exit(net);
2469
2470         return res;
2471 }
2472
2473 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2474 {
2475         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2476 }
2477
2478 static struct pernet_operations __net_initdata tcp_sk_ops = {
2479        .init       = tcp_sk_init,
2480        .exit       = tcp_sk_exit,
2481        .exit_batch = tcp_sk_exit_batch,
2482 };
2483
2484 void __init tcp_v4_init(void)
2485 {
2486         if (register_pernet_subsys(&tcp_sk_ops))
2487                 panic("Failed to create the TCP control socket.\n");
2488 }