net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/netdma.h>
  76 #include <net/secure_seq.h>
  77 #include <net/tcp_memcontrol.h>
  78
  79 #include <linux/inet.h>
  80 #include <linux/ipv6.h>
  81 #include <linux/stddef.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/seq_file.h>
  84
  85 #include <linux/crypto.h>
  86 #include <linux/scatterlist.h>
  87
  88 int sysctl_tcp_tw_reuse __read_mostly; /* when non-zero, tcp_twsk_unique() may reuse a TIME-WAIT bucket whose timestamp is > 1s old */
  89 int sysctl_tcp_low_latency __read_mostly; /* not read in the part of the file visible here; exported for other users */
  90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  91
  92 /* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() call this before its definition is seen. */
  93 #ifdef CONFIG_TCP_MD5SIG
  94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  95                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  96 #endif
  97 /* Hash tables handed to inet_lookup() and __inet_lookup_listener() by the code below. */
  98 struct inet_hashinfo tcp_hashinfo;
  99 EXPORT_SYMBOL(tcp_hashinfo);
 100
 101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 102 {
 103         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 104                                           ip_hdr(skb)->saddr,
 105                                           tcp_hdr(skb)->dest,
 106                                           tcp_hdr(skb)->source);
 107 }
 108
 109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110 {
 111         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 112         struct tcp_sock *tp = tcp_sk(sk);
 113
 114         /* With PAWS, it is safe from the viewpoint
 115            of data integrity. Even without PAWS it is safe provided sequence
 116            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 117
 118            Actually, the idea is close to VJ's one, only timestamp cache is
 119            held not per host, but per port pair and TW bucket is used as state
 120            holder.
 121
 122            If TW bucket has been already destroyed we fall back to VJ's scheme
 123            and use initial timestamp retrieved from peer table.
 124          */
 125         if (tcptw->tw_ts_recent_stamp &&
 126             (twp == NULL || (sysctl_tcp_tw_reuse &&
 127                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 128                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 129                 if (tp->write_seq == 0)
 130                         tp->write_seq = 1;
 131                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 132                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 133                 sock_hold(sktw);
 134                 return 1;
 135         }
 136
 137         return 0;
 138 }
 139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 140
 141 static int tcp_repair_connect(struct sock *sk)
 142 {
 143         tcp_connect_init(sk);
 144         tcp_finish_connect(sk, NULL);
 145
 146         return 0;
 147 }
 148
 149 /* This will initiate an outgoing connection. */
 150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 151 {
 152         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 153         struct inet_sock *inet = inet_sk(sk);
 154         struct tcp_sock *tp = tcp_sk(sk);
 155         __be16 orig_sport, orig_dport;
 156         __be32 daddr, nexthop;
 157         struct flowi4 *fl4;
 158         struct rtable *rt;
 159         int err;
 160         struct ip_options_rcu *inet_opt;
 161
 162         if (addr_len < sizeof(struct sockaddr_in))
 163                 return -EINVAL;
 164
 165         if (usin->sin_family != AF_INET)
 166                 return -EAFNOSUPPORT;
 167
 168         nexthop = daddr = usin->sin_addr.s_addr;
 169         inet_opt = rcu_dereference_protected(inet->inet_opt,
 170                                              sock_owned_by_user(sk));
 171         if (inet_opt && inet_opt->opt.srr) {
 172                 if (!daddr)
 173                         return -EINVAL;
 174                 nexthop = inet_opt->opt.faddr;
 175         }
 176
 177         orig_sport = inet->inet_sport;
 178         orig_dport = usin->sin_port;
 179         fl4 = &inet->cork.fl.u.ip4;
 180         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 181                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 182                               IPPROTO_TCP,
 183                               orig_sport, orig_dport, sk, true);
 184         if (IS_ERR(rt)) {
 185                 err = PTR_ERR(rt);
 186                 if (err == -ENETUNREACH)
 187                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 188                 return err;
 189         }
 190
 191         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 192                 ip_rt_put(rt);
 193                 return -ENETUNREACH;
 194         }
 195
 196         if (!inet_opt || !inet_opt->opt.srr)
 197                 daddr = fl4->daddr;
 198
 199         if (!inet->inet_saddr)
 200                 inet->inet_saddr = fl4->saddr;
 201         inet->inet_rcv_saddr = inet->inet_saddr;
 202
 203         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 204                 /* Reset inherited state */
 205                 tp->rx_opt.ts_recent       = 0;
 206                 tp->rx_opt.ts_recent_stamp = 0;
 207                 if (likely(!tp->repair))
 208                         tp->write_seq      = 0;
 209         }
 210
 211         if (tcp_death_row.sysctl_tw_recycle &&
 212             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 213                 tcp_fetch_timewait_stamp(sk, &rt->dst);
 214
 215         inet->inet_dport = usin->sin_port;
 216         inet->inet_daddr = daddr;
 217
 218         inet_csk(sk)->icsk_ext_hdr_len = 0;
 219         if (inet_opt)
 220                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 221
 222         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 223
 224         /* Socket identity is still unknown (sport may be zero).
 225          * However we set state to SYN-SENT and not releasing socket
 226          * lock select source port, enter ourselves into the hash tables and
 227          * complete initialization after this.
 228          */
 229         tcp_set_state(sk, TCP_SYN_SENT);
 230         err = inet_hash_connect(&tcp_death_row, sk);
 231         if (err)
 232                 goto failure;
 233
 234         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 235                                inet->inet_sport, inet->inet_dport, sk);
 236         if (IS_ERR(rt)) {
 237                 err = PTR_ERR(rt);
 238                 rt = NULL;
 239                 goto failure;
 240         }
 241         /* OK, now commit destination to socket.  */
 242         sk->sk_gso_type = SKB_GSO_TCPV4;
 243         sk_setup_caps(sk, &rt->dst);
 244
 245         if (!tp->write_seq && likely(!tp->repair))
 246                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 247                                                            inet->inet_daddr,
 248                                                            inet->inet_sport,
 249                                                            usin->sin_port);
 250
 251         inet->inet_id = tp->write_seq ^ jiffies;
 252
 253         if (likely(!tp->repair))
 254                 err = tcp_connect(sk);
 255         else
 256                 err = tcp_repair_connect(sk);
 257
 258         rt = NULL;
 259         if (err)
 260                 goto failure;
 261
 262         return 0;
 263
 264 failure:
 265         /*
 266          * This unhashes the socket and releases the local port,
 267          * if necessary.
 268          */
 269         tcp_set_state(sk, TCP_CLOSE);
 270         ip_rt_put(rt);
 271         sk->sk_route_caps = 0;
 272         inet->inet_dport = 0;
 273         return err;
 274 }
 275 EXPORT_SYMBOL(tcp_v4_connect);
 276
 277 /*
 278  * This routine does path mtu discovery as defined in RFC1191.
 279  */
 280 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 281 {
 282         struct dst_entry *dst;
 283         struct inet_sock *inet = inet_sk(sk);
 284
 285         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 286          * send out by Linux are always <576bytes so they should go through
 287          * unfragmented).
 288          */
 289         if (sk->sk_state == TCP_LISTEN)
 290                 return;
 291
 292         dst = inet_csk_update_pmtu(sk, mtu);
 293         if (!dst)
 294                 return;
 295
 296         /* Something is about to be wrong... Remember soft error
 297          * for the case, if this connection will not able to recover.
 298          */
 299         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 300                 sk->sk_err_soft = EMSGSIZE;
 301
 302         mtu = dst_mtu(dst);
 303
 304         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 305             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 306                 tcp_sync_mss(sk, mtu);
 307
 308                 /* Resend the TCP packet because it's
 309                  * clear that the old packet has been
 310                  * dropped. This is the new "fast" path mtu
 311                  * discovery.
 312                  */
 313                 tcp_simple_retransmit(sk);
 314         } /* else let the usual retransmit timer handle it */
 315 }
 316
 317 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 318 {
 319         struct dst_entry *dst = __sk_dst_check(sk, 0);
 320
 321         if (dst)
 322                 dst->ops->redirect(dst, sk, skb);
 323 }
 324
 325 /*
 326  * This routine is called by the ICMP module when it gets some
 327  * sort of error condition.  If err < 0 then the socket should
 328  * be closed and the error returned to the user.  If err > 0
 329  * it's just the icmp type << 8 | icmp code.  After adjustment
 330  * header points to the first 8 bytes of the tcp header.  We need
 331  * to find the appropriate port.
 332  *
 333  * The locking strategy used here is very "optimistic". When
 334  * someone else accesses the socket the ICMP is just dropped
 335  * and for some paths there is no check at all.
 336  * A more general error queue to queue errors for later handling
 337  * is probably better.
 338  *
 339  */
 340
 341 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 342 {
 343         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 344         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 345         struct inet_connection_sock *icsk;
 346         struct tcp_sock *tp;
 347         struct inet_sock *inet;
 348         const int type = icmp_hdr(icmp_skb)->type;
 349         const int code = icmp_hdr(icmp_skb)->code;
 350         struct sock *sk;
 351         struct sk_buff *skb;
 352         __u32 seq;
 353         __u32 remaining;
 354         int err;
 355         struct net *net = dev_net(icmp_skb->dev);
 356
 357         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 358                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 359                 return;
 360         }
 361
 362         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 363                         iph->saddr, th->source, inet_iif(icmp_skb));
 364         if (!sk) {
 365                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 366                 return;
 367         }
 368         if (sk->sk_state == TCP_TIME_WAIT) {
 369                 inet_twsk_put(inet_twsk(sk));
 370                 return;
 371         }
 372
 373         bh_lock_sock(sk);
 374         /* If too many ICMPs get dropped on busy
 375          * servers this needs to be solved differently.
 376          */
 377         if (sock_owned_by_user(sk))
 378                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 379
 380         if (sk->sk_state == TCP_CLOSE)
 381                 goto out;
 382
 383         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 384                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 385                 goto out;
 386         }
 387
 388         icsk = inet_csk(sk);
 389         tp = tcp_sk(sk);
 390         seq = ntohl(th->seq);
 391         if (sk->sk_state != TCP_LISTEN &&
 392             !between(seq, tp->snd_una, tp->snd_nxt)) {
 393                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 394                 goto out;
 395         }
 396
 397         switch (type) {
 398         case ICMP_REDIRECT:
 399                 do_redirect(icmp_skb, sk);
 400                 goto out;
 401         case ICMP_SOURCE_QUENCH:
 402                 /* Just silently ignore these. */
 403                 goto out;
 404         case ICMP_PARAMETERPROB:
 405                 err = EPROTO;
 406                 break;
 407         case ICMP_DEST_UNREACH:
 408                 if (code > NR_ICMP_UNREACH)
 409                         goto out;
 410
 411                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 412                         if (!sock_owned_by_user(sk))
 413                                 do_pmtu_discovery(sk, iph, info);
 414                         goto out;
 415                 }
 416
 417                 err = icmp_err_convert[code].errno;
 418                 /* check if icmp_skb allows revert of backoff
 419                  * (see draft-zimmermann-tcp-lcd) */
 420                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 421                         break;
 422                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 423                     !icsk->icsk_backoff)
 424                         break;
 425
 426                 if (sock_owned_by_user(sk))
 427                         break;
 428
 429                 icsk->icsk_backoff--;
 430                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
 431                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 432                 tcp_bound_rto(sk);
 433
 434                 skb = tcp_write_queue_head(sk);
 435                 BUG_ON(!skb);
 436
 437                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 438                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 439
 440                 if (remaining) {
 441                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 442                                                   remaining, TCP_RTO_MAX);
 443                 } else {
 444                         /* RTO revert clocked out retransmission.
 445                          * Will retransmit now */
 446                         tcp_retransmit_timer(sk);
 447                 }
 448
 449                 break;
 450         case ICMP_TIME_EXCEEDED:
 451                 err = EHOSTUNREACH;
 452                 break;
 453         default:
 454                 goto out;
 455         }
 456
 457         switch (sk->sk_state) {
 458                 struct request_sock *req, **prev;
 459         case TCP_LISTEN:
 460                 if (sock_owned_by_user(sk))
 461                         goto out;
 462
 463                 req = inet_csk_search_req(sk, &prev, th->dest,
 464                                           iph->daddr, iph->saddr);
 465                 if (!req)
 466                         goto out;
 467
 468                 /* ICMPs are not backlogged, hence we cannot get
 469                    an established socket here.
 470                  */
 471                 WARN_ON(req->sk);
 472
 473                 if (seq != tcp_rsk(req)->snt_isn) {
 474                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 475                         goto out;
 476                 }
 477
 478                 /*
 479                  * Still in SYN_RECV, just remove it silently.
 480                  * There is no good way to pass the error to the newly
 481                  * created socket, and POSIX does not want network
 482                  * errors returned from accept().
 483                  */
 484                 inet_csk_reqsk_queue_drop(sk, req, prev);
 485                 goto out;
 486
 487         case TCP_SYN_SENT:
 488         case TCP_SYN_RECV:  /* Cannot happen.
 489                                It can f.e. if SYNs crossed.
 490                              */
 491                 if (!sock_owned_by_user(sk)) {
 492                         sk->sk_err = err;
 493
 494                         sk->sk_error_report(sk);
 495
 496                         tcp_done(sk);
 497                 } else {
 498                         sk->sk_err_soft = err;
 499                 }
 500                 goto out;
 501         }
 502
 503         /* If we've already connected we will keep trying
 504          * until we time out, or the user gives up.
 505          *
 506          * rfc1122 4.2.3.9 allows to consider as hard errors
 507          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 508          * but it is obsoleted by pmtu discovery).
 509          *
 510          * Note, that in modern internet, where routing is unreliable
 511          * and in each dark corner broken firewalls sit, sending random
 512          * errors ordered by their masters even this two messages finally lose
 513          * their original sense (even Linux sends invalid PORT_UNREACHs)
 514          *
 515          * Now we are in compliance with RFCs.
 516          *                                                      --ANK (980905)
 517          */
 518
 519         inet = inet_sk(sk);
 520         if (!sock_owned_by_user(sk) && inet->recverr) {
 521                 sk->sk_err = err;
 522                 sk->sk_error_report(sk);
 523         } else  { /* Only an error on timeout */
 524                 sk->sk_err_soft = err;
 525         }
 526
 527 out:
 528         bh_unlock_sock(sk);
 529         sock_put(sk);
 530 }
 531
 532 static void __tcp_v4_send_check(struct sk_buff *skb,
 533                                 __be32 saddr, __be32 daddr)
 534 {
 535         struct tcphdr *th = tcp_hdr(skb);
 536
 537         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 538                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 539                 skb->csum_start = skb_transport_header(skb) - skb->head;
 540                 skb->csum_offset = offsetof(struct tcphdr, check);
 541         } else {
 542                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 543                                          csum_partial(th,
 544                                                       th->doff << 2,
 545                                                       skb->csum));
 546         }
 547 }
 548
 549 /* This routine computes an IPv4 TCP checksum. */
 550 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 551 {
 552         const struct inet_sock *inet = inet_sk(sk);
 553
 554         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 555 }
 556 EXPORT_SYMBOL(tcp_v4_send_check);
 557
 558 int tcp_v4_gso_send_check(struct sk_buff *skb)
 559 {
 560         const struct iphdr *iph;
 561         struct tcphdr *th;
 562
 563         if (!pskb_may_pull(skb, sizeof(*th)))
 564                 return -EINVAL;
 565
 566         iph = ip_hdr(skb);
 567         th = tcp_hdr(skb);
 568
 569         th->check = 0;
 570         skb->ip_summed = CHECKSUM_PARTIAL;
 571         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 572         return 0;
 573 }
 574
 575 /*
 576  *      This routine will send an RST to the other tcp.
 577  *
 578  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 579  *                    for reset.
 580  *      Answer: if a packet caused RST, it is not for a socket
 581  *              existing in our system, if it is matched to a socket,
 582  *              it is just duplicate segment or bug in other side's TCP.
 583  *              So that we build reply only basing on parameters
 584  *              arrived with segment.
 585  *      Exception: precedence violation. We do not implement it in any case.
 586  */
 587
 588 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 589 {
 590         const struct tcphdr *th = tcp_hdr(skb);
 591         struct {
 592                 struct tcphdr th;
 593 #ifdef CONFIG_TCP_MD5SIG
 594                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 595 #endif
 596         } rep;
 597         struct ip_reply_arg arg;
 598 #ifdef CONFIG_TCP_MD5SIG
 599         struct tcp_md5sig_key *key;
 600         const __u8 *hash_location = NULL;
 601         unsigned char newhash[16];
 602         int genhash;
 603         struct sock *sk1 = NULL;
 604 #endif
 605         struct net *net;
 606
 607         /* Never send a reset in response to a reset. */
 608         if (th->rst)
 609                 return;
 610
 611         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 612                 return;
 613
 614         /* Swap the send and the receive. */
 615         memset(&rep, 0, sizeof(rep));
 616         rep.th.dest   = th->source;
 617         rep.th.source = th->dest;
 618         rep.th.doff   = sizeof(struct tcphdr) / 4;
 619         rep.th.rst    = 1;
 620
 621         if (th->ack) {
 622                 rep.th.seq = th->ack_seq;
 623         } else {
 624                 rep.th.ack = 1;
 625                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 626                                        skb->len - (th->doff << 2));
 627         }
 628
 629         memset(&arg, 0, sizeof(arg));
 630         arg.iov[0].iov_base = (unsigned char *)&rep;
 631         arg.iov[0].iov_len  = sizeof(rep.th);
 632
 633 #ifdef CONFIG_TCP_MD5SIG
 634         hash_location = tcp_parse_md5sig_option(th);
 635         if (!sk && hash_location) {
 636                 /*
 637                  * active side is lost. Try to find listening socket through
 638                  * source port, and then find md5 key through listening socket.
 639                  * we are not loose security here:
 640                  * Incoming packet is checked with md5 hash with finding key,
 641                  * no RST generated if md5 hash doesn't match.
 642                  */
 643                 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
 644                                              &tcp_hashinfo, ip_hdr(skb)->daddr,
 645                                              ntohs(th->source), inet_iif(skb));
 646                 /* don't send rst if it can't find key */
 647                 if (!sk1)
 648                         return;
 649                 rcu_read_lock();
 650                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 651                                         &ip_hdr(skb)->saddr, AF_INET);
 652                 if (!key)
 653                         goto release_sk1;
 654
 655                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
 656                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 657                         goto release_sk1;
 658         } else {
 659                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 660                                              &ip_hdr(skb)->saddr,
 661                                              AF_INET) : NULL;
 662         }
 663
 664         if (key) {
 665                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 666                                    (TCPOPT_NOP << 16) |
 667                                    (TCPOPT_MD5SIG << 8) |
 668                                    TCPOLEN_MD5SIG);
 669                 /* Update length and the length the header thinks exists */
 670                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 671                 rep.th.doff = arg.iov[0].iov_len / 4;
 672
 673                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 674                                      key, ip_hdr(skb)->saddr,
 675                                      ip_hdr(skb)->daddr, &rep.th);
 676         }
 677 #endif
 678         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 679                                       ip_hdr(skb)->saddr, /* XXX */
 680                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 681         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 682         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 683         /* When socket is gone, all binding information is lost.
 684          * routing might fail in this case. using iif for oif to
 685          * make sure we can deliver it
 686          */
 687         arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
 688
 689         net = dev_net(skb_dst(skb)->dev);
 690         arg.tos = ip_hdr(skb)->tos;
 691         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 692                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 693
 694         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 695         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 696
 697 #ifdef CONFIG_TCP_MD5SIG
 698 release_sk1:
 699         if (sk1) {
 700                 rcu_read_unlock();
 701                 sock_put(sk1);
 702         }
 703 #endif
 704 }
 705
 706 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 707    outside socket context is ugly, certainly. What can I do?
 708  */
 709
 710 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 711                             u32 win, u32 ts, int oif,
 712                             struct tcp_md5sig_key *key,
 713                             int reply_flags, u8 tos)
 714 {
 715         const struct tcphdr *th = tcp_hdr(skb);
 716         struct {
 717                 struct tcphdr th;
 718                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 719 #ifdef CONFIG_TCP_MD5SIG
 720                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 721 #endif
 722                         ];
 723         } rep;
 724         struct ip_reply_arg arg;
 725         struct net *net = dev_net(skb_dst(skb)->dev);
 726
 727         memset(&rep.th, 0, sizeof(struct tcphdr));
 728         memset(&arg, 0, sizeof(arg));
 729
 730         arg.iov[0].iov_base = (unsigned char *)&rep;
 731         arg.iov[0].iov_len  = sizeof(rep.th);
 732         if (ts) {
 733                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 734                                    (TCPOPT_TIMESTAMP << 8) |
 735                                    TCPOLEN_TIMESTAMP);
 736                 rep.opt[1] = htonl(tcp_time_stamp);
 737                 rep.opt[2] = htonl(ts);
 738                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 739         }
 740
 741         /* Swap the send and the receive. */
 742         rep.th.dest    = th->source;
 743         rep.th.source  = th->dest;
 744         rep.th.doff    = arg.iov[0].iov_len / 4;
 745         rep.th.seq     = htonl(seq);
 746         rep.th.ack_seq = htonl(ack);
 747         rep.th.ack     = 1;
 748         rep.th.window  = htons(win);
 749
 750 #ifdef CONFIG_TCP_MD5SIG
 751         if (key) {
 752                 int offset = (ts) ? 3 : 0;
 753
 754                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 755                                           (TCPOPT_NOP << 16) |
 756                                           (TCPOPT_MD5SIG << 8) |
 757                                           TCPOLEN_MD5SIG);
 758                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 759                 rep.th.doff = arg.iov[0].iov_len/4;
 760
 761                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 762                                     key, ip_hdr(skb)->saddr,
 763                                     ip_hdr(skb)->daddr, &rep.th);
 764         }
 765 #endif
 766         arg.flags = reply_flags;
 767         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 768                                       ip_hdr(skb)->saddr, /* XXX */
 769                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 770         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 771         if (oif)
 772                 arg.bound_dev_if = oif;
 773         arg.tos = tos;
 774         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 775                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 776
 777         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 778 }
 779
 780 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 781 {
 782         struct inet_timewait_sock *tw = inet_twsk(sk);
 783         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 784
 785         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 786                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 787                         tcptw->tw_ts_recent,
 788                         tw->tw_bound_dev_if,
 789                         tcp_twsk_md5_key(tcptw),
 790                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 791                         tw->tw_tos
 792                         );
 793
 794         inet_twsk_put(tw);
 795 }
 796
 797 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 798                                   struct request_sock *req)
 799 {
 800         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 801                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 802                         req->ts_recent,
 803                         0,
 804                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 805                                           AF_INET),
 806                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 807                         ip_hdr(skb)->tos);
 808 }
 809
 810 /*
 811  *      Send a SYN-ACK after having received a SYN.
 812  *      This still operates on a request_sock only, not on a big
 813  *      socket.
 814  */
 815 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 816                               struct request_sock *req,
 817                               struct request_values *rvp,
 818                               u16 queue_mapping,
 819                               bool nocache)
 820 {
 821         const struct inet_request_sock *ireq = inet_rsk(req);
 822         struct flowi4 fl4;
 823         int err = -1;
 824         struct sk_buff * skb;
 825
 826         /* First, grab a route. */
 827         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
 828                 return -1;
 829
 830         skb = tcp_make_synack(sk, dst, req, rvp);
 831
 832         if (skb) {
 833                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 834
 835                 skb_set_queue_mapping(skb, queue_mapping);
 836                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 837                                             ireq->rmt_addr,
 838                                             ireq->opt);
 839                 err = net_xmit_eval(err);
 840         }
 841
 842         return err;
 843 }
 844
 845 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 846                               struct request_values *rvp)
 847 {
 848         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 849         return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
 850 }
 851
 852 /*
 853  *      IPv4 request_sock destructor.
 854  */
 855 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 856 {
 857         kfree(inet_rsk(req)->opt);
 858 }
 859
 860 /*
 861  * Return true if a syncookie should be sent
 862  */
 863 bool tcp_syn_flood_action(struct sock *sk,
 864                          const struct sk_buff *skb,
 865                          const char *proto)
 866 {
 867         const char *msg = "Dropping request";
 868         bool want_cookie = false;
 869         struct listen_sock *lopt;
 870
 871
 872
 873 #ifdef CONFIG_SYN_COOKIES
 874         if (sysctl_tcp_syncookies) {
 875                 msg = "Sending cookies";
 876                 want_cookie = true;
 877                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 878         } else
 879 #endif
 880                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 881
 882         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 883         if (!lopt->synflood_warned) {
 884                 lopt->synflood_warned = 1;
 885                 pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 886                         proto, ntohs(tcp_hdr(skb)->dest), msg);
 887         }
 888         return want_cookie;
 889 }
 890 EXPORT_SYMBOL(tcp_syn_flood_action);
 891
 892 /*
 893  * Save and compile IPv4 options into the request_sock if needed.
 894  */
 895 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 896                                                   struct sk_buff *skb)
 897 {
 898         const struct ip_options *opt = &(IPCB(skb)->opt);
 899         struct ip_options_rcu *dopt = NULL;
 900
 901         if (opt && opt->optlen) {
 902                 int opt_size = sizeof(*dopt) + opt->optlen;
 903
 904                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 905                 if (dopt) {
 906                         if (ip_options_echo(&dopt->opt, skb)) {
 907                                 kfree(dopt);
 908                                 dopt = NULL;
 909                         }
 910                 }
 911         }
 912         return dopt;
 913 }
 914
 915 #ifdef CONFIG_TCP_MD5SIG
 916 /*
 917  * RFC2385 MD5 checksumming requires a mapping of
 918  * IP address->MD5 Key.
 919  * We need to maintain these in the sk structure.
 920  */
 921
 922 /* Find the Key structure for an address.  */
 923 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 924                                          const union tcp_md5_addr *addr,
 925                                          int family)
 926 {
 927         struct tcp_sock *tp = tcp_sk(sk);
 928         struct tcp_md5sig_key *key;
 929         struct hlist_node *pos;
 930         unsigned int size = sizeof(struct in_addr);
 931         struct tcp_md5sig_info *md5sig;
 932
 933         /* caller either holds rcu_read_lock() or socket lock */
 934         md5sig = rcu_dereference_check(tp->md5sig_info,
 935                                        sock_owned_by_user(sk) ||
 936                                        lockdep_is_held(&sk->sk_lock.slock));
 937         if (!md5sig)
 938                 return NULL;
 939 #if IS_ENABLED(CONFIG_IPV6)
 940         if (family == AF_INET6)
 941                 size = sizeof(struct in6_addr);
 942 #endif
 943         hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
 944                 if (key->family != family)
 945                         continue;
 946                 if (!memcmp(&key->addr, addr, size))
 947                         return key;
 948         }
 949         return NULL;
 950 }
 951 EXPORT_SYMBOL(tcp_md5_do_lookup);
 952
 953 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 954                                          struct sock *addr_sk)
 955 {
 956         union tcp_md5_addr *addr;
 957
 958         addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
 959         return tcp_md5_do_lookup(sk, addr, AF_INET);
 960 }
 961 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 962
 963 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 964                                                       struct request_sock *req)
 965 {
 966         union tcp_md5_addr *addr;
 967
 968         addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
 969         return tcp_md5_do_lookup(sk, addr, AF_INET);
 970 }
 971
 972 /* This can be called on a newly created socket, from other files */
 973 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 974                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 975 {
 976         /* Add Key to the list */
 977         struct tcp_md5sig_key *key;
 978         struct tcp_sock *tp = tcp_sk(sk);
 979         struct tcp_md5sig_info *md5sig;
 980
 981         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
 982         if (key) {
 983                 /* Pre-existing entry - just update that one. */
 984                 memcpy(key->key, newkey, newkeylen);
 985                 key->keylen = newkeylen;
 986                 return 0;
 987         }
 988
 989         md5sig = rcu_dereference_protected(tp->md5sig_info,
 990                                            sock_owned_by_user(sk));
 991         if (!md5sig) {
 992                 md5sig = kmalloc(sizeof(*md5sig), gfp);
 993                 if (!md5sig)
 994                         return -ENOMEM;
 995
 996                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 997                 INIT_HLIST_HEAD(&md5sig->head);
 998                 rcu_assign_pointer(tp->md5sig_info, md5sig);
 999         }
1000
1001         key = sock_kmalloc(sk, sizeof(*key), gfp);
1002         if (!key)
1003                 return -ENOMEM;
1004         if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1005                 sock_kfree_s(sk, key, sizeof(*key));
1006                 return -ENOMEM;
1007         }
1008
1009         memcpy(key->key, newkey, newkeylen);
1010         key->keylen = newkeylen;
1011         key->family = family;
1012         memcpy(&key->addr, addr,
1013                (family == AF_INET6) ? sizeof(struct in6_addr) :
1014                                       sizeof(struct in_addr));
1015         hlist_add_head_rcu(&key->node, &md5sig->head);
1016         return 0;
1017 }
1018 EXPORT_SYMBOL(tcp_md5_do_add);
1019
1020 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1021 {
1022         struct tcp_sock *tp = tcp_sk(sk);
1023         struct tcp_md5sig_key *key;
1024         struct tcp_md5sig_info *md5sig;
1025
1026         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1027         if (!key)
1028                 return -ENOENT;
1029         hlist_del_rcu(&key->node);
1030         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1031         kfree_rcu(key, rcu);
1032         md5sig = rcu_dereference_protected(tp->md5sig_info,
1033                                            sock_owned_by_user(sk));
1034         if (hlist_empty(&md5sig->head))
1035                 tcp_free_md5sig_pool();
1036         return 0;
1037 }
1038 EXPORT_SYMBOL(tcp_md5_do_del);
1039
1040 void tcp_clear_md5_list(struct sock *sk)
1041 {
1042         struct tcp_sock *tp = tcp_sk(sk);
1043         struct tcp_md5sig_key *key;
1044         struct hlist_node *pos, *n;
1045         struct tcp_md5sig_info *md5sig;
1046
1047         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1048
1049         if (!hlist_empty(&md5sig->head))
1050                 tcp_free_md5sig_pool();
1051         hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1052                 hlist_del_rcu(&key->node);
1053                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1054                 kfree_rcu(key, rcu);
1055         }
1056 }
1057
1058 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1059                                  int optlen)
1060 {
1061         struct tcp_md5sig cmd;
1062         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1063
1064         if (optlen < sizeof(cmd))
1065                 return -EINVAL;
1066
1067         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1068                 return -EFAULT;
1069
1070         if (sin->sin_family != AF_INET)
1071                 return -EINVAL;
1072
1073         if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1074                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1075                                       AF_INET);
1076
1077         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1078                 return -EINVAL;
1079
1080         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1081                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1082                               GFP_KERNEL);
1083 }
1084
1085 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1086                                         __be32 daddr, __be32 saddr, int nbytes)
1087 {
1088         struct tcp4_pseudohdr *bp;
1089         struct scatterlist sg;
1090
1091         bp = &hp->md5_blk.ip4;
1092
1093         /*
1094          * 1. the TCP pseudo-header (in the order: source IP address,
1095          * destination IP address, zero-padded protocol number, and
1096          * segment length)
1097          */
1098         bp->saddr = saddr;
1099         bp->daddr = daddr;
1100         bp->pad = 0;
1101         bp->protocol = IPPROTO_TCP;
1102         bp->len = cpu_to_be16(nbytes);
1103
1104         sg_init_one(&sg, bp, sizeof(*bp));
1105         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1106 }
1107
1108 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1109                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1110 {
1111         struct tcp_md5sig_pool *hp;
1112         struct hash_desc *desc;
1113
1114         hp = tcp_get_md5sig_pool();
1115         if (!hp)
1116                 goto clear_hash_noput;
1117         desc = &hp->md5_desc;
1118
1119         if (crypto_hash_init(desc))
1120                 goto clear_hash;
1121         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1122                 goto clear_hash;
1123         if (tcp_md5_hash_header(hp, th))
1124                 goto clear_hash;
1125         if (tcp_md5_hash_key(hp, key))
1126                 goto clear_hash;
1127         if (crypto_hash_final(desc, md5_hash))
1128                 goto clear_hash;
1129
1130         tcp_put_md5sig_pool();
1131         return 0;
1132
1133 clear_hash:
1134         tcp_put_md5sig_pool();
1135 clear_hash_noput:
1136         memset(md5_hash, 0, 16);
1137         return 1;
1138 }
1139
1140 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1141                         const struct sock *sk, const struct request_sock *req,
1142                         const struct sk_buff *skb)
1143 {
1144         struct tcp_md5sig_pool *hp;
1145         struct hash_desc *desc;
1146         const struct tcphdr *th = tcp_hdr(skb);
1147         __be32 saddr, daddr;
1148
1149         if (sk) {
1150                 saddr = inet_sk(sk)->inet_saddr;
1151                 daddr = inet_sk(sk)->inet_daddr;
1152         } else if (req) {
1153                 saddr = inet_rsk(req)->loc_addr;
1154                 daddr = inet_rsk(req)->rmt_addr;
1155         } else {
1156                 const struct iphdr *iph = ip_hdr(skb);
1157                 saddr = iph->saddr;
1158                 daddr = iph->daddr;
1159         }
1160
1161         hp = tcp_get_md5sig_pool();
1162         if (!hp)
1163                 goto clear_hash_noput;
1164         desc = &hp->md5_desc;
1165
1166         if (crypto_hash_init(desc))
1167                 goto clear_hash;
1168
1169         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1170                 goto clear_hash;
1171         if (tcp_md5_hash_header(hp, th))
1172                 goto clear_hash;
1173         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1174                 goto clear_hash;
1175         if (tcp_md5_hash_key(hp, key))
1176                 goto clear_hash;
1177         if (crypto_hash_final(desc, md5_hash))
1178                 goto clear_hash;
1179
1180         tcp_put_md5sig_pool();
1181         return 0;
1182
1183 clear_hash:
1184         tcp_put_md5sig_pool();
1185 clear_hash_noput:
1186         memset(md5_hash, 0, 16);
1187         return 1;
1188 }
1189 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1190
1191 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1192 {
1193         /*
1194          * This gets called for each TCP segment that arrives
1195          * so we want to be efficient.
1196          * We have 3 drop cases:
1197          * o No MD5 hash and one expected.
1198          * o MD5 hash and we're not expecting one.
1199          * o MD5 hash and its wrong.
1200          */
1201         const __u8 *hash_location = NULL;
1202         struct tcp_md5sig_key *hash_expected;
1203         const struct iphdr *iph = ip_hdr(skb);
1204         const struct tcphdr *th = tcp_hdr(skb);
1205         int genhash;
1206         unsigned char newhash[16];
1207
1208         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1209                                           AF_INET);
1210         hash_location = tcp_parse_md5sig_option(th);
1211
1212         /* We've parsed the options - do we have a hash? */
1213         if (!hash_expected && !hash_location)
1214                 return false;
1215
1216         if (hash_expected && !hash_location) {
1217                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1218                 return true;
1219         }
1220
1221         if (!hash_expected && hash_location) {
1222                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1223                 return true;
1224         }
1225
1226         /* Okay, so this is hash_expected and hash_location -
1227          * so we need to calculate the checksum.
1228          */
1229         genhash = tcp_v4_md5_hash_skb(newhash,
1230                                       hash_expected,
1231                                       NULL, NULL, skb);
1232
1233         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1234                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1235                                      &iph->saddr, ntohs(th->source),
1236                                      &iph->daddr, ntohs(th->dest),
1237                                      genhash ? " tcp_v4_calc_md5_hash failed"
1238                                      : "");
1239                 return true;
1240         }
1241         return false;
1242 }
1243
1244 #endif
1245
1246 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1247         .family         =       PF_INET,
1248         .obj_size       =       sizeof(struct tcp_request_sock),
1249         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1250         .send_ack       =       tcp_v4_reqsk_send_ack,
1251         .destructor     =       tcp_v4_reqsk_destructor,
1252         .send_reset     =       tcp_v4_send_reset,
1253         .syn_ack_timeout =      tcp_syn_ack_timeout,
1254 };
1255
1256 #ifdef CONFIG_TCP_MD5SIG
1257 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1258         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1259         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1260 };
1261 #endif
1262
1263 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1264 {
1265         struct tcp_extend_values tmp_ext;
1266         struct tcp_options_received tmp_opt;
1267         const u8 *hash_location;
1268         struct request_sock *req;
1269         struct inet_request_sock *ireq;
1270         struct tcp_sock *tp = tcp_sk(sk);
1271         struct dst_entry *dst = NULL;
1272         __be32 saddr = ip_hdr(skb)->saddr;
1273         __be32 daddr = ip_hdr(skb)->daddr;
1274         __u32 isn = TCP_SKB_CB(skb)->when;
1275         bool want_cookie = false;
1276
1277         /* Never answer to SYNs send to broadcast or multicast */
1278         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1279                 goto drop;
1280
1281         /* TW buckets are converted to open requests without
1282          * limitations, they conserve resources and peer is
1283          * evidently real one.
1284          */
1285         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1286                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1287                 if (!want_cookie)
1288                         goto drop;
1289         }
1290
1291         /* Accept backlog is full. If we have already queued enough
1292          * of warm entries in syn queue, drop request. It is better than
1293          * clogging syn queue with openreqs with exponentially increasing
1294          * timeout.
1295          */
1296         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1297                 goto drop;
1298
1299         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1300         if (!req)
1301                 goto drop;
1302
1303 #ifdef CONFIG_TCP_MD5SIG
1304         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1305 #endif
1306
1307         tcp_clear_options(&tmp_opt);
1308         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1309         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1310         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1311
1312         if (tmp_opt.cookie_plus > 0 &&
1313             tmp_opt.saw_tstamp &&
1314             !tp->rx_opt.cookie_out_never &&
1315             (sysctl_tcp_cookie_size > 0 ||
1316              (tp->cookie_values != NULL &&
1317               tp->cookie_values->cookie_desired > 0))) {
1318                 u8 *c;
1319                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1320                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1321
1322                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1323                         goto drop_and_release;
1324
1325                 /* Secret recipe starts with IP addresses */
1326                 *mess++ ^= (__force u32)daddr;
1327                 *mess++ ^= (__force u32)saddr;
1328
1329                 /* plus variable length Initiator Cookie */
1330                 c = (u8 *)mess;
1331                 while (l-- > 0)
1332                         *c++ ^= *hash_location++;
1333
1334                 want_cookie = false;    /* not our kind of cookie */
1335                 tmp_ext.cookie_out_never = 0; /* false */
1336                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1337         } else if (!tp->rx_opt.cookie_in_always) {
1338                 /* redundant indications, but ensure initialization. */
1339                 tmp_ext.cookie_out_never = 1; /* true */
1340                 tmp_ext.cookie_plus = 0;
1341         } else {
1342                 goto drop_and_release;
1343         }
1344         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1345
1346         if (want_cookie && !tmp_opt.saw_tstamp)
1347                 tcp_clear_options(&tmp_opt);
1348
1349         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1350         tcp_openreq_init(req, &tmp_opt, skb);
1351
1352         ireq = inet_rsk(req);
1353         ireq->loc_addr = daddr;
1354         ireq->rmt_addr = saddr;
1355         ireq->no_srccheck = inet_sk(sk)->transparent;
1356         ireq->opt = tcp_v4_save_options(sk, skb);
1357
1358         if (security_inet_conn_request(sk, skb, req))
1359                 goto drop_and_free;
1360
1361         if (!want_cookie || tmp_opt.tstamp_ok)
1362                 TCP_ECN_create_request(req, skb);
1363
1364         if (want_cookie) {
1365                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1366                 req->cookie_ts = tmp_opt.tstamp_ok;
1367         } else if (!isn) {
1368                 struct flowi4 fl4;
1369
1370                 /* VJ's idea. We save last timestamp seen
1371                  * from the destination in peer table, when entering
1372                  * state TIME-WAIT, and check against it before
1373                  * accepting new connection request.
1374                  *
1375                  * If "isn" is not zero, this request hit alive
1376                  * timewait bucket, so that all the necessary checks
1377                  * are made in the function processing timewait state.
1378                  */
1379                 if (tmp_opt.saw_tstamp &&
1380                     tcp_death_row.sysctl_tw_recycle &&
1381                     (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
1382                     fl4.daddr == saddr) {
1383                         if (!tcp_peer_is_proven(req, dst, true)) {
1384                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1385                                 goto drop_and_release;
1386                         }
1387                 }
1388                 /* Kill the following clause, if you dislike this way. */
1389                 else if (!sysctl_tcp_syncookies &&
1390                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1391                           (sysctl_max_syn_backlog >> 2)) &&
1392                          !tcp_peer_is_proven(req, dst, false)) {
1393                         /* Without syncookies last quarter of
1394                          * backlog is filled with destinations,
1395                          * proven to be alive.
1396                          * It means that we continue to communicate
1397                          * to destinations, already remembered
1398                          * to the moment of synflood.
1399                          */
1400                         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1401                                        &saddr, ntohs(tcp_hdr(skb)->source));
1402                         goto drop_and_release;
1403                 }
1404
1405                 isn = tcp_v4_init_sequence(skb);
1406         }
1407         tcp_rsk(req)->snt_isn = isn;
1408         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1409
1410         if (tcp_v4_send_synack(sk, dst, req,
1411                                (struct request_values *)&tmp_ext,
1412                                skb_get_queue_mapping(skb),
1413                                want_cookie) ||
1414             want_cookie)
1415                 goto drop_and_free;
1416
1417         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1418         return 0;
1419
1420 drop_and_release:
1421         dst_release(dst);
1422 drop_and_free:
1423         reqsk_free(req);
1424 drop:
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(tcp_v4_conn_request);
1428
1429
1430 /*
1431  * The three way handshake has completed - we got a valid synack -
1432  * now create the new socket.
1433  */
1434 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1435                                   struct request_sock *req,
1436                                   struct dst_entry *dst)
1437 {
1438         struct inet_request_sock *ireq;
1439         struct inet_sock *newinet;
1440         struct tcp_sock *newtp;
1441         struct sock *newsk;
1442 #ifdef CONFIG_TCP_MD5SIG
1443         struct tcp_md5sig_key *key;
1444 #endif
1445         struct ip_options_rcu *inet_opt;
1446
1447         if (sk_acceptq_is_full(sk))
1448                 goto exit_overflow;
1449
1450         newsk = tcp_create_openreq_child(sk, req, skb);
1451         if (!newsk)
1452                 goto exit_nonewsk;
1453
1454         newsk->sk_gso_type = SKB_GSO_TCPV4;
1455
1456         newtp                 = tcp_sk(newsk);
1457         newinet               = inet_sk(newsk);
1458         ireq                  = inet_rsk(req);
1459         newinet->inet_daddr   = ireq->rmt_addr;
1460         newinet->inet_rcv_saddr = ireq->loc_addr;
1461         newinet->inet_saddr           = ireq->loc_addr;
1462         inet_opt              = ireq->opt;
1463         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1464         ireq->opt             = NULL;
1465         newinet->mc_index     = inet_iif(skb);
1466         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1467         newinet->rcv_tos      = ip_hdr(skb)->tos;
1468         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1469         if (inet_opt)
1470                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1471         newinet->inet_id = newtp->write_seq ^ jiffies;
1472
1473         if (!dst) {
1474                 dst = inet_csk_route_child_sock(sk, newsk, req);
1475                 if (!dst)
1476                         goto put_and_exit;
1477         } else {
1478                 /* syncookie case : see end of cookie_v4_check() */
1479         }
1480         sk_setup_caps(newsk, dst);
1481
1482         tcp_mtup_init(newsk);
1483         tcp_sync_mss(newsk, dst_mtu(dst));
1484         newtp->advmss = dst_metric_advmss(dst);
1485         if (tcp_sk(sk)->rx_opt.user_mss &&
1486             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1487                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1488
1489         tcp_initialize_rcv_mss(newsk);
1490         if (tcp_rsk(req)->snt_synack)
1491                 tcp_valid_rtt_meas(newsk,
1492                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1493         newtp->total_retrans = req->retrans;
1494
1495 #ifdef CONFIG_TCP_MD5SIG
1496         /* Copy over the MD5 key from the original socket */
1497         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1498                                 AF_INET);
1499         if (key != NULL) {
1500                 /*
1501                  * We're using one, so create a matching key
1502                  * on the newsk structure. If we fail to get
1503                  * memory, then we end up not copying the key
1504                  * across. Shucks.
1505                  */
1506                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1507                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1508                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1509         }
1510 #endif
1511
1512         if (__inet_inherit_port(sk, newsk) < 0)
1513                 goto put_and_exit;
1514         __inet_hash_nolisten(newsk, NULL);
1515
1516         return newsk;
1517
1518 exit_overflow:
1519         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1520 exit_nonewsk:
1521         dst_release(dst);
1522 exit:
1523         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1524         return NULL;
1525 put_and_exit:
1526         tcp_clear_xmit_timers(newsk);
1527         tcp_cleanup_congestion_control(newsk);
1528         bh_unlock_sock(newsk);
1529         sock_put(newsk);
1530         goto exit;
1531 }
1532 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1533
1534 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1535 {
1536         struct tcphdr *th = tcp_hdr(skb);
1537         const struct iphdr *iph = ip_hdr(skb);
1538         struct sock *nsk;
1539         struct request_sock **prev;
1540         /* Find possible connection requests. */
1541         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1542                                                        iph->saddr, iph->daddr);
1543         if (req)
1544                 return tcp_check_req(sk, skb, req, prev);
1545
1546         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1547                         th->source, iph->daddr, th->dest, inet_iif(skb));
1548
1549         if (nsk) {
1550                 if (nsk->sk_state != TCP_TIME_WAIT) {
1551                         bh_lock_sock(nsk);
1552                         return nsk;
1553                 }
1554                 inet_twsk_put(inet_twsk(nsk));
1555                 return NULL;
1556         }
1557
1558 #ifdef CONFIG_SYN_COOKIES
1559         if (!th->syn)
1560                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1561 #endif
1562         return sk;
1563 }
1564
1565 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1566 {
1567         const struct iphdr *iph = ip_hdr(skb);
1568
1569         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1570                 if (!tcp_v4_check(skb->len, iph->saddr,
1571                                   iph->daddr, skb->csum)) {
1572                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1573                         return 0;
1574                 }
1575         }
1576
1577         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1578                                        skb->len, IPPROTO_TCP, 0);
1579
1580         if (skb->len <= 76) {
1581                 return __skb_checksum_complete(skb);
1582         }
1583         return 0;
1584 }
1585
1586
 1587 /* The socket must have its spinlock held when we get
1588  * here.
1589  *
1590  * We have a potential double-lock case here, so even when
1591  * doing backlog processing we use the BH locking scheme.
1592  * This is because we cannot sleep with the original spinlock
1593  * held.
1594  */
1595 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1596 {
1597         struct sock *rsk;
1598 #ifdef CONFIG_TCP_MD5SIG
1599         /*
1600          * We really want to reject the packet as early as possible
1601          * if:
1602          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1603          *  o There is an MD5 option and we're not expecting one
1604          */
1605         if (tcp_v4_inbound_md5_hash(sk, skb))
1606                 goto discard;
1607 #endif
1608
1609         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1610                 sock_rps_save_rxhash(sk, skb);
1611                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1612                         rsk = sk;
1613                         goto reset;
1614                 }
1615                 return 0;
1616         }
1617
1618         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1619                 goto csum_err;
1620
1621         if (sk->sk_state == TCP_LISTEN) {
1622                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1623                 if (!nsk)
1624                         goto discard;
1625
1626                 if (nsk != sk) {
1627                         sock_rps_save_rxhash(nsk, skb);
1628                         if (tcp_child_process(sk, nsk, skb)) {
1629                                 rsk = nsk;
1630                                 goto reset;
1631                         }
1632                         return 0;
1633                 }
1634         } else
1635                 sock_rps_save_rxhash(sk, skb);
1636
1637         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1638                 rsk = sk;
1639                 goto reset;
1640         }
1641         return 0;
1642
1643 reset:
1644         tcp_v4_send_reset(rsk, skb);
1645 discard:
1646         kfree_skb(skb);
1647         /* Be careful here. If this function gets more complicated and
1648          * gcc suffers from register pressure on the x86, sk (in %ebx)
1649          * might be destroyed here. This current version compiles correctly,
1650          * but you have been warned.
1651          */
1652         return 0;
1653
1654 csum_err:
1655         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1656         goto discard;
1657 }
1658 EXPORT_SYMBOL(tcp_v4_do_rcv);
1659
1660 void tcp_v4_early_demux(struct sk_buff *skb)
1661 {
1662         struct net *net = dev_net(skb->dev);
1663         const struct iphdr *iph;
1664         const struct tcphdr *th;
1665         struct net_device *dev;
1666         struct sock *sk;
1667
1668         if (skb->pkt_type != PACKET_HOST)
1669                 return;
1670
1671         if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1672                 return;
1673
1674         iph = ip_hdr(skb);
1675         th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1676
1677         if (th->doff < sizeof(struct tcphdr) / 4)
1678                 return;
1679
1680         if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1681                 return;
1682
1683         dev = skb->dev;
1684         sk = __inet_lookup_established(net, &tcp_hashinfo,
1685                                        iph->saddr, th->source,
1686                                        iph->daddr, ntohs(th->dest),
1687                                        dev->ifindex);
1688         if (sk) {
1689                 skb->sk = sk;
1690                 skb->destructor = sock_edemux;
1691                 if (sk->sk_state != TCP_TIME_WAIT) {
1692                         struct dst_entry *dst = sk->sk_rx_dst;
1693                         if (dst)
1694                                 dst = dst_check(dst, 0);
1695                         if (dst) {
1696                                 struct rtable *rt = (struct rtable *) dst;
1697
1698                                 if (rt->rt_iif == dev->ifindex)
1699                                         skb_dst_set_noref(skb, dst);
1700                         }
1701                 }
1702         }
1703 }
1704
1705 /*
1706  *      From tcp_input.c
1707  */
1708
     /*
      * Receive entry point for an IPv4 TCP segment.
      *
      * Discards skbs whose pkt_type is not PACKET_HOST, checks that the TCP
      * header (including options, per th->doff) can be pulled, initialises
      * the checksum state when needed, fills in TCP_SKB_CB(skb) and looks up
      * the socket.  The segment is then processed directly via
      * tcp_v4_do_rcv(), taken by tcp_prequeue(), or added to the socket
      * backlog when the socket is owned by the user.  Sockets in TIME_WAIT
      * are handled under the do_time_wait label.
      *
      * Returns the value of tcp_v4_do_rcv() when it is called on the normal
      * path, otherwise 0.  The skb is freed here on every discard path.
      */
1709 int tcp_v4_rcv(struct sk_buff *skb)
1710 {
1711         const struct iphdr *iph;
1712         const struct tcphdr *th;
1713         struct sock *sk;
1714         int ret;
1715         struct net *net = dev_net(skb->dev);
1716
1717         if (skb->pkt_type != PACKET_HOST)
1718                 goto discard_it;
1719
1720         /* Count it even if it's bad */
1721         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1722
1723         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1724                 goto discard_it;
1725
1726         th = tcp_hdr(skb);
1727
1728         if (th->doff < sizeof(struct tcphdr) / 4)
1729                 goto bad_packet;
1730         if (!pskb_may_pull(skb, th->doff * 4))
1731                 goto discard_it;
1732
1733         /* An explanation is required here, I think.
1734          * Packet length and doff are validated by header prediction,
1735          * provided case of th->doff==0 is eliminated.
1736          * So, we defer the checks. */
1737         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1738                 goto bad_packet;
1739
             /* Header pointers are re-read here, after the pulls above
              * (NOTE(review): presumably because pskb_may_pull() can move
              * the data -- confirm).
              */
1740         th = tcp_hdr(skb);
1741         iph = ip_hdr(skb);
1742         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
             /* SYN and FIN each count as one unit of sequence space on top
              * of the payload length (skb->len minus the TCP header).
              */
1743         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1744                                     skb->len - th->doff * 4);
1745         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1746         TCP_SKB_CB(skb)->when    = 0;
1747         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1748         TCP_SKB_CB(skb)->sacked  = 0;
1749
1750         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1751         if (!sk)
1752                 goto no_tcp_socket;
1753
             /* Also re-entered from do_time_wait with sk replaced by a listener. */
1754 process:
1755         if (sk->sk_state == TCP_TIME_WAIT)
1756                 goto do_time_wait;
1757
1758         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1759                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1760                 goto discard_and_relse;
1761         }
1762
1763         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1764                 goto discard_and_relse;
1765         nf_reset(skb);
1766
1767         if (sk_filter(sk, skb))
1768                 goto discard_and_relse;
1769
1770         skb->dev = NULL;
1771
1772         bh_lock_sock_nested(sk);
1773         ret = 0;
1774         if (!sock_owned_by_user(sk)) {
1775 #ifdef CONFIG_NET_DMA
1776                 struct tcp_sock *tp = tcp_sk(sk);
1777                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1778                         tp->ucopy.dma_chan = net_dma_find_channel();
                     /* With a DMA channel the prequeue is bypassed. */
1779                 if (tp->ucopy.dma_chan)
1780                         ret = tcp_v4_do_rcv(sk, skb);
1781                 else
1782 #endif
1783                 {
                             /* Process now unless tcp_prequeue() took the skb. */
1784                         if (!tcp_prequeue(sk, skb))
1785                                 ret = tcp_v4_do_rcv(sk, skb);
1786                 }
             /* Socket is owned by the user: queue to the backlog; a non-zero
              * return from sk_add_backlog() means the skb was not queued.
              */
1787         } else if (unlikely(sk_add_backlog(sk, skb,
1788                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1789                 bh_unlock_sock(sk);
1790                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1791                 goto discard_and_relse;
1792         }
1793         bh_unlock_sock(sk);
1794
1795         sock_put(sk);
1796
1797         return ret;
1798
1799 no_tcp_socket:
1800         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1801                 goto discard_it;
1802
             /* Malformed or bad-checksum segments are only counted; anything
              * else gets a reset.  bad_packet is also a jump target for the
              * header/checksum checks at the top of the function.
              */
1803         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1804 bad_packet:
1805                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1806         } else {
1807                 tcp_v4_send_reset(NULL, skb);
1808         }
1809
1810 discard_it:
1811         /* Discard frame. */
1812         kfree_skb(skb);
1813         return 0;
1814
             /* Drop the socket reference, then free the skb. */
1815 discard_and_relse:
1816         sock_put(sk);
1817         goto discard_it;
1818
             /* sk is a TIME_WAIT socket here: every exit below releases it
              * with inet_twsk_put() or hands it to a helper.
              */
1819 do_time_wait:
1820         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1821                 inet_twsk_put(inet_twsk(sk));
1822                 goto discard_it;
1823         }
1824
1825         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1826                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1827                 inet_twsk_put(inet_twsk(sk));
1828                 goto discard_it;
1829         }
1830         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1831         case TCP_TW_SYN: {
1832                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1833                                                         &tcp_hashinfo,
1834                                                         iph->daddr, th->dest,
1835                                                         inet_iif(skb));
                     /* A listener exists: retire the TIME_WAIT socket and
                      * process the segment against the listener instead.
                      */
1836                 if (sk2) {
1837                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1838                         inet_twsk_put(inet_twsk(sk));
1839                         sk = sk2;
1840                         goto process;
1841                 }
1842                 /* Fall through to ACK */
1843         }
1844         case TCP_TW_ACK:
1845                 tcp_v4_timewait_ack(sk, skb);
1846                 break;
1847         case TCP_TW_RST:
1848                 goto no_tcp_socket;
1849         case TCP_TW_SUCCESS:;
1850         }
1851         goto discard_it;
1852 }
1853
1854 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1855         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1856         .twsk_unique    = tcp_twsk_unique,
1857         .twsk_destructor= tcp_twsk_destructor,
1858 };
1859
1860 const struct inet_connection_sock_af_ops ipv4_specific = {
1861         .queue_xmit        = ip_queue_xmit,
1862         .send_check        = tcp_v4_send_check,
1863         .rebuild_header    = inet_sk_rebuild_header,
1864         .conn_request      = tcp_v4_conn_request,
1865         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1866         .net_header_len    = sizeof(struct iphdr),
1867         .setsockopt        = ip_setsockopt,
1868         .getsockopt        = ip_getsockopt,
1869         .addr2sockaddr     = inet_csk_addr2sockaddr,
1870         .sockaddr_len      = sizeof(struct sockaddr_in),
1871         .bind_conflict     = inet_csk_bind_conflict,
1872 #ifdef CONFIG_COMPAT
1873         .compat_setsockopt = compat_ip_setsockopt,
1874         .compat_getsockopt = compat_ip_getsockopt,
1875 #endif
1876 };
1877 EXPORT_SYMBOL(ipv4_specific);
1878
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 implementations of the TCP MD5 signature hooks. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup    = tcp_v4_md5_lookup,
	.md5_parse     = tcp_v4_parse_md5_keys,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
};
#endif
1886
1887 /* NOTE: A lot of things set to zero explicitly by call to
1888  *       sk_alloc() so need not be done here.
1889  */
1890 static int tcp_v4_init_sock(struct sock *sk)
1891 {
1892         struct inet_connection_sock *icsk = inet_csk(sk);
1893
1894         tcp_init_sock(sk);
1895
1896         icsk->icsk_af_ops = &ipv4_specific;
1897
1898 #ifdef CONFIG_TCP_MD5SIG
1899         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1900 #endif
1901
1902         return 0;
1903 }
1904
1905 void tcp_v4_destroy_sock(struct sock *sk)
1906 {
1907         struct tcp_sock *tp = tcp_sk(sk);
1908
1909         tcp_clear_xmit_timers(sk);
1910
1911         tcp_cleanup_congestion_control(sk);
1912
1913         /* Cleanup up the write buffer. */
1914         tcp_write_queue_purge(sk);
1915
1916         /* Cleans up our, hopefully empty, out_of_order_queue. */
1917         __skb_queue_purge(&tp->out_of_order_queue);
1918
1919 #ifdef CONFIG_TCP_MD5SIG
1920         /* Clean up the MD5 key list, if any */
1921         if (tp->md5sig_info) {
1922                 tcp_clear_md5_list(sk);
1923                 kfree_rcu(tp->md5sig_info, rcu);
1924                 tp->md5sig_info = NULL;
1925         }
1926 #endif
1927
1928 #ifdef CONFIG_NET_DMA
1929         /* Cleans up our sk_async_wait_queue */
1930         __skb_queue_purge(&sk->sk_async_wait_queue);
1931 #endif
1932
1933         /* Clean prequeue, it must be empty really */
1934         __skb_queue_purge(&tp->ucopy.prequeue);
1935
1936         /* Clean up a referenced TCP bind bucket. */
1937         if (inet_csk(sk)->icsk_bind_hash)
1938                 inet_put_port(sk);
1939
1940         /*
1941          * If sendmsg cached page exists, toss it.
1942          */
1943         if (sk->sk_sndmsg_page) {
1944                 __free_page(sk->sk_sndmsg_page);
1945                 sk->sk_sndmsg_page = NULL;
1946         }
1947
1948         /* TCP Cookie Transactions */
1949         if (tp->cookie_values != NULL) {
1950                 kref_put(&tp->cookie_values->kref,
1951                          tcp_cookie_values_release);
1952                 tp->cookie_values = NULL;
1953         }
1954
1955         sk_sockets_allocated_dec(sk);
1956         sock_release_memcg(sk);
1957 }
1958 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1959
1960 #ifdef CONFIG_PROC_FS
1961 /* Proc filesystem TCP sock list dumping. */
1962
1963 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1964 {
1965         return hlist_nulls_empty(head) ? NULL :
1966                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1967 }
1968
1969 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1970 {
1971         return !is_a_nulls(tw->tw_node.next) ?
1972                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1973 }
1974
1975 /*
1976  * Get the next listening socket after cur.  If cur is NULL, get the first
1977  * socket starting from the bucket given in st->bucket; when st->bucket is
1978  * zero the very first socket in the hash table is returned.
      *
      * While st->state is TCP_SEQ_STATE_OPENREQ, cur is a request_sock hanging
      * off st->syn_wait_sk and the walk continues through that socket's SYN
      * table before moving on to the next listener.
      *
      * On a non-NULL return the listening bucket lock is still held, and in
      * the OPENREQ state so is the listener's syn_wait_lock; tcp_seq_stop()
      * releases them.  On a NULL return all locks taken here are released.
1979  */
1980 static void *listening_get_next(struct seq_file *seq, void *cur)
1981 {
1982         struct inet_connection_sock *icsk;
1983         struct hlist_nulls_node *node;
1984         struct sock *sk = cur;
1985         struct inet_listen_hashbucket *ilb;
1986         struct tcp_iter_state *st = seq->private;
1987         struct net *net = seq_file_net(seq);
1988
             /* Fresh start: lock the current bucket and scan from its head. */
1989         if (!sk) {
1990                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1991                 spin_lock_bh(&ilb->lock);
1992                 sk = sk_nulls_head(&ilb->head);
1993                 st->offset = 0;
1994                 goto get_sk;
1995         }
             /* Continuing: the bucket lock is not taken again on this path. */
1996         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1997         ++st->num;
1998         ++st->offset;
1999
2000         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2001                 struct request_sock *req = cur;
2002
2003                 icsk = inet_csk(st->syn_wait_sk);
2004                 req = req->dl_next;
                     /* Walk the rest of this SYN-table chain, then the
                      * remaining chains; get_req is entered from start_req
                      * below with st->sbucket reset to 0.
                      */
2005                 while (1) {
2006                         while (req) {
2007                                 if (req->rsk_ops->family == st->family) {
2008                                         cur = req;
2009                                         goto out;
2010                                 }
2011                                 req = req->dl_next;
2012                         }
2013                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2014                                 break;
2015 get_req:
2016                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2017                 }
                     /* SYN table exhausted: resume with the next listener. */
2018                 sk        = sk_nulls_next(st->syn_wait_sk);
2019                 st->state = TCP_SEQ_STATE_LISTENING;
2020                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021         } else {
                     /* cur was a listener: visit its pending requests (if
                      * any) before stepping to the next socket.
                      */
2022                 icsk = inet_csk(sk);
2023                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2025                         goto start_req;
2026                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2027                 sk = sk_nulls_next(sk);
2028         }
2029 get_sk:
2030         sk_nulls_for_each_from(sk, node) {
2031                 if (!net_eq(sock_net(sk), net))
2032                         continue;
2033                 if (sk->sk_family == st->family) {
2034                         cur = sk;
2035                         goto out;
2036                 }
                     /* Family mismatch on the listener itself: its request
                      * queue is still walked if it is non-empty.
                      */
2037                 icsk = inet_csk(sk);
2038                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2039                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2040 start_req:
2041                         st->uid         = sock_i_uid(sk);
2042                         st->syn_wait_sk = sk;
2043                         st->state       = TCP_SEQ_STATE_OPENREQ;
2044                         st->sbucket     = 0;
2045                         goto get_req;
2046                 }
2047                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2048         }
             /* Bucket exhausted: unlock it and move on to the next one. */
2049         spin_unlock_bh(&ilb->lock);
2050         st->offset = 0;
2051         if (++st->bucket < INET_LHTABLE_SIZE) {
2052                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2053                 spin_lock_bh(&ilb->lock);
2054                 sk = sk_nulls_head(&ilb->head);
2055                 goto get_sk;
2056         }
2057         cur = NULL;
2058 out:
2059         return cur;
2060 }
2061
2062 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2063 {
2064         struct tcp_iter_state *st = seq->private;
2065         void *rc;
2066
2067         st->bucket = 0;
2068         st->offset = 0;
2069         rc = listening_get_next(seq, NULL);
2070
2071         while (rc && *pos) {
2072                 rc = listening_get_next(seq, rc);
2073                 --*pos;
2074         }
2075         return rc;
2076 }
2077
2078 static inline bool empty_bucket(struct tcp_iter_state *st)
2079 {
2080         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2081                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2082 }
2083
2084 /*
2085  * Get first established socket starting from bucket given in st->bucket.
2086  * If st->bucket is zero, the very first socket in the hash is returned.
2087  */
2088 static void *established_get_first(struct seq_file *seq)
2089 {
2090         struct tcp_iter_state *st = seq->private;
2091         struct net *net = seq_file_net(seq);
2092         void *rc = NULL;
2093
2094         st->offset = 0;
2095         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2096                 struct sock *sk;
2097                 struct hlist_nulls_node *node;
2098                 struct inet_timewait_sock *tw;
2099                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2100
2101                 /* Lockless fast path for the common case of empty buckets */
2102                 if (empty_bucket(st))
2103                         continue;
2104
2105                 spin_lock_bh(lock);
2106                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2107                         if (sk->sk_family != st->family ||
2108                             !net_eq(sock_net(sk), net)) {
2109                                 continue;
2110                         }
2111                         rc = sk;
2112                         goto out;
2113                 }
2114                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2115                 inet_twsk_for_each(tw, node,
2116                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2117                         if (tw->tw_family != st->family ||
2118                             !net_eq(twsk_net(tw), net)) {
2119                                 continue;
2120                         }
2121                         rc = tw;
2122                         goto out;
2123                 }
2124                 spin_unlock_bh(lock);
2125                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2126         }
2127 out:
2128         return rc;
2129 }
2130
2131 static void *established_get_next(struct seq_file *seq, void *cur)
2132 {
2133         struct sock *sk = cur;
2134         struct inet_timewait_sock *tw;
2135         struct hlist_nulls_node *node;
2136         struct tcp_iter_state *st = seq->private;
2137         struct net *net = seq_file_net(seq);
2138
2139         ++st->num;
2140         ++st->offset;
2141
2142         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2143                 tw = cur;
2144                 tw = tw_next(tw);
2145 get_tw:
2146                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2147                         tw = tw_next(tw);
2148                 }
2149                 if (tw) {
2150                         cur = tw;
2151                         goto out;
2152                 }
2153                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2154                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2155
2156                 /* Look for next non empty bucket */
2157                 st->offset = 0;
2158                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2159                                 empty_bucket(st))
2160                         ;
2161                 if (st->bucket > tcp_hashinfo.ehash_mask)
2162                         return NULL;
2163
2164                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2165                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2166         } else
2167                 sk = sk_nulls_next(sk);
2168
2169         sk_nulls_for_each_from(sk, node) {
2170                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2171                         goto found;
2172         }
2173
2174         st->state = TCP_SEQ_STATE_TIME_WAIT;
2175         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2176         goto get_tw;
2177 found:
2178         cur = sk;
2179 out:
2180         return cur;
2181 }
2182
2183 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2184 {
2185         struct tcp_iter_state *st = seq->private;
2186         void *rc;
2187
2188         st->bucket = 0;
2189         rc = established_get_first(seq);
2190
2191         while (rc && pos) {
2192                 rc = established_get_next(seq, rc);
2193                 --pos;
2194         }
2195         return rc;
2196 }
2197
2198 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2199 {
2200         void *rc;
2201         struct tcp_iter_state *st = seq->private;
2202
2203         st->state = TCP_SEQ_STATE_LISTENING;
2204         rc        = listening_get_idx(seq, &pos);
2205
2206         if (!rc) {
2207                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2208                 rc        = established_get_idx(seq, pos);
2209         }
2210
2211         return rc;
2212 }
2213
2214 static void *tcp_seek_last_pos(struct seq_file *seq)
2215 {
2216         struct tcp_iter_state *st = seq->private;
2217         int offset = st->offset;
2218         int orig_num = st->num;
2219         void *rc = NULL;
2220
2221         switch (st->state) {
2222         case TCP_SEQ_STATE_OPENREQ:
2223         case TCP_SEQ_STATE_LISTENING:
2224                 if (st->bucket >= INET_LHTABLE_SIZE)
2225                         break;
2226                 st->state = TCP_SEQ_STATE_LISTENING;
2227                 rc = listening_get_next(seq, NULL);
2228                 while (offset-- && rc)
2229                         rc = listening_get_next(seq, rc);
2230                 if (rc)
2231                         break;
2232                 st->bucket = 0;
2233                 /* Fallthrough */
2234         case TCP_SEQ_STATE_ESTABLISHED:
2235         case TCP_SEQ_STATE_TIME_WAIT:
2236                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2237                 if (st->bucket > tcp_hashinfo.ehash_mask)
2238                         break;
2239                 rc = established_get_first(seq);
2240                 while (offset-- && rc)
2241                         rc = established_get_next(seq, rc);
2242         }
2243
2244         st->num = orig_num;
2245
2246         return rc;
2247 }
2248
2249 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2250 {
2251         struct tcp_iter_state *st = seq->private;
2252         void *rc;
2253
2254         if (*pos && *pos == st->last_pos) {
2255                 rc = tcp_seek_last_pos(seq);
2256                 if (rc)
2257                         goto out;
2258         }
2259
2260         st->state = TCP_SEQ_STATE_LISTENING;
2261         st->num = 0;
2262         st->bucket = 0;
2263         st->offset = 0;
2264         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2265
2266 out:
2267         st->last_pos = *pos;
2268         return rc;
2269 }
2270
2271 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2272 {
2273         struct tcp_iter_state *st = seq->private;
2274         void *rc = NULL;
2275
2276         if (v == SEQ_START_TOKEN) {
2277                 rc = tcp_get_idx(seq, 0);
2278                 goto out;
2279         }
2280
2281         switch (st->state) {
2282         case TCP_SEQ_STATE_OPENREQ:
2283         case TCP_SEQ_STATE_LISTENING:
2284                 rc = listening_get_next(seq, v);
2285                 if (!rc) {
2286                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2287                         st->bucket = 0;
2288                         st->offset = 0;
2289                         rc        = established_get_first(seq);
2290                 }
2291                 break;
2292         case TCP_SEQ_STATE_ESTABLISHED:
2293         case TCP_SEQ_STATE_TIME_WAIT:
2294                 rc = established_get_next(seq, v);
2295                 break;
2296         }
2297 out:
2298         ++*pos;
2299         st->last_pos = *pos;
2300         return rc;
2301 }
2302
2303 static void tcp_seq_stop(struct seq_file *seq, void *v)
2304 {
2305         struct tcp_iter_state *st = seq->private;
2306
2307         switch (st->state) {
2308         case TCP_SEQ_STATE_OPENREQ:
2309                 if (v) {
2310                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2311                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2312                 }
2313         case TCP_SEQ_STATE_LISTENING:
2314                 if (v != SEQ_START_TOKEN)
2315                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2316                 break;
2317         case TCP_SEQ_STATE_TIME_WAIT:
2318         case TCP_SEQ_STATE_ESTABLISHED:
2319                 if (v)
2320                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2321                 break;
2322         }
2323 }
2324
2325 int tcp_seq_open(struct inode *inode, struct file *file)
2326 {
2327         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2328         struct tcp_iter_state *s;
2329         int err;
2330
2331         err = seq_open_net(inode, file, &afinfo->seq_ops,
2332                           sizeof(struct tcp_iter_state));
2333         if (err < 0)
2334                 return err;
2335
2336         s = ((struct seq_file *)file->private_data)->private;
2337         s->family               = afinfo->family;
2338         s->last_pos             = 0;
2339         return 0;
2340 }
2341 EXPORT_SYMBOL(tcp_seq_open);
2342
2343 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2344 {
2345         int rc = 0;
2346         struct proc_dir_entry *p;
2347
2348         afinfo->seq_ops.start           = tcp_seq_start;
2349         afinfo->seq_ops.next            = tcp_seq_next;
2350         afinfo->seq_ops.stop            = tcp_seq_stop;
2351
2352         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2353                              afinfo->seq_fops, afinfo);
2354         if (!p)
2355                 rc = -ENOMEM;
2356         return rc;
2357 }
2358 EXPORT_SYMBOL(tcp_proc_register);
2359
     /* Remove the proc entry named afinfo->name from @net's proc_net. */
2360 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2361 {
2362         proc_net_remove(net, afinfo->name);
2363 }
2364 EXPORT_SYMBOL(tcp_proc_unregister);
2365
2366 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2367                          struct seq_file *f, int i, int uid, int *len)
2368 {
2369         const struct inet_request_sock *ireq = inet_rsk(req);
2370         int ttd = req->expires - jiffies;
2371
2372         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2373                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2374                 i,
2375                 ireq->loc_addr,
2376                 ntohs(inet_sk(sk)->inet_sport),
2377                 ireq->rmt_addr,
2378                 ntohs(ireq->rmt_port),
2379                 TCP_SYN_RECV,
2380                 0, 0, /* could print option size, but that is af dependent. */
2381                 1,    /* timers active (only the expire timer) */
2382                 jiffies_to_clock_t(ttd),
2383                 req->retrans,
2384                 uid,
2385                 0,  /* non standard timer */
2386                 0, /* open_requests have no inode */
2387                 atomic_read(&sk->sk_refcnt),
2388                 req,
2389                 len);
2390 }
2391
2392 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2393 {
2394         int timer_active;
2395         unsigned long timer_expires;
2396         const struct tcp_sock *tp = tcp_sk(sk);
2397         const struct inet_connection_sock *icsk = inet_csk(sk);
2398         const struct inet_sock *inet = inet_sk(sk);
2399         __be32 dest = inet->inet_daddr;
2400         __be32 src = inet->inet_rcv_saddr;
2401         __u16 destp = ntohs(inet->inet_dport);
2402         __u16 srcp = ntohs(inet->inet_sport);
2403         int rx_queue;
2404
2405         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2406                 timer_active    = 1;
2407                 timer_expires   = icsk->icsk_timeout;
2408         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2409                 timer_active    = 4;
2410                 timer_expires   = icsk->icsk_timeout;
2411         } else if (timer_pending(&sk->sk_timer)) {
2412                 timer_active    = 2;
2413                 timer_expires   = sk->sk_timer.expires;
2414         } else {
2415                 timer_active    = 0;
2416                 timer_expires = jiffies;
2417         }
2418
2419         if (sk->sk_state == TCP_LISTEN)
2420                 rx_queue = sk->sk_ack_backlog;
2421         else
2422                 /*
2423                  * because we dont lock socket, we might find a transient negative value
2424                  */
2425                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2426
2427         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2428                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2429                 i, src, srcp, dest, destp, sk->sk_state,
2430                 tp->write_seq - tp->snd_una,
2431                 rx_queue,
2432                 timer_active,
2433                 jiffies_to_clock_t(timer_expires - jiffies),
2434                 icsk->icsk_retransmits,
2435                 sock_i_uid(sk),
2436                 icsk->icsk_probes_out,
2437                 sock_i_ino(sk),
2438                 atomic_read(&sk->sk_refcnt), sk,
2439                 jiffies_to_clock_t(icsk->icsk_rto),
2440                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2441                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2442                 tp->snd_cwnd,
2443                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2444                 len);
2445 }
2446
2447 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2448                                struct seq_file *f, int i, int *len)
2449 {
2450         __be32 dest, src;
2451         __u16 destp, srcp;
2452         int ttd = tw->tw_ttd - jiffies;
2453
2454         if (ttd < 0)
2455                 ttd = 0;
2456
2457         dest  = tw->tw_daddr;
2458         src   = tw->tw_rcv_saddr;
2459         destp = ntohs(tw->tw_dport);
2460         srcp  = ntohs(tw->tw_sport);
2461
2462         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2463                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2464                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2465                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2466                 atomic_read(&tw->tw_refcnt), tw, len);
2467 }
2468
2469 #define TMPSZ 150
2470
2471 static int tcp4_seq_show(struct seq_file *seq, void *v)
2472 {
2473         struct tcp_iter_state *st;
2474         int len;
2475
2476         if (v == SEQ_START_TOKEN) {
2477                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2478                            "  sl  local_address rem_address   st tx_queue "
2479                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2480                            "inode");
2481                 goto out;
2482         }
2483         st = seq->private;
2484
2485         switch (st->state) {
2486         case TCP_SEQ_STATE_LISTENING:
2487         case TCP_SEQ_STATE_ESTABLISHED:
2488                 get_tcp4_sock(v, seq, st->num, &len);
2489                 break;
2490         case TCP_SEQ_STATE_OPENREQ:
2491                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2492                 break;
2493         case TCP_SEQ_STATE_TIME_WAIT:
2494                 get_timewait4_sock(v, seq, st->num, &len);
2495                 break;
2496         }
2497         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2498 out:
2499         return 0;
2500 }
2501
2502 static const struct file_operations tcp_afinfo_seq_fops = {
2503         .owner   = THIS_MODULE,
2504         .open    = tcp_seq_open,
2505         .read    = seq_read,
2506         .llseek  = seq_lseek,
2507         .release = seq_release_net
2508 };
2509
2510 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2511         .name           = "tcp",
2512         .family         = AF_INET,
2513         .seq_fops       = &tcp_afinfo_seq_fops,
2514         .seq_ops        = {
2515                 .show           = tcp4_seq_show,
2516         },
2517 };
2518
2519 static int __net_init tcp4_proc_init_net(struct net *net)
2520 {
2521         return tcp_proc_register(net, &tcp4_seq_afinfo);
2522 }
2523
2524 static void __net_exit tcp4_proc_exit_net(struct net *net)
2525 {
2526         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2527 }
2528
2529 static struct pernet_operations tcp4_net_ops = {
2530         .init = tcp4_proc_init_net,
2531         .exit = tcp4_proc_exit_net,
2532 };
2533
2534 int __init tcp4_proc_init(void)
2535 {
2536         return register_pernet_subsys(&tcp4_net_ops);
2537 }
2538
2539 void tcp4_proc_exit(void)
2540 {
2541         unregister_pernet_subsys(&tcp4_net_ops);
2542 }
2543 #endif /* CONFIG_PROC_FS */
2544
2545 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2546 {
2547         const struct iphdr *iph = skb_gro_network_header(skb);
2548
2549         switch (skb->ip_summed) {
2550         case CHECKSUM_COMPLETE:
2551                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2552                                   skb->csum)) {
2553                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2554                         break;
2555                 }
2556
2557                 /* fall through */
2558         case CHECKSUM_NONE:
2559                 NAPI_GRO_CB(skb)->flush = 1;
2560                 return NULL;
2561         }
2562
2563         return tcp_gro_receive(head, skb);
2564 }
2565
2566 int tcp4_gro_complete(struct sk_buff *skb)
2567 {
2568         const struct iphdr *iph = ip_hdr(skb);
2569         struct tcphdr *th = tcp_hdr(skb);
2570
2571         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2572                                   iph->saddr, iph->daddr, 0);
2573         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2574
2575         return tcp_gro_complete(skb);
2576 }
2577
2578 struct proto tcp_prot = {
2579         .name                   = "TCP",
2580         .owner                  = THIS_MODULE,
2581         .close                  = tcp_close,
2582         .connect                = tcp_v4_connect,
2583         .disconnect             = tcp_disconnect,
2584         .accept                 = inet_csk_accept,
2585         .ioctl                  = tcp_ioctl,
2586         .init                   = tcp_v4_init_sock,
2587         .destroy                = tcp_v4_destroy_sock,
2588         .shutdown               = tcp_shutdown,
2589         .setsockopt             = tcp_setsockopt,
2590         .getsockopt             = tcp_getsockopt,
2591         .recvmsg                = tcp_recvmsg,
2592         .sendmsg                = tcp_sendmsg,
2593         .sendpage               = tcp_sendpage,
2594         .backlog_rcv            = tcp_v4_do_rcv,
2595         .release_cb             = tcp_release_cb,
2596         .hash                   = inet_hash,
2597         .unhash                 = inet_unhash,
2598         .get_port               = inet_csk_get_port,
2599         .enter_memory_pressure  = tcp_enter_memory_pressure,
2600         .sockets_allocated      = &tcp_sockets_allocated,
2601         .orphan_count           = &tcp_orphan_count,
2602         .memory_allocated       = &tcp_memory_allocated,
2603         .memory_pressure        = &tcp_memory_pressure,
2604         .sysctl_wmem            = sysctl_tcp_wmem,
2605         .sysctl_rmem            = sysctl_tcp_rmem,
2606         .max_header             = MAX_TCP_HEADER,
2607         .obj_size               = sizeof(struct tcp_sock),
2608         .slab_flags             = SLAB_DESTROY_BY_RCU,
2609         .twsk_prot              = &tcp_timewait_sock_ops,
2610         .rsk_prot               = &tcp_request_sock_ops,
2611         .h.hashinfo             = &tcp_hashinfo,
2612         .no_autobind            = true,
2613 #ifdef CONFIG_COMPAT
2614         .compat_setsockopt      = compat_tcp_setsockopt,
2615         .compat_getsockopt      = compat_tcp_getsockopt,
2616 #endif
2617 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2618         .init_cgroup            = tcp_init_cgroup,
2619         .destroy_cgroup         = tcp_destroy_cgroup,
2620         .proto_cgroup           = tcp_proto_cgroup,
2621 #endif
2622 };
2623 EXPORT_SYMBOL(tcp_prot);
2624
2625 static int __net_init tcp_sk_init(struct net *net)
2626 {
2627         return 0;
2628 }
2629
2630 static void __net_exit tcp_sk_exit(struct net *net)
2631 {
2632 }
2633
2634 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2635 {
2636         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2637 }
2638
2639 static struct pernet_operations __net_initdata tcp_sk_ops = {
2640        .init       = tcp_sk_init,
2641        .exit       = tcp_sk_exit,
2642        .exit_batch = tcp_sk_exit_batch,
2643 };
2644
2645 void __init tcp_v4_init(void)
2646 {
2647         inet_hashinfo_init(&tcp_hashinfo);
2648         if (register_pernet_subsys(&tcp_sk_ops))
2649                 panic("Failed to create the TCP control socket.\n");
2650 }