net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63 #include <linux/slab.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75 #include <net/secure_seq.h>
  76
  77 #include <linux/inet.h>
  78 #include <linux/ipv6.h>
  79 #include <linux/stddef.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/seq_file.h>
  82
  83 #include <linux/crypto.h>
  84 #include <linux/scatterlist.h>
  85
/*
 * When non-zero, tcp_twsk_unique() may hand a TIME-WAIT bucket over to a new
 * connection even if the caller passed a non-NULL twp, provided the bucket's
 * last timestamp is more than one second old.
 */
int sysctl_tcp_tw_reuse __read_mostly;
/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * the tcp_low_latency sysctl consumed elsewhere (it is exported) — confirm.
 */
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);
  89
  90
#ifdef CONFIG_TCP_MD5SIG
/*
 * Forward declarations for the MD5 signature helpers used by the RST/ACK
 * reply code below.
 */
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
/*
 * Stub used when MD5 signature support is compiled out: there is never a
 * key, so callers can pass the result straight through without #ifdefs.
 */
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        return NULL;
}
#endif
 103
/*
 * Socket lookup tables for TCP.  tcp_v4_err() passes this to inet_lookup()
 * to find the socket an ICMP error refers to.
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
 106
 107 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 108 {
 109         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 110                                           ip_hdr(skb)->saddr,
 111                                           tcp_hdr(skb)->dest,
 112                                           tcp_hdr(skb)->source);
 113 }
 114
 115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 116 {
 117         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 118         struct tcp_sock *tp = tcp_sk(sk);
 119
 120         /* With PAWS, it is safe from the viewpoint
 121            of data integrity. Even without PAWS it is safe provided sequence
 122            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 123
 124            Actually, the idea is close to VJ's one, only timestamp cache is
 125            held not per host, but per port pair and TW bucket is used as state
 126            holder.
 127
 128            If TW bucket has been already destroyed we fall back to VJ's scheme
 129            and use initial timestamp retrieved from peer table.
 130          */
 131         if (tcptw->tw_ts_recent_stamp &&
 132             (twp == NULL || (sysctl_tcp_tw_reuse &&
 133                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 134                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 135                 if (tp->write_seq == 0)
 136                         tp->write_seq = 1;
 137                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 138                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 139                 sock_hold(sktw);
 140                 return 1;
 141         }
 142
 143         return 0;
 144 }
 145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 146
 147 /* This will initiate an outgoing connection. */
 148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 149 {
 150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 151         struct inet_sock *inet = inet_sk(sk);
 152         struct tcp_sock *tp = tcp_sk(sk);
 153         __be16 orig_sport, orig_dport;
 154         __be32 daddr, nexthop;
 155         struct flowi4 *fl4;
 156         struct rtable *rt;
 157         int err;
 158         struct ip_options_rcu *inet_opt;
 159
 160         if (addr_len < sizeof(struct sockaddr_in))
 161                 return -EINVAL;
 162
 163         if (usin->sin_family != AF_INET)
 164                 return -EAFNOSUPPORT;
 165
 166         nexthop = daddr = usin->sin_addr.s_addr;
 167         inet_opt = rcu_dereference_protected(inet->inet_opt,
 168                                              sock_owned_by_user(sk));
 169         if (inet_opt && inet_opt->opt.srr) {
 170                 if (!daddr)
 171                         return -EINVAL;
 172                 nexthop = inet_opt->opt.faddr;
 173         }
 174
 175         orig_sport = inet->inet_sport;
 176         orig_dport = usin->sin_port;
 177         fl4 = &inet->cork.fl.u.ip4;
 178         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 179                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 180                               IPPROTO_TCP,
 181                               orig_sport, orig_dport, sk, true);
 182         if (IS_ERR(rt)) {
 183                 err = PTR_ERR(rt);
 184                 if (err == -ENETUNREACH)
 185                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 186                 return err;
 187         }
 188
 189         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 190                 ip_rt_put(rt);
 191                 return -ENETUNREACH;
 192         }
 193
 194         if (!inet_opt || !inet_opt->opt.srr)
 195                 daddr = fl4->daddr;
 196
 197         if (!inet->inet_saddr)
 198                 inet->inet_saddr = fl4->saddr;
 199         inet->inet_rcv_saddr = inet->inet_saddr;
 200
 201         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 202                 /* Reset inherited state */
 203                 tp->rx_opt.ts_recent       = 0;
 204                 tp->rx_opt.ts_recent_stamp = 0;
 205                 tp->write_seq              = 0;
 206         }
 207
 208         if (tcp_death_row.sysctl_tw_recycle &&
 209             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
 210                 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
 211                 /*
 212                  * VJ's idea. We save last timestamp seen from
 213                  * the destination in peer table, when entering state
 214                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 215                  * when trying new connection.
 216                  */
 217                 if (peer) {
 218                         inet_peer_refcheck(peer);
 219                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 220                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 221                                 tp->rx_opt.ts_recent = peer->tcp_ts;
 222                         }
 223                 }
 224         }
 225
 226         inet->inet_dport = usin->sin_port;
 227         inet->inet_daddr = daddr;
 228
 229         inet_csk(sk)->icsk_ext_hdr_len = 0;
 230         if (inet_opt)
 231                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 232
 233         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 234
 235         /* Socket identity is still unknown (sport may be zero).
 236          * However we set state to SYN-SENT and not releasing socket
 237          * lock select source port, enter ourselves into the hash tables and
 238          * complete initialization after this.
 239          */
 240         tcp_set_state(sk, TCP_SYN_SENT);
 241         err = inet_hash_connect(&tcp_death_row, sk);
 242         if (err)
 243                 goto failure;
 244
 245         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 246                                inet->inet_sport, inet->inet_dport, sk);
 247         if (IS_ERR(rt)) {
 248                 err = PTR_ERR(rt);
 249                 rt = NULL;
 250                 goto failure;
 251         }
 252         /* OK, now commit destination to socket.  */
 253         sk->sk_gso_type = SKB_GSO_TCPV4;
 254         sk_setup_caps(sk, &rt->dst);
 255
 256         if (!tp->write_seq)
 257                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 258                                                            inet->inet_daddr,
 259                                                            inet->inet_sport,
 260                                                            usin->sin_port);
 261
 262         inet->inet_id = tp->write_seq ^ jiffies;
 263
 264         err = tcp_connect(sk);
 265         rt = NULL;
 266         if (err)
 267                 goto failure;
 268
 269         return 0;
 270
 271 failure:
 272         /*
 273          * This unhashes the socket and releases the local port,
 274          * if necessary.
 275          */
 276         tcp_set_state(sk, TCP_CLOSE);
 277         ip_rt_put(rt);
 278         sk->sk_route_caps = 0;
 279         inet->inet_dport = 0;
 280         return err;
 281 }
 282 EXPORT_SYMBOL(tcp_v4_connect);
 283
 284 /*
 285  * This routine does path mtu discovery as defined in RFC1191.
 286  */
 287 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 288 {
 289         struct dst_entry *dst;
 290         struct inet_sock *inet = inet_sk(sk);
 291
 292         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 293          * send out by Linux are always <576bytes so they should go through
 294          * unfragmented).
 295          */
 296         if (sk->sk_state == TCP_LISTEN)
 297                 return;
 298
 299         /* We don't check in the destentry if pmtu discovery is forbidden
 300          * on this route. We just assume that no packet_to_big packets
 301          * are send back when pmtu discovery is not active.
 302          * There is a small race when the user changes this flag in the
 303          * route, but I think that's acceptable.
 304          */
 305         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 306                 return;
 307
 308         dst->ops->update_pmtu(dst, mtu);
 309
 310         /* Something is about to be wrong... Remember soft error
 311          * for the case, if this connection will not able to recover.
 312          */
 313         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 314                 sk->sk_err_soft = EMSGSIZE;
 315
 316         mtu = dst_mtu(dst);
 317
 318         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 319             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 320                 tcp_sync_mss(sk, mtu);
 321
 322                 /* Resend the TCP packet because it's
 323                  * clear that the old packet has been
 324                  * dropped. This is the new "fast" path mtu
 325                  * discovery.
 326                  */
 327                 tcp_simple_retransmit(sk);
 328         } /* else let the usual retransmit timer handle it */
 329 }
 330
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @icmp_skb: the ICMP message; its data is read as an IP header followed
 *            by the start of a TCP header
 * @info:     passed through as the MTU for ICMP_FRAG_NEEDED; unused otherwise
 *
 * Outline:
 *   1. find the socket matching the addresses/ports in the quoted headers;
 *   2. discard the message if the socket is closed, the quoted TTL is below
 *      the socket's min_ttl, or (for non-listeners) the quoted sequence
 *      number is outside [snd_una, snd_nxt];
 *   3. map the ICMP type/code to an errno, handling PMTU discovery and the
 *      RTO backoff revert on the way;
 *   4. deliver the error according to the socket state.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        /* Headers found in the ICMP message's data area. */
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        __u32 seq;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        /* Need the full IP header plus 8 bytes of TCP header (ports, seq). */
        if (icmp_skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
                        iph->saddr, th->source, inet_iif(icmp_skb));
        if (!sk) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }
        /* TIME-WAIT buckets take no errors; just drop the lookup reference. */
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        /* Only counted here; each path below re-checks ownership itself. */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        /* Listeners are checked later against the request's snt_isn. */
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        /* Translate ICMP type/code into err, or bail out via "out". */
        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                /* Revert only if the ICMP is about the oldest unacked
                 * segment and we are currently backed off. */
                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                /* Undo one backoff step and recompute the RTO from it. */
                icsk->icsk_backoff--;
                inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
                        TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
                tcp_bound_rto(sk);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                /* Time left on the new RTO, counted from when the head of
                 * the write queue was last sent; 0 if already elapsed. */
                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
                                tcp_time_stamp - TCP_SKB_CB(skb)->when);

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        /* Deliver err according to the connection state. */
        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                WARN_ON(req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can f.e. if SYNs crossed.
                             */
                /* Hard error while connecting: report it and finish the
                 * socket, unless the user holds it (then only note it). */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even this two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        /* Release the lock and the reference taken by inet_lookup(). */
        bh_unlock_sock(sk);
        sock_put(sk);
}
 534
 535 static void __tcp_v4_send_check(struct sk_buff *skb,
 536                                 __be32 saddr, __be32 daddr)
 537 {
 538         struct tcphdr *th = tcp_hdr(skb);
 539
 540         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 541                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 542                 skb->csum_start = skb_transport_header(skb) - skb->head;
 543                 skb->csum_offset = offsetof(struct tcphdr, check);
 544         } else {
 545                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 546                                          csum_partial(th,
 547                                                       th->doff << 2,
 548                                                       skb->csum));
 549         }
 550 }
 551
 552 /* This routine computes an IPv4 TCP checksum. */
 553 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 554 {
 555         struct inet_sock *inet = inet_sk(sk);
 556
 557         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 558 }
 559 EXPORT_SYMBOL(tcp_v4_send_check);
 560
 561 int tcp_v4_gso_send_check(struct sk_buff *skb)
 562 {
 563         const struct iphdr *iph;
 564         struct tcphdr *th;
 565
 566         if (!pskb_may_pull(skb, sizeof(*th)))
 567                 return -EINVAL;
 568
 569         iph = ip_hdr(skb);
 570         th = tcp_hdr(skb);
 571
 572         th->check = 0;
 573         skb->ip_summed = CHECKSUM_PARTIAL;
 574         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 575         return 0;
 576 }
 577
 578 /*
 579  *      This routine will send an RST to the other tcp.
 580  *
 581  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 582  *                    for reset.
 583  *      Answer: if a packet caused RST, it is not for a socket
 584  *              existing in our system, if it is matched to a socket,
 585  *              it is just duplicate segment or bug in other side's TCP.
 586  *              So that we build reply only basing on parameters
 587  *              arrived with segment.
 588  *      Exception: precedence violation. We do not implement it in any case.
 589  */
 590
 591 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 592 {
 593         struct tcphdr *th = tcp_hdr(skb);
 594         struct {
 595                 struct tcphdr th;
 596 #ifdef CONFIG_TCP_MD5SIG
 597                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 598 #endif
 599         } rep;
 600         struct ip_reply_arg arg;
 601 #ifdef CONFIG_TCP_MD5SIG
 602         struct tcp_md5sig_key *key;
 603 #endif
 604         struct net *net;
 605
 606         /* Never send a reset in response to a reset. */
 607         if (th->rst)
 608                 return;
 609
 610         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 611                 return;
 612
 613         /* Swap the send and the receive. */
 614         memset(&rep, 0, sizeof(rep));
 615         rep.th.dest   = th->source;
 616         rep.th.source = th->dest;
 617         rep.th.doff   = sizeof(struct tcphdr) / 4;
 618         rep.th.rst    = 1;
 619
 620         if (th->ack) {
 621                 rep.th.seq = th->ack_seq;
 622         } else {
 623                 rep.th.ack = 1;
 624                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 625                                        skb->len - (th->doff << 2));
 626         }
 627
 628         memset(&arg, 0, sizeof(arg));
 629         arg.iov[0].iov_base = (unsigned char *)&rep;
 630         arg.iov[0].iov_len  = sizeof(rep.th);
 631
 632 #ifdef CONFIG_TCP_MD5SIG
 633         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 634         if (key) {
 635                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 636                                    (TCPOPT_NOP << 16) |
 637                                    (TCPOPT_MD5SIG << 8) |
 638                                    TCPOLEN_MD5SIG);
 639                 /* Update length and the length the header thinks exists */
 640                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 641                 rep.th.doff = arg.iov[0].iov_len / 4;
 642
 643                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 644                                      key, ip_hdr(skb)->saddr,
 645                                      ip_hdr(skb)->daddr, &rep.th);
 646         }
 647 #endif
 648         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 649                                       ip_hdr(skb)->saddr, /* XXX */
 650                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 651         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 652         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 653
 654         net = dev_net(skb_dst(skb)->dev);
 655         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 656                       &arg, arg.iov[0].iov_len);
 657
 658         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 659         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 660 }
 661
 662 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 663    outside socket context is ugly, certainly. What can I do?
 664  */
 665
 666 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 667                             u32 win, u32 ts, int oif,
 668                             struct tcp_md5sig_key *key,
 669                             int reply_flags)
 670 {
 671         struct tcphdr *th = tcp_hdr(skb);
 672         struct {
 673                 struct tcphdr th;
 674                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 675 #ifdef CONFIG_TCP_MD5SIG
 676                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 677 #endif
 678                         ];
 679         } rep;
 680         struct ip_reply_arg arg;
 681         struct net *net = dev_net(skb_dst(skb)->dev);
 682
 683         memset(&rep.th, 0, sizeof(struct tcphdr));
 684         memset(&arg, 0, sizeof(arg));
 685
 686         arg.iov[0].iov_base = (unsigned char *)&rep;
 687         arg.iov[0].iov_len  = sizeof(rep.th);
 688         if (ts) {
 689                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 690                                    (TCPOPT_TIMESTAMP << 8) |
 691                                    TCPOLEN_TIMESTAMP);
 692                 rep.opt[1] = htonl(tcp_time_stamp);
 693                 rep.opt[2] = htonl(ts);
 694                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 695         }
 696
 697         /* Swap the send and the receive. */
 698         rep.th.dest    = th->source;
 699         rep.th.source  = th->dest;
 700         rep.th.doff    = arg.iov[0].iov_len / 4;
 701         rep.th.seq     = htonl(seq);
 702         rep.th.ack_seq = htonl(ack);
 703         rep.th.ack     = 1;
 704         rep.th.window  = htons(win);
 705
 706 #ifdef CONFIG_TCP_MD5SIG
 707         if (key) {
 708                 int offset = (ts) ? 3 : 0;
 709
 710                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 711                                           (TCPOPT_NOP << 16) |
 712                                           (TCPOPT_MD5SIG << 8) |
 713                                           TCPOLEN_MD5SIG);
 714                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 715                 rep.th.doff = arg.iov[0].iov_len/4;
 716
 717                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 718                                     key, ip_hdr(skb)->saddr,
 719                                     ip_hdr(skb)->daddr, &rep.th);
 720         }
 721 #endif
 722         arg.flags = reply_flags;
 723         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 724                                       ip_hdr(skb)->saddr, /* XXX */
 725                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 726         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 727         if (oif)
 728                 arg.bound_dev_if = oif;
 729
 730         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 731                       &arg, arg.iov[0].iov_len);
 732
 733         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 734 }
 735
 736 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 737 {
 738         struct inet_timewait_sock *tw = inet_twsk(sk);
 739         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 740
 741         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 742                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 743                         tcptw->tw_ts_recent,
 744                         tw->tw_bound_dev_if,
 745                         tcp_twsk_md5_key(tcptw),
 746                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 747                         );
 748
 749         inet_twsk_put(tw);
 750 }
 751
 752 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 753                                   struct request_sock *req)
 754 {
 755         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 756                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 757                         req->ts_recent,
 758                         0,
 759                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 760                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 761 }
 762
 763 /*
 764  *      Send a SYN-ACK after having received a SYN.
 765  *      This still operates on a request_sock only, not on a big
 766  *      socket.
 767  */
 768 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 769                               struct request_sock *req,
 770                               struct request_values *rvp)
 771 {
 772         const struct inet_request_sock *ireq = inet_rsk(req);
 773         struct flowi4 fl4;
 774         int err = -1;
 775         struct sk_buff * skb;
 776
 777         /* First, grab a route. */
 778         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 779                 return -1;
 780
 781         skb = tcp_make_synack(sk, dst, req, rvp);
 782
 783         if (skb) {
 784                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 785
 786                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 787                                             ireq->rmt_addr,
 788                                             ireq->opt);
 789                 err = net_xmit_eval(err);
 790         }
 791
 792         dst_release(dst);
 793         return err;
 794 }
 795
 796 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 797                               struct request_values *rvp)
 798 {
 799         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 800         return tcp_v4_send_synack(sk, NULL, req, rvp);
 801 }
 802
 803 /*
 804  *      IPv4 request_sock destructor.
 805  */
 806 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 807 {
 808         kfree(inet_rsk(req)->opt);
 809 }
 810
 811 static void syn_flood_warning(const struct sk_buff *skb)
 812 {
 813         const char *msg;
 814
 815 #ifdef CONFIG_SYN_COOKIES
 816         if (sysctl_tcp_syncookies)
 817                 msg = "Sending cookies";
 818         else
 819 #endif
 820                 msg = "Dropping request";
 821
 822         pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
 823                                 ntohs(tcp_hdr(skb)->dest), msg);
 824 }
 825
 826 /*
 827  * Save and compile IPv4 options into the request_sock if needed.
 828  */
 829 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 830                                                   struct sk_buff *skb)
 831 {
 832         const struct ip_options *opt = &(IPCB(skb)->opt);
 833         struct ip_options_rcu *dopt = NULL;
 834
 835         if (opt && opt->optlen) {
 836                 int opt_size = sizeof(*dopt) + opt->optlen;
 837
 838                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 839                 if (dopt) {
 840                         if (ip_options_echo(&dopt->opt, skb)) {
 841                                 kfree(dopt);
 842                                 dopt = NULL;
 843                         }
 844                 }
 845         }
 846         return dopt;
 847 }
 848
 849 #ifdef CONFIG_TCP_MD5SIG
 850 /*
 851  * RFC2385 MD5 checksumming requires a mapping of
 852  * IP address->MD5 Key.
 853  * We need to maintain these in the sk structure.
 854  */
 855
 856 /* Find the Key structure for an address.  */
 857 static struct tcp_md5sig_key *
 858                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 859 {
 860         struct tcp_sock *tp = tcp_sk(sk);
 861         int i;
 862
 863         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 864                 return NULL;
 865         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 866                 if (tp->md5sig_info->keys4[i].addr == addr)
 867                         return &tp->md5sig_info->keys4[i].base;
 868         }
 869         return NULL;
 870 }
 871
 872 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 873                                          struct sock *addr_sk)
 874 {
 875         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 876 }
 877 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 878
 879 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 880                                                       struct request_sock *req)
 881 {
 882         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 883 }
 884
 885 /* This can be called on a newly created socket, from other files */
 886 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 887                       u8 *newkey, u8 newkeylen)
 888 {
 889         /* Add Key to the list */
 890         struct tcp_md5sig_key *key;
 891         struct tcp_sock *tp = tcp_sk(sk);
 892         struct tcp4_md5sig_key *keys;
 893
 894         key = tcp_v4_md5_do_lookup(sk, addr);
 895         if (key) {
 896                 /* Pre-existing entry - just update that one. */
 897                 kfree(key->key);
 898                 key->key = newkey;
 899                 key->keylen = newkeylen;
 900         } else {
 901                 struct tcp_md5sig_info *md5sig;
 902
 903                 if (!tp->md5sig_info) {
 904                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 905                                                   GFP_ATOMIC);
 906                         if (!tp->md5sig_info) {
 907                                 kfree(newkey);
 908                                 return -ENOMEM;
 909                         }
 910                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 911                 }
 912                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 913                         kfree(newkey);
 914                         return -ENOMEM;
 915                 }
 916                 md5sig = tp->md5sig_info;
 917
 918                 if (md5sig->alloced4 == md5sig->entries4) {
 919                         keys = kmalloc((sizeof(*keys) *
 920                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 921                         if (!keys) {
 922                                 kfree(newkey);
 923                                 tcp_free_md5sig_pool();
 924                                 return -ENOMEM;
 925                         }
 926
 927                         if (md5sig->entries4)
 928                                 memcpy(keys, md5sig->keys4,
 929                                        sizeof(*keys) * md5sig->entries4);
 930
 931                         /* Free old key list, and reference new one */
 932                         kfree(md5sig->keys4);
 933                         md5sig->keys4 = keys;
 934                         md5sig->alloced4++;
 935                 }
 936                 md5sig->entries4++;
 937                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 938                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 939                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 940         }
 941         return 0;
 942 }
 943 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 944
 945 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 946                                u8 *newkey, u8 newkeylen)
 947 {
 948         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 949                                  newkey, newkeylen);
 950 }
 951
 952 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 953 {
 954         struct tcp_sock *tp = tcp_sk(sk);
 955         int i;
 956
 957         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 958                 if (tp->md5sig_info->keys4[i].addr == addr) {
 959                         /* Free the key */
 960                         kfree(tp->md5sig_info->keys4[i].base.key);
 961                         tp->md5sig_info->entries4--;
 962
 963                         if (tp->md5sig_info->entries4 == 0) {
 964                                 kfree(tp->md5sig_info->keys4);
 965                                 tp->md5sig_info->keys4 = NULL;
 966                                 tp->md5sig_info->alloced4 = 0;
 967                         } else if (tp->md5sig_info->entries4 != i) {
 968                                 /* Need to do some manipulation */
 969                                 memmove(&tp->md5sig_info->keys4[i],
 970                                         &tp->md5sig_info->keys4[i+1],
 971                                         (tp->md5sig_info->entries4 - i) *
 972                                          sizeof(struct tcp4_md5sig_key));
 973                         }
 974                         tcp_free_md5sig_pool();
 975                         return 0;
 976                 }
 977         }
 978         return -ENOENT;
 979 }
 980 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 981
 982 static void tcp_v4_clear_md5_list(struct sock *sk)
 983 {
 984         struct tcp_sock *tp = tcp_sk(sk);
 985
 986         /* Free each key, then the set of key keys,
 987          * the crypto element, and then decrement our
 988          * hold on the last resort crypto.
 989          */
 990         if (tp->md5sig_info->entries4) {
 991                 int i;
 992                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 993                         kfree(tp->md5sig_info->keys4[i].base.key);
 994                 tp->md5sig_info->entries4 = 0;
 995                 tcp_free_md5sig_pool();
 996         }
 997         if (tp->md5sig_info->keys4) {
 998                 kfree(tp->md5sig_info->keys4);
 999                 tp->md5sig_info->keys4 = NULL;
1000                 tp->md5sig_info->alloced4  = 0;
1001         }
1002 }
1003
1004 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1005                                  int optlen)
1006 {
1007         struct tcp_md5sig cmd;
1008         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1009         u8 *newkey;
1010
1011         if (optlen < sizeof(cmd))
1012                 return -EINVAL;
1013
1014         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1015                 return -EFAULT;
1016
1017         if (sin->sin_family != AF_INET)
1018                 return -EINVAL;
1019
1020         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1021                 if (!tcp_sk(sk)->md5sig_info)
1022                         return -ENOENT;
1023                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1024         }
1025
1026         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1027                 return -EINVAL;
1028
1029         if (!tcp_sk(sk)->md5sig_info) {
1030                 struct tcp_sock *tp = tcp_sk(sk);
1031                 struct tcp_md5sig_info *p;
1032
1033                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1034                 if (!p)
1035                         return -EINVAL;
1036
1037                 tp->md5sig_info = p;
1038                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1039         }
1040
1041         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1042         if (!newkey)
1043                 return -ENOMEM;
1044         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1045                                  newkey, cmd.tcpm_keylen);
1046 }
1047
1048 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1049                                         __be32 daddr, __be32 saddr, int nbytes)
1050 {
1051         struct tcp4_pseudohdr *bp;
1052         struct scatterlist sg;
1053
1054         bp = &hp->md5_blk.ip4;
1055
1056         /*
1057          * 1. the TCP pseudo-header (in the order: source IP address,
1058          * destination IP address, zero-padded protocol number, and
1059          * segment length)
1060          */
1061         bp->saddr = saddr;
1062         bp->daddr = daddr;
1063         bp->pad = 0;
1064         bp->protocol = IPPROTO_TCP;
1065         bp->len = cpu_to_be16(nbytes);
1066
1067         sg_init_one(&sg, bp, sizeof(*bp));
1068         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1069 }
1070
1071 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1072                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1073 {
1074         struct tcp_md5sig_pool *hp;
1075         struct hash_desc *desc;
1076
1077         hp = tcp_get_md5sig_pool();
1078         if (!hp)
1079                 goto clear_hash_noput;
1080         desc = &hp->md5_desc;
1081
1082         if (crypto_hash_init(desc))
1083                 goto clear_hash;
1084         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1085                 goto clear_hash;
1086         if (tcp_md5_hash_header(hp, th))
1087                 goto clear_hash;
1088         if (tcp_md5_hash_key(hp, key))
1089                 goto clear_hash;
1090         if (crypto_hash_final(desc, md5_hash))
1091                 goto clear_hash;
1092
1093         tcp_put_md5sig_pool();
1094         return 0;
1095
1096 clear_hash:
1097         tcp_put_md5sig_pool();
1098 clear_hash_noput:
1099         memset(md5_hash, 0, 16);
1100         return 1;
1101 }
1102
1103 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1104                         struct sock *sk, struct request_sock *req,
1105                         struct sk_buff *skb)
1106 {
1107         struct tcp_md5sig_pool *hp;
1108         struct hash_desc *desc;
1109         struct tcphdr *th = tcp_hdr(skb);
1110         __be32 saddr, daddr;
1111
1112         if (sk) {
1113                 saddr = inet_sk(sk)->inet_saddr;
1114                 daddr = inet_sk(sk)->inet_daddr;
1115         } else if (req) {
1116                 saddr = inet_rsk(req)->loc_addr;
1117                 daddr = inet_rsk(req)->rmt_addr;
1118         } else {
1119                 const struct iphdr *iph = ip_hdr(skb);
1120                 saddr = iph->saddr;
1121                 daddr = iph->daddr;
1122         }
1123
1124         hp = tcp_get_md5sig_pool();
1125         if (!hp)
1126                 goto clear_hash_noput;
1127         desc = &hp->md5_desc;
1128
1129         if (crypto_hash_init(desc))
1130                 goto clear_hash;
1131
1132         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1133                 goto clear_hash;
1134         if (tcp_md5_hash_header(hp, th))
1135                 goto clear_hash;
1136         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1137                 goto clear_hash;
1138         if (tcp_md5_hash_key(hp, key))
1139                 goto clear_hash;
1140         if (crypto_hash_final(desc, md5_hash))
1141                 goto clear_hash;
1142
1143         tcp_put_md5sig_pool();
1144         return 0;
1145
1146 clear_hash:
1147         tcp_put_md5sig_pool();
1148 clear_hash_noput:
1149         memset(md5_hash, 0, 16);
1150         return 1;
1151 }
1152 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1153
1154 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1155 {
1156         /*
1157          * This gets called for each TCP segment that arrives
1158          * so we want to be efficient.
1159          * We have 3 drop cases:
1160          * o No MD5 hash and one expected.
1161          * o MD5 hash and we're not expecting one.
1162          * o MD5 hash and its wrong.
1163          */
1164         __u8 *hash_location = NULL;
1165         struct tcp_md5sig_key *hash_expected;
1166         const struct iphdr *iph = ip_hdr(skb);
1167         struct tcphdr *th = tcp_hdr(skb);
1168         int genhash;
1169         unsigned char newhash[16];
1170
1171         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1172         hash_location = tcp_parse_md5sig_option(th);
1173
1174         /* We've parsed the options - do we have a hash? */
1175         if (!hash_expected && !hash_location)
1176                 return 0;
1177
1178         if (hash_expected && !hash_location) {
1179                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1180                 return 1;
1181         }
1182
1183         if (!hash_expected && hash_location) {
1184                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1185                 return 1;
1186         }
1187
1188         /* Okay, so this is hash_expected and hash_location -
1189          * so we need to calculate the checksum.
1190          */
1191         genhash = tcp_v4_md5_hash_skb(newhash,
1192                                       hash_expected,
1193                                       NULL, NULL, skb);
1194
1195         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1196                 if (net_ratelimit()) {
1197                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1198                                &iph->saddr, ntohs(th->source),
1199                                &iph->daddr, ntohs(th->dest),
1200                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1201                 }
1202                 return 1;
1203         }
1204         return 0;
1205 }
1206
1207 #endif
1208
1209 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1210         .family         =       PF_INET,
1211         .obj_size       =       sizeof(struct tcp_request_sock),
1212         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1213         .send_ack       =       tcp_v4_reqsk_send_ack,
1214         .destructor     =       tcp_v4_reqsk_destructor,
1215         .send_reset     =       tcp_v4_send_reset,
1216         .syn_ack_timeout =      tcp_syn_ack_timeout,
1217 };
1218
1219 #ifdef CONFIG_TCP_MD5SIG
1220 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1221         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1222         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1223 };
1224 #endif
1225
1226 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1227 {
1228         struct tcp_extend_values tmp_ext;
1229         struct tcp_options_received tmp_opt;
1230         u8 *hash_location;
1231         struct request_sock *req;
1232         struct inet_request_sock *ireq;
1233         struct tcp_sock *tp = tcp_sk(sk);
1234         struct dst_entry *dst = NULL;
1235         __be32 saddr = ip_hdr(skb)->saddr;
1236         __be32 daddr = ip_hdr(skb)->daddr;
1237         __u32 isn = TCP_SKB_CB(skb)->when;
1238 #ifdef CONFIG_SYN_COOKIES
1239         int want_cookie = 0;
1240 #else
1241 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1242 #endif
1243
1244         /* Never answer to SYNs send to broadcast or multicast */
1245         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1246                 goto drop;
1247
1248         /* TW buckets are converted to open requests without
1249          * limitations, they conserve resources and peer is
1250          * evidently real one.
1251          */
1252         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1253                 if (net_ratelimit())
1254                         syn_flood_warning(skb);
1255 #ifdef CONFIG_SYN_COOKIES
1256                 if (sysctl_tcp_syncookies) {
1257                         want_cookie = 1;
1258                 } else
1259 #endif
1260                 goto drop;
1261         }
1262
1263         /* Accept backlog is full. If we have already queued enough
1264          * of warm entries in syn queue, drop request. It is better than
1265          * clogging syn queue with openreqs with exponentially increasing
1266          * timeout.
1267          */
1268         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1269                 goto drop;
1270
1271         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1272         if (!req)
1273                 goto drop;
1274
1275 #ifdef CONFIG_TCP_MD5SIG
1276         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1277 #endif
1278
1279         tcp_clear_options(&tmp_opt);
1280         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1281         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1282         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1283
1284         if (tmp_opt.cookie_plus > 0 &&
1285             tmp_opt.saw_tstamp &&
1286             !tp->rx_opt.cookie_out_never &&
1287             (sysctl_tcp_cookie_size > 0 ||
1288              (tp->cookie_values != NULL &&
1289               tp->cookie_values->cookie_desired > 0))) {
1290                 u8 *c;
1291                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1292                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1293
1294                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1295                         goto drop_and_release;
1296
1297                 /* Secret recipe starts with IP addresses */
1298                 *mess++ ^= (__force u32)daddr;
1299                 *mess++ ^= (__force u32)saddr;
1300
1301                 /* plus variable length Initiator Cookie */
1302                 c = (u8 *)mess;
1303                 while (l-- > 0)
1304                         *c++ ^= *hash_location++;
1305
1306 #ifdef CONFIG_SYN_COOKIES
1307                 want_cookie = 0;        /* not our kind of cookie */
1308 #endif
1309                 tmp_ext.cookie_out_never = 0; /* false */
1310                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1311         } else if (!tp->rx_opt.cookie_in_always) {
1312                 /* redundant indications, but ensure initialization. */
1313                 tmp_ext.cookie_out_never = 1; /* true */
1314                 tmp_ext.cookie_plus = 0;
1315         } else {
1316                 goto drop_and_release;
1317         }
1318         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1319
1320         if (want_cookie && !tmp_opt.saw_tstamp)
1321                 tcp_clear_options(&tmp_opt);
1322
1323         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1324         tcp_openreq_init(req, &tmp_opt, skb);
1325
1326         ireq = inet_rsk(req);
1327         ireq->loc_addr = daddr;
1328         ireq->rmt_addr = saddr;
1329         ireq->no_srccheck = inet_sk(sk)->transparent;
1330         ireq->opt = tcp_v4_save_options(sk, skb);
1331
1332         if (security_inet_conn_request(sk, skb, req))
1333                 goto drop_and_free;
1334
1335         if (!want_cookie || tmp_opt.tstamp_ok)
1336                 TCP_ECN_create_request(req, tcp_hdr(skb));
1337
1338         if (want_cookie) {
1339                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1340                 req->cookie_ts = tmp_opt.tstamp_ok;
1341         } else if (!isn) {
1342                 struct inet_peer *peer = NULL;
1343                 struct flowi4 fl4;
1344
1345                 /* VJ's idea. We save last timestamp seen
1346                  * from the destination in peer table, when entering
1347                  * state TIME-WAIT, and check against it before
1348                  * accepting new connection request.
1349                  *
1350                  * If "isn" is not zero, this request hit alive
1351                  * timewait bucket, so that all the necessary checks
1352                  * are made in the function processing timewait state.
1353                  */
1354                 if (tmp_opt.saw_tstamp &&
1355                     tcp_death_row.sysctl_tw_recycle &&
1356                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1357                     fl4.daddr == saddr &&
1358                     (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1359                         inet_peer_refcheck(peer);
1360                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1361                             (s32)(peer->tcp_ts - req->ts_recent) >
1362                                                         TCP_PAWS_WINDOW) {
1363                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1364                                 goto drop_and_release;
1365                         }
1366                 }
1367                 /* Kill the following clause, if you dislike this way. */
1368                 else if (!sysctl_tcp_syncookies &&
1369                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1370                           (sysctl_max_syn_backlog >> 2)) &&
1371                          (!peer || !peer->tcp_ts_stamp) &&
1372                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1373                         /* Without syncookies last quarter of
1374                          * backlog is filled with destinations,
1375                          * proven to be alive.
1376                          * It means that we continue to communicate
1377                          * to destinations, already remembered
1378                          * to the moment of synflood.
1379                          */
1380                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1381                                        &saddr, ntohs(tcp_hdr(skb)->source));
1382                         goto drop_and_release;
1383                 }
1384
1385                 isn = tcp_v4_init_sequence(skb);
1386         }
1387         tcp_rsk(req)->snt_isn = isn;
1388         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1389
1390         if (tcp_v4_send_synack(sk, dst, req,
1391                                (struct request_values *)&tmp_ext) ||
1392             want_cookie)
1393                 goto drop_and_free;
1394
1395         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1396         return 0;
1397
1398 drop_and_release:
1399         dst_release(dst);
1400 drop_and_free:
1401         reqsk_free(req);
1402 drop:
1403         return 0;
1404 }
1405 EXPORT_SYMBOL(tcp_v4_conn_request);
1406
1407
1408 /*
1409  * The three way handshake has completed - we got a valid synack -
1410  * now create the new socket.
1411  */
1412 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1413                                   struct request_sock *req,
1414                                   struct dst_entry *dst)
1415 {
1416         struct inet_request_sock *ireq;
1417         struct inet_sock *newinet;
1418         struct tcp_sock *newtp;
1419         struct sock *newsk;
1420 #ifdef CONFIG_TCP_MD5SIG
1421         struct tcp_md5sig_key *key;
1422 #endif
1423         struct ip_options_rcu *inet_opt;
1424
1425         if (sk_acceptq_is_full(sk))
1426                 goto exit_overflow;
1427
1428         newsk = tcp_create_openreq_child(sk, req, skb);
1429         if (!newsk)
1430                 goto exit_nonewsk;
1431
1432         newsk->sk_gso_type = SKB_GSO_TCPV4;
1433
1434         newtp                 = tcp_sk(newsk);
1435         newinet               = inet_sk(newsk);
1436         ireq                  = inet_rsk(req);
1437         newinet->inet_daddr   = ireq->rmt_addr;
1438         newinet->inet_rcv_saddr = ireq->loc_addr;
1439         newinet->inet_saddr           = ireq->loc_addr;
1440         inet_opt              = ireq->opt;
1441         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1442         ireq->opt             = NULL;
1443         newinet->mc_index     = inet_iif(skb);
1444         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1445         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1446         if (inet_opt)
1447                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1448         newinet->inet_id = newtp->write_seq ^ jiffies;
1449
1450         if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1451                 goto put_and_exit;
1452
1453         sk_setup_caps(newsk, dst);
1454
1455         tcp_mtup_init(newsk);
1456         tcp_sync_mss(newsk, dst_mtu(dst));
1457         newtp->advmss = dst_metric_advmss(dst);
1458         if (tcp_sk(sk)->rx_opt.user_mss &&
1459             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1460                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1461
1462         tcp_initialize_rcv_mss(newsk);
1463         if (tcp_rsk(req)->snt_synack)
1464                 tcp_valid_rtt_meas(newsk,
1465                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1466         newtp->total_retrans = req->retrans;
1467
1468 #ifdef CONFIG_TCP_MD5SIG
1469         /* Copy over the MD5 key from the original socket */
1470         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1471         if (key != NULL) {
1472                 /*
1473                  * We're using one, so create a matching key
1474                  * on the newsk structure. If we fail to get
1475                  * memory, then we end up not copying the key
1476                  * across. Shucks.
1477                  */
1478                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1479                 if (newkey != NULL)
1480                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1481                                           newkey, key->keylen);
1482                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1483         }
1484 #endif
1485
1486         if (__inet_inherit_port(sk, newsk) < 0)
1487                 goto put_and_exit;
1488         __inet_hash_nolisten(newsk, NULL);
1489
1490         return newsk;
1491
1492 exit_overflow:
1493         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1494 exit_nonewsk:
1495         dst_release(dst);
1496 exit:
1497         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1498         return NULL;
1499 put_and_exit:
1500         sock_put(newsk);
1501         goto exit;
1502 }
1503 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1504
1505 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1506 {
1507         struct tcphdr *th = tcp_hdr(skb);
1508         const struct iphdr *iph = ip_hdr(skb);
1509         struct sock *nsk;
1510         struct request_sock **prev;
1511         /* Find possible connection requests. */
1512         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1513                                                        iph->saddr, iph->daddr);
1514         if (req)
1515                 return tcp_check_req(sk, skb, req, prev);
1516
1517         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1518                         th->source, iph->daddr, th->dest, inet_iif(skb));
1519
1520         if (nsk) {
1521                 if (nsk->sk_state != TCP_TIME_WAIT) {
1522                         bh_lock_sock(nsk);
1523                         return nsk;
1524                 }
1525                 inet_twsk_put(inet_twsk(nsk));
1526                 return NULL;
1527         }
1528
1529 #ifdef CONFIG_SYN_COOKIES
1530         if (!th->syn)
1531                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1532 #endif
1533         return sk;
1534 }
1535
1536 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1537 {
1538         const struct iphdr *iph = ip_hdr(skb);
1539
1540         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1541                 if (!tcp_v4_check(skb->len, iph->saddr,
1542                                   iph->daddr, skb->csum)) {
1543                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1544                         return 0;
1545                 }
1546         }
1547
1548         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1549                                        skb->len, IPPROTO_TCP, 0);
1550
1551         if (skb->len <= 76) {
1552                 return __skb_checksum_complete(skb);
1553         }
1554         return 0;
1555 }
1556
1557
1558 /* The socket must have it's spinlock held when we get
1559  * here.
1560  *
1561  * We have a potential double-lock case here, so even when
1562  * doing backlog processing we use the BH locking scheme.
1563  * This is because we cannot sleep with the original spinlock
1564  * held.
1565  */
1566 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1567 {
1568         struct sock *rsk;
1569 #ifdef CONFIG_TCP_MD5SIG
1570         /*
1571          * We really want to reject the packet as early as possible
1572          * if:
1573          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1574          *  o There is an MD5 option and we're not expecting one
1575          */
1576         if (tcp_v4_inbound_md5_hash(sk, skb))
1577                 goto discard;
1578 #endif
1579
1580         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1581                 sock_rps_save_rxhash(sk, skb);
1582                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1583                         rsk = sk;
1584                         goto reset;
1585                 }
1586                 return 0;
1587         }
1588
1589         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1590                 goto csum_err;
1591
1592         if (sk->sk_state == TCP_LISTEN) {
1593                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1594                 if (!nsk)
1595                         goto discard;
1596
1597                 if (nsk != sk) {
1598                         sock_rps_save_rxhash(nsk, skb);
1599                         if (tcp_child_process(sk, nsk, skb)) {
1600                                 rsk = nsk;
1601                                 goto reset;
1602                         }
1603                         return 0;
1604                 }
1605         } else
1606                 sock_rps_save_rxhash(sk, skb);
1607
1608         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1609                 rsk = sk;
1610                 goto reset;
1611         }
1612         return 0;
1613
1614 reset:
1615         tcp_v4_send_reset(rsk, skb);
1616 discard:
1617         kfree_skb(skb);
1618         /* Be careful here. If this function gets more complicated and
1619          * gcc suffers from register pressure on the x86, sk (in %ebx)
1620          * might be destroyed here. This current version compiles correctly,
1621          * but you have been warned.
1622          */
1623         return 0;
1624
1625 csum_err:
1626         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1627         goto discard;
1628 }
1629 EXPORT_SYMBOL(tcp_v4_do_rcv);
1630
1631 /*
1632  *      From tcp_input.c
1633  */
1634
/*
 * tcp_v4_rcv - receive one IPv4 TCP segment.
 *
 * Validates the header length fields, initialises the checksum state,
 * fills in the TCP control block and looks up the owning socket.  With
 * the socket BH-locked the segment is then handled by tcp_v4_do_rcv(),
 * handed to tcp_prequeue(), or - if the socket is owned by user context -
 * added to the backlog.  TIME_WAIT sockets are handled under the
 * do_time_wait label.  When no socket matches, a reset is sent for
 * segments that pass the length and checksum test.
 *
 * Returns the value of tcp_v4_do_rcv() when it was called directly and
 * 0 on every other path.  The skb is freed here on all discard paths.
 */
1635 int tcp_v4_rcv(struct sk_buff *skb)
1636 {
1637         const struct iphdr *iph;
1638         struct tcphdr *th;
1639         struct sock *sk;
1640         int ret;
1641         struct net *net = dev_net(skb->dev);
1642
             /* Only segments addressed to this host are processed. */
1643         if (skb->pkt_type != PACKET_HOST)
1644                 goto discard_it;
1645
1646         /* Count it even if it's bad */
1647         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1648
1649         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1650                 goto discard_it;
1651
1652         th = tcp_hdr(skb);
1653
             /* doff is in 32-bit words and must cover at least the basic header. */
1654         if (th->doff < sizeof(struct tcphdr) / 4)
1655                 goto bad_packet;
1656         if (!pskb_may_pull(skb, th->doff * 4))
1657                 goto discard_it;
1658
1659         /* An explanation is required here, I think.
1660          * Packet length and doff are validated by header prediction,
1661          * provided case of th->doff==0 is eliminated.
1662          * So, we defer the checks. */
1663         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1664                 goto bad_packet;
1665
             /* Fill in the TCP control block from the TCP and IP headers. */
1666         th = tcp_hdr(skb);
1667         iph = ip_hdr(skb);
1668         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1669         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1670                                     skb->len - th->doff * 4);
1671         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1672         TCP_SKB_CB(skb)->when    = 0;
1673         TCP_SKB_CB(skb)->flags   = iph->tos;
1674         TCP_SKB_CB(skb)->sacked  = 0;
1675
1676         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1677         if (!sk)
1678                 goto no_tcp_socket;
1679
1680 process:
1681         if (sk->sk_state == TCP_TIME_WAIT)
1682                 goto do_time_wait;
1683
             /* Drop segments whose TTL is below the socket's configured minimum. */
1684         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1685                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1686                 goto discard_and_relse;
1687         }
1688
1689         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1690                 goto discard_and_relse;
1691         nf_reset(skb);
1692
1693         if (sk_filter(sk, skb))
1694                 goto discard_and_relse;
1695
1696         skb->dev = NULL;
1697
1698         bh_lock_sock_nested(sk);
1699         ret = 0;
             /*
              * Socket not owned by user context: process the segment now
              * (or prequeue it).  Otherwise queue it on the backlog; if
              * that fails the segment is dropped and counted.
              */
1700         if (!sock_owned_by_user(sk)) {
1701 #ifdef CONFIG_NET_DMA
1702                 struct tcp_sock *tp = tcp_sk(sk);
1703                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1704                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1705                 if (tp->ucopy.dma_chan)
1706                         ret = tcp_v4_do_rcv(sk, skb);
1707                 else
1708 #endif
1709                 {
1710                         if (!tcp_prequeue(sk, skb))
1711                                 ret = tcp_v4_do_rcv(sk, skb);
1712                 }
1713         } else if (unlikely(sk_add_backlog(sk, skb))) {
1714                 bh_unlock_sock(sk);
1715                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1716                 goto discard_and_relse;
1717         }
1718         bh_unlock_sock(sk);
1719
1720         sock_put(sk);
1721
1722         return ret;
1723
1724 no_tcp_socket:
1725         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1726                 goto discard_it;
1727
             /*
              * Truncated or corrupt segments are only counted as errors
              * (bad_packet is also entered from the checks above);
              * anything else is answered with a reset.
              */
1728         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1729 bad_packet:
1730                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1731         } else {
1732                 tcp_v4_send_reset(NULL, skb);
1733         }
1734
1735 discard_it:
1736         /* Discard frame. */
1737         kfree_skb(skb);
1738         return 0;
1739
             /* Release the socket reference, then discard the frame. */
1740 discard_and_relse:
1741         sock_put(sk);
1742         goto discard_it;
1743
1744 do_time_wait:
1745         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1746                 inet_twsk_put(inet_twsk(sk));
1747                 goto discard_it;
1748         }
1749
1750         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1751                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1752                 inet_twsk_put(inet_twsk(sk));
1753                 goto discard_it;
1754         }
1755         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1756         case TCP_TW_SYN: {
                     /*
                      * If a listener matches, retire the TIME_WAIT socket
                      * and restart processing with the listener.
                      */
1757                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1758                                                         &tcp_hashinfo,
1759                                                         iph->daddr, th->dest,
1760                                                         inet_iif(skb));
1761                 if (sk2) {
1762                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1763                         inet_twsk_put(inet_twsk(sk));
1764                         sk = sk2;
1765                         goto process;
1766                 }
1767                 /* Fall through to ACK */
1768         }
1769         case TCP_TW_ACK:
1770                 tcp_v4_timewait_ack(sk, skb);
1771                 break;
1772         case TCP_TW_RST:
1773                 goto no_tcp_socket;
1774         case TCP_TW_SUCCESS:;
1775         }
1776         goto discard_it;
1777 }
1778
1779 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1780 {
1781         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1782         struct inet_sock *inet = inet_sk(sk);
1783         struct inet_peer *peer;
1784
1785         if (!rt ||
1786             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1787                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1788                 *release_it = true;
1789         } else {
1790                 if (!rt->peer)
1791                         rt_bind_peer(rt, inet->inet_daddr, 1);
1792                 peer = rt->peer;
1793                 *release_it = false;
1794         }
1795
1796         return peer;
1797 }
1798 EXPORT_SYMBOL(tcp_v4_get_peer);
1799
1800 void *tcp_v4_tw_get_peer(struct sock *sk)
1801 {
1802         struct inet_timewait_sock *tw = inet_twsk(sk);
1803
1804         return inet_getpeer_v4(tw->tw_daddr, 1);
1805 }
1806 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1807
1808 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1809         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1810         .twsk_unique    = tcp_twsk_unique,
1811         .twsk_destructor= tcp_twsk_destructor,
1812         .twsk_getpeer   = tcp_v4_tw_get_peer,
1813 };
1814
1815 const struct inet_connection_sock_af_ops ipv4_specific = {
1816         .queue_xmit        = ip_queue_xmit,
1817         .send_check        = tcp_v4_send_check,
1818         .rebuild_header    = inet_sk_rebuild_header,
1819         .conn_request      = tcp_v4_conn_request,
1820         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1821         .get_peer          = tcp_v4_get_peer,
1822         .net_header_len    = sizeof(struct iphdr),
1823         .setsockopt        = ip_setsockopt,
1824         .getsockopt        = ip_getsockopt,
1825         .addr2sockaddr     = inet_csk_addr2sockaddr,
1826         .sockaddr_len      = sizeof(struct sockaddr_in),
1827         .bind_conflict     = inet_csk_bind_conflict,
1828 #ifdef CONFIG_COMPAT
1829         .compat_setsockopt = compat_ip_setsockopt,
1830         .compat_getsockopt = compat_ip_getsockopt,
1831 #endif
1832 };
1833 EXPORT_SYMBOL(ipv4_specific);
1834
#ifdef CONFIG_TCP_MD5SIG
/*
 * MD5 signature operations for IPv4 TCP sockets.
 * tcp_v4_init_sock() installs this table as tp->af_specific.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        .md5_lookup    = tcp_v4_md5_lookup,
        .calc_md5_hash = tcp_v4_md5_hash_skb,
        .md5_add       = tcp_v4_md5_add_func,
        .md5_parse     = tcp_v4_parse_md5_keys,
};
#endif
1843
1844 /* NOTE: A lot of things set to zero explicitly by call to
1845  *       sk_alloc() so need not be done here.
1846  */
1847 static int tcp_v4_init_sock(struct sock *sk)
1848 {
1849         struct inet_connection_sock *icsk = inet_csk(sk);
1850         struct tcp_sock *tp = tcp_sk(sk);
1851
1852         skb_queue_head_init(&tp->out_of_order_queue);
1853         tcp_init_xmit_timers(sk);
1854         tcp_prequeue_init(tp);
1855
1856         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1857         tp->mdev = TCP_TIMEOUT_INIT;
1858
1859         /* So many TCP implementations out there (incorrectly) count the
1860          * initial SYN frame in their delayed-ACK and congestion control
1861          * algorithms that we must have the following bandaid to talk
1862          * efficiently to them.  -DaveM
1863          */
1864         tp->snd_cwnd = TCP_INIT_CWND;
1865
1866         /* See draft-stevens-tcpca-spec-01 for discussion of the
1867          * initialization of these values.
1868          */
1869         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1870         tp->snd_cwnd_clamp = ~0;
1871         tp->mss_cache = TCP_MSS_DEFAULT;
1872
1873         tp->reordering = sysctl_tcp_reordering;
1874         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1875
1876         sk->sk_state = TCP_CLOSE;
1877
1878         sk->sk_write_space = sk_stream_write_space;
1879         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1880
1881         icsk->icsk_af_ops = &ipv4_specific;
1882         icsk->icsk_sync_mss = tcp_sync_mss;
1883 #ifdef CONFIG_TCP_MD5SIG
1884         tp->af_specific = &tcp_sock_ipv4_specific;
1885 #endif
1886
1887         /* TCP Cookie Transactions */
1888         if (sysctl_tcp_cookie_size > 0) {
1889                 /* Default, cookies without s_data_payload. */
1890                 tp->cookie_values =
1891                         kzalloc(sizeof(*tp->cookie_values),
1892                                 sk->sk_allocation);
1893                 if (tp->cookie_values != NULL)
1894                         kref_init(&tp->cookie_values->kref);
1895         }
1896         /* Presumed zeroed, in order of appearance:
1897          *      cookie_in_always, cookie_out_never,
1898          *      s_data_constant, s_data_in, s_data_out
1899          */
1900         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1901         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1902
1903         local_bh_disable();
1904         percpu_counter_inc(&tcp_sockets_allocated);
1905         local_bh_enable();
1906
1907         return 0;
1908 }
1909
1910 void tcp_v4_destroy_sock(struct sock *sk)
1911 {
1912         struct tcp_sock *tp = tcp_sk(sk);
1913
1914         tcp_clear_xmit_timers(sk);
1915
1916         tcp_cleanup_congestion_control(sk);
1917
1918         /* Cleanup up the write buffer. */
1919         tcp_write_queue_purge(sk);
1920
1921         /* Cleans up our, hopefully empty, out_of_order_queue. */
1922         __skb_queue_purge(&tp->out_of_order_queue);
1923
1924 #ifdef CONFIG_TCP_MD5SIG
1925         /* Clean up the MD5 key list, if any */
1926         if (tp->md5sig_info) {
1927                 tcp_v4_clear_md5_list(sk);
1928                 kfree(tp->md5sig_info);
1929                 tp->md5sig_info = NULL;
1930         }
1931 #endif
1932
1933 #ifdef CONFIG_NET_DMA
1934         /* Cleans up our sk_async_wait_queue */
1935         __skb_queue_purge(&sk->sk_async_wait_queue);
1936 #endif
1937
1938         /* Clean prequeue, it must be empty really */
1939         __skb_queue_purge(&tp->ucopy.prequeue);
1940
1941         /* Clean up a referenced TCP bind bucket. */
1942         if (inet_csk(sk)->icsk_bind_hash)
1943                 inet_put_port(sk);
1944
1945         /*
1946          * If sendmsg cached page exists, toss it.
1947          */
1948         if (sk->sk_sndmsg_page) {
1949                 __free_page(sk->sk_sndmsg_page);
1950                 sk->sk_sndmsg_page = NULL;
1951         }
1952
1953         /* TCP Cookie Transactions */
1954         if (tp->cookie_values != NULL) {
1955                 kref_put(&tp->cookie_values->kref,
1956                          tcp_cookie_values_release);
1957                 tp->cookie_values = NULL;
1958         }
1959
1960         percpu_counter_dec(&tcp_sockets_allocated);
1961 }
1962 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1963
1964 #ifdef CONFIG_PROC_FS
1965 /* Proc filesystem TCP sock list dumping. */
1966
1967 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1968 {
1969         return hlist_nulls_empty(head) ? NULL :
1970                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1971 }
1972
1973 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1974 {
1975         return !is_a_nulls(tw->tw_node.next) ?
1976                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1977 }
1978
1979 /*
1980  * Get next listener socket following cur.  If cur is NULL, get first socket
1981  * starting from bucket given in st->bucket; when st->bucket is zero the
1982  * very first socket in the hash table is returned.
1983  */
/*
 * When st->state is TCP_SEQ_STATE_OPENREQ, cur is treated as a request
 * sock taken from the SYN table of st->syn_wait_sk; otherwise it is
 * treated as a listening socket.
 *
 * On a non-NULL return the lock of listening bucket st->bucket is held.
 * If a request sock is returned, the syn_wait_lock of st->syn_wait_sk is
 * read-held as well.  NULL is returned, with no lock held, once every
 * bucket has been walked.
 */
1984 static void *listening_get_next(struct seq_file *seq, void *cur)
1985 {
1986         struct inet_connection_sock *icsk;
1987         struct hlist_nulls_node *node;
1988         struct sock *sk = cur;
1989         struct inet_listen_hashbucket *ilb;
1990         struct tcp_iter_state *st = seq->private;
1991         struct net *net = seq_file_net(seq);
1992
             /* Fresh start: lock bucket st->bucket and begin at its head. */
1993         if (!sk) {
1994                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1995                 spin_lock_bh(&ilb->lock);
1996                 sk = sk_nulls_head(&ilb->head);
1997                 st->offset = 0;
1998                 goto get_sk;
1999         }
2000         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2001         ++st->num;
2002         ++st->offset;
2003
2004         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* Continue walking the SYN table of st->syn_wait_sk. */
2005                 struct request_sock *req = cur;
2006
2007                 icsk = inet_csk(st->syn_wait_sk);
2008                 req = req->dl_next;
2009                 while (1) {
2010                         while (req) {
2011                                 if (req->rsk_ops->family == st->family) {
2012                                         cur = req;
2013                                         goto out;
2014                                 }
2015                                 req = req->dl_next;
2016                         }
2017                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2018                                 break;
                             /* Also entered from start_req with st->sbucket == 0. */
2019 get_req:
2020                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2021                 }
                     /* SYN table exhausted: resume with the next listening socket. */
2022                 sk        = sk_nulls_next(st->syn_wait_sk);
2023                 st->state = TCP_SEQ_STATE_LISTENING;
2024                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2025         } else {
                     /*
                      * cur was a listening socket: walk its pending requests,
                      * if it has any, before moving on to the next socket.
                      */
2026                 icsk = inet_csk(sk);
2027                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2028                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2029                         goto start_req;
2030                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2031                 sk = sk_nulls_next(sk);
2032         }
2033 get_sk:
2034         sk_nulls_for_each_from(sk, node) {
2035                 if (!net_eq(sock_net(sk), net))
2036                         continue;
2037                 if (sk->sk_family == st->family) {
2038                         cur = sk;
2039                         goto out;
2040                 }
                     /*
                      * Socket of another family: its request queue is still
                      * walked, where requests are filtered by family.
                      */
2041                 icsk = inet_csk(sk);
2042                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2043                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2044 start_req:
2045                         st->uid         = sock_i_uid(sk);
2046                         st->syn_wait_sk = sk;
2047                         st->state       = TCP_SEQ_STATE_OPENREQ;
2048                         st->sbucket     = 0;
2049                         goto get_req;
2050                 }
2051                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2052         }
             /* Bucket exhausted: unlock it and move on to the next one. */
2053         spin_unlock_bh(&ilb->lock);
2054         st->offset = 0;
2055         if (++st->bucket < INET_LHTABLE_SIZE) {
2056                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2057                 spin_lock_bh(&ilb->lock);
2058                 sk = sk_nulls_head(&ilb->head);
2059                 goto get_sk;
2060         }
2061         cur = NULL;
2062 out:
2063         return cur;
2064 }
2065
2066 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2067 {
2068         struct tcp_iter_state *st = seq->private;
2069         void *rc;
2070
2071         st->bucket = 0;
2072         st->offset = 0;
2073         rc = listening_get_next(seq, NULL);
2074
2075         while (rc && *pos) {
2076                 rc = listening_get_next(seq, rc);
2077                 --*pos;
2078         }
2079         return rc;
2080 }
2081
2082 static inline int empty_bucket(struct tcp_iter_state *st)
2083 {
2084         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2085                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2086 }
2087
2088 /*
2089  * Get first established socket starting from bucket given in st->bucket.
2090  * If st->bucket is zero, the very first socket in the hash is returned.
2091  */
2092 static void *established_get_first(struct seq_file *seq)
2093 {
2094         struct tcp_iter_state *st = seq->private;
2095         struct net *net = seq_file_net(seq);
2096         void *rc = NULL;
2097
2098         st->offset = 0;
2099         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2100                 struct sock *sk;
2101                 struct hlist_nulls_node *node;
2102                 struct inet_timewait_sock *tw;
2103                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2104
2105                 /* Lockless fast path for the common case of empty buckets */
2106                 if (empty_bucket(st))
2107                         continue;
2108
2109                 spin_lock_bh(lock);
2110                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2111                         if (sk->sk_family != st->family ||
2112                             !net_eq(sock_net(sk), net)) {
2113                                 continue;
2114                         }
2115                         rc = sk;
2116                         goto out;
2117                 }
2118                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2119                 inet_twsk_for_each(tw, node,
2120                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2121                         if (tw->tw_family != st->family ||
2122                             !net_eq(twsk_net(tw), net)) {
2123                                 continue;
2124                         }
2125                         rc = tw;
2126                         goto out;
2127                 }
2128                 spin_unlock_bh(lock);
2129                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2130         }
2131 out:
2132         return rc;
2133 }
2134
/*
 * Advance the established/TIME_WAIT iterator past cur.
 *
 * cur is a TIME_WAIT socket when st->state is TCP_SEQ_STATE_TIME_WAIT and
 * an established-chain socket otherwise.  Within a bucket the established
 * chain is walked first, then the TIME_WAIT chain; after that the bucket
 * lock is dropped and the next non-empty bucket is locked.
 *
 * A non-NULL return leaves the lock of bucket st->bucket held.  NULL is
 * returned, with no lock held, once the last bucket has been passed.
 */
2135 static void *established_get_next(struct seq_file *seq, void *cur)
2136 {
2137         struct sock *sk = cur;
2138         struct inet_timewait_sock *tw;
2139         struct hlist_nulls_node *node;
2140         struct tcp_iter_state *st = seq->private;
2141         struct net *net = seq_file_net(seq);
2142
2143         ++st->num;
2144         ++st->offset;
2145
2146         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2147                 tw = cur;
2148                 tw = tw_next(tw);
                     /* Also entered from below with the head of the twchain. */
2149 get_tw:
2150                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2151                         tw = tw_next(tw);
2152                 }
2153                 if (tw) {
2154                         cur = tw;
2155                         goto out;
2156                 }
                     /* TIME_WAIT chain exhausted: this bucket is finished. */
2157                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2158                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2159
2160                 /* Look for next non empty bucket */
2161                 st->offset = 0;
2162                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2163                                 empty_bucket(st))
2164                         ;
2165                 if (st->bucket > tcp_hashinfo.ehash_mask)
2166                         return NULL;
2167
2168                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2169                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2170         } else
2171                 sk = sk_nulls_next(sk);
2172
2173         sk_nulls_for_each_from(sk, node) {
2174                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2175                         goto found;
2176         }
2177
             /* Established chain exhausted: continue with the TIME_WAIT chain. */
2178         st->state = TCP_SEQ_STATE_TIME_WAIT;
2179         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2180         goto get_tw;
2181 found:
2182         cur = sk;
2183 out:
2184         return cur;
2185 }
2186
2187 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2188 {
2189         struct tcp_iter_state *st = seq->private;
2190         void *rc;
2191
2192         st->bucket = 0;
2193         rc = established_get_first(seq);
2194
2195         while (rc && pos) {
2196                 rc = established_get_next(seq, rc);
2197                 --pos;
2198         }
2199         return rc;
2200 }
2201
2202 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2203 {
2204         void *rc;
2205         struct tcp_iter_state *st = seq->private;
2206
2207         st->state = TCP_SEQ_STATE_LISTENING;
2208         rc        = listening_get_idx(seq, &pos);
2209
2210         if (!rc) {
2211                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2212                 rc        = established_get_idx(seq, pos);
2213         }
2214
2215         return rc;
2216 }
2217
2218 static void *tcp_seek_last_pos(struct seq_file *seq)
2219 {
2220         struct tcp_iter_state *st = seq->private;
2221         int offset = st->offset;
2222         int orig_num = st->num;
2223         void *rc = NULL;
2224
2225         switch (st->state) {
2226         case TCP_SEQ_STATE_OPENREQ:
2227         case TCP_SEQ_STATE_LISTENING:
2228                 if (st->bucket >= INET_LHTABLE_SIZE)
2229                         break;
2230                 st->state = TCP_SEQ_STATE_LISTENING;
2231                 rc = listening_get_next(seq, NULL);
2232                 while (offset-- && rc)
2233                         rc = listening_get_next(seq, rc);
2234                 if (rc)
2235                         break;
2236                 st->bucket = 0;
2237                 /* Fallthrough */
2238         case TCP_SEQ_STATE_ESTABLISHED:
2239         case TCP_SEQ_STATE_TIME_WAIT:
2240                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2241                 if (st->bucket > tcp_hashinfo.ehash_mask)
2242                         break;
2243                 rc = established_get_first(seq);
2244                 while (offset-- && rc)
2245                         rc = established_get_next(seq, rc);
2246         }
2247
2248         st->num = orig_num;
2249
2250         return rc;
2251 }
2252
2253 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2254 {
2255         struct tcp_iter_state *st = seq->private;
2256         void *rc;
2257
2258         if (*pos && *pos == st->last_pos) {
2259                 rc = tcp_seek_last_pos(seq);
2260                 if (rc)
2261                         goto out;
2262         }
2263
2264         st->state = TCP_SEQ_STATE_LISTENING;
2265         st->num = 0;
2266         st->bucket = 0;
2267         st->offset = 0;
2268         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2269
2270 out:
2271         st->last_pos = *pos;
2272         return rc;
2273 }
2274
2275 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2276 {
2277         struct tcp_iter_state *st = seq->private;
2278         void *rc = NULL;
2279
2280         if (v == SEQ_START_TOKEN) {
2281                 rc = tcp_get_idx(seq, 0);
2282                 goto out;
2283         }
2284
2285         switch (st->state) {
2286         case TCP_SEQ_STATE_OPENREQ:
2287         case TCP_SEQ_STATE_LISTENING:
2288                 rc = listening_get_next(seq, v);
2289                 if (!rc) {
2290                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2291                         st->bucket = 0;
2292                         st->offset = 0;
2293                         rc        = established_get_first(seq);
2294                 }
2295                 break;
2296         case TCP_SEQ_STATE_ESTABLISHED:
2297         case TCP_SEQ_STATE_TIME_WAIT:
2298                 rc = established_get_next(seq, v);
2299                 break;
2300         }
2301 out:
2302         ++*pos;
2303         st->last_pos = *pos;
2304         return rc;
2305 }
2306
2307 static void tcp_seq_stop(struct seq_file *seq, void *v)
2308 {
2309         struct tcp_iter_state *st = seq->private;
2310
2311         switch (st->state) {
2312         case TCP_SEQ_STATE_OPENREQ:
2313                 if (v) {
2314                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2315                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2316                 }
2317         case TCP_SEQ_STATE_LISTENING:
2318                 if (v != SEQ_START_TOKEN)
2319                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2320                 break;
2321         case TCP_SEQ_STATE_TIME_WAIT:
2322         case TCP_SEQ_STATE_ESTABLISHED:
2323                 if (v)
2324                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2325                 break;
2326         }
2327 }
2328
2329 static int tcp_seq_open(struct inode *inode, struct file *file)
2330 {
2331         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2332         struct tcp_iter_state *s;
2333         int err;
2334
2335         err = seq_open_net(inode, file, &afinfo->seq_ops,
2336                           sizeof(struct tcp_iter_state));
2337         if (err < 0)
2338                 return err;
2339
2340         s = ((struct seq_file *)file->private_data)->private;
2341         s->family               = afinfo->family;
2342         s->last_pos             = 0;
2343         return 0;
2344 }
2345
2346 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2347 {
2348         int rc = 0;
2349         struct proc_dir_entry *p;
2350
2351         afinfo->seq_fops.open           = tcp_seq_open;
2352         afinfo->seq_fops.read           = seq_read;
2353         afinfo->seq_fops.llseek         = seq_lseek;
2354         afinfo->seq_fops.release        = seq_release_net;
2355
2356         afinfo->seq_ops.start           = tcp_seq_start;
2357         afinfo->seq_ops.next            = tcp_seq_next;
2358         afinfo->seq_ops.stop            = tcp_seq_stop;
2359
2360         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2361                              &afinfo->seq_fops, afinfo);
2362         if (!p)
2363                 rc = -ENOMEM;
2364         return rc;
2365 }
2366 EXPORT_SYMBOL(tcp_proc_register);
2367
2368 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2369 {
2370         proc_net_remove(net, afinfo->name);
2371 }
2372 EXPORT_SYMBOL(tcp_proc_unregister);
2373
2374 static void get_openreq4(struct sock *sk, struct request_sock *req,
2375                          struct seq_file *f, int i, int uid, int *len)
2376 {
2377         const struct inet_request_sock *ireq = inet_rsk(req);
2378         int ttd = req->expires - jiffies;
2379
2380         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2381                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2382                 i,
2383                 ireq->loc_addr,
2384                 ntohs(inet_sk(sk)->inet_sport),
2385                 ireq->rmt_addr,
2386                 ntohs(ireq->rmt_port),
2387                 TCP_SYN_RECV,
2388                 0, 0, /* could print option size, but that is af dependent. */
2389                 1,    /* timers active (only the expire timer) */
2390                 jiffies_to_clock_t(ttd),
2391                 req->retrans,
2392                 uid,
2393                 0,  /* non standard timer */
2394                 0, /* open_requests have no inode */
2395                 atomic_read(&sk->sk_refcnt),
2396                 req,
2397                 len);
2398 }
2399
2400 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2401 {
2402         int timer_active;
2403         unsigned long timer_expires;
2404         struct tcp_sock *tp = tcp_sk(sk);
2405         const struct inet_connection_sock *icsk = inet_csk(sk);
2406         struct inet_sock *inet = inet_sk(sk);
2407         __be32 dest = inet->inet_daddr;
2408         __be32 src = inet->inet_rcv_saddr;
2409         __u16 destp = ntohs(inet->inet_dport);
2410         __u16 srcp = ntohs(inet->inet_sport);
2411         int rx_queue;
2412
2413         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2414                 timer_active    = 1;
2415                 timer_expires   = icsk->icsk_timeout;
2416         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2417                 timer_active    = 4;
2418                 timer_expires   = icsk->icsk_timeout;
2419         } else if (timer_pending(&sk->sk_timer)) {
2420                 timer_active    = 2;
2421                 timer_expires   = sk->sk_timer.expires;
2422         } else {
2423                 timer_active    = 0;
2424                 timer_expires = jiffies;
2425         }
2426
2427         if (sk->sk_state == TCP_LISTEN)
2428                 rx_queue = sk->sk_ack_backlog;
2429         else
2430                 /*
2431                  * because we dont lock socket, we might find a transient negative value
2432                  */
2433                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2434
2435         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2436                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2437                 i, src, srcp, dest, destp, sk->sk_state,
2438                 tp->write_seq - tp->snd_una,
2439                 rx_queue,
2440                 timer_active,
2441                 jiffies_to_clock_t(timer_expires - jiffies),
2442                 icsk->icsk_retransmits,
2443                 sock_i_uid(sk),
2444                 icsk->icsk_probes_out,
2445                 sock_i_ino(sk),
2446                 atomic_read(&sk->sk_refcnt), sk,
2447                 jiffies_to_clock_t(icsk->icsk_rto),
2448                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2449                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2450                 tp->snd_cwnd,
2451                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2452                 len);
2453 }
2454
2455 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2456                                struct seq_file *f, int i, int *len)
2457 {
2458         __be32 dest, src;
2459         __u16 destp, srcp;
2460         int ttd = tw->tw_ttd - jiffies;
2461
2462         if (ttd < 0)
2463                 ttd = 0;
2464
2465         dest  = tw->tw_daddr;
2466         src   = tw->tw_rcv_saddr;
2467         destp = ntohs(tw->tw_dport);
2468         srcp  = ntohs(tw->tw_sport);
2469
2470         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2471                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2472                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2473                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2474                 atomic_read(&tw->tw_refcnt), tw, len);
2475 }
2476
2477 #define TMPSZ 150
2478
2479 static int tcp4_seq_show(struct seq_file *seq, void *v)
2480 {
2481         struct tcp_iter_state *st;
2482         int len;
2483
2484         if (v == SEQ_START_TOKEN) {
2485                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2486                            "  sl  local_address rem_address   st tx_queue "
2487                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2488                            "inode");
2489                 goto out;
2490         }
2491         st = seq->private;
2492
2493         switch (st->state) {
2494         case TCP_SEQ_STATE_LISTENING:
2495         case TCP_SEQ_STATE_ESTABLISHED:
2496                 get_tcp4_sock(v, seq, st->num, &len);
2497                 break;
2498         case TCP_SEQ_STATE_OPENREQ:
2499                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2500                 break;
2501         case TCP_SEQ_STATE_TIME_WAIT:
2502                 get_timewait4_sock(v, seq, st->num, &len);
2503                 break;
2504         }
2505         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2506 out:
2507         return 0;
2508 }
2509
2510 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2511         .name           = "tcp",
2512         .family         = AF_INET,
2513         .seq_fops       = {
2514                 .owner          = THIS_MODULE,
2515         },
2516         .seq_ops        = {
2517                 .show           = tcp4_seq_show,
2518         },
2519 };
2520
2521 static int __net_init tcp4_proc_init_net(struct net *net)
2522 {
2523         return tcp_proc_register(net, &tcp4_seq_afinfo);
2524 }
2525
2526 static void __net_exit tcp4_proc_exit_net(struct net *net)
2527 {
2528         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2529 }
2530
2531 static struct pernet_operations tcp4_net_ops = {
2532         .init = tcp4_proc_init_net,
2533         .exit = tcp4_proc_exit_net,
2534 };
2535
2536 int __init tcp4_proc_init(void)
2537 {
2538         return register_pernet_subsys(&tcp4_net_ops);
2539 }
2540
2541 void tcp4_proc_exit(void)
2542 {
2543         unregister_pernet_subsys(&tcp4_net_ops);
2544 }
2545 #endif /* CONFIG_PROC_FS */
2546
2547 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2548 {
2549         const struct iphdr *iph = skb_gro_network_header(skb);
2550
2551         switch (skb->ip_summed) {
2552         case CHECKSUM_COMPLETE:
2553                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2554                                   skb->csum)) {
2555                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2556                         break;
2557                 }
2558
2559                 /* fall through */
2560         case CHECKSUM_NONE:
2561                 NAPI_GRO_CB(skb)->flush = 1;
2562                 return NULL;
2563         }
2564
2565         return tcp_gro_receive(head, skb);
2566 }
2567
2568 int tcp4_gro_complete(struct sk_buff *skb)
2569 {
2570         const struct iphdr *iph = ip_hdr(skb);
2571         struct tcphdr *th = tcp_hdr(skb);
2572
2573         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2574                                   iph->saddr, iph->daddr, 0);
2575         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2576
2577         return tcp_gro_complete(skb);
2578 }
2579
2580 struct proto tcp_prot = {
2581         .name                   = "TCP",
2582         .owner                  = THIS_MODULE,
2583         .close                  = tcp_close,
2584         .connect                = tcp_v4_connect,
2585         .disconnect             = tcp_disconnect,
2586         .accept                 = inet_csk_accept,
2587         .ioctl                  = tcp_ioctl,
2588         .init                   = tcp_v4_init_sock,
2589         .destroy                = tcp_v4_destroy_sock,
2590         .shutdown               = tcp_shutdown,
2591         .setsockopt             = tcp_setsockopt,
2592         .getsockopt             = tcp_getsockopt,
2593         .recvmsg                = tcp_recvmsg,
2594         .sendmsg                = tcp_sendmsg,
2595         .sendpage               = tcp_sendpage,
2596         .backlog_rcv            = tcp_v4_do_rcv,
2597         .hash                   = inet_hash,
2598         .unhash                 = inet_unhash,
2599         .get_port               = inet_csk_get_port,
2600         .enter_memory_pressure  = tcp_enter_memory_pressure,
2601         .sockets_allocated      = &tcp_sockets_allocated,
2602         .orphan_count           = &tcp_orphan_count,
2603         .memory_allocated       = &tcp_memory_allocated,
2604         .memory_pressure        = &tcp_memory_pressure,
2605         .sysctl_mem             = sysctl_tcp_mem,
2606         .sysctl_wmem            = sysctl_tcp_wmem,
2607         .sysctl_rmem            = sysctl_tcp_rmem,
2608         .max_header             = MAX_TCP_HEADER,
2609         .obj_size               = sizeof(struct tcp_sock),
2610         .slab_flags             = SLAB_DESTROY_BY_RCU,
2611         .twsk_prot              = &tcp_timewait_sock_ops,
2612         .rsk_prot               = &tcp_request_sock_ops,
2613         .h.hashinfo             = &tcp_hashinfo,
2614         .no_autobind            = true,
2615 #ifdef CONFIG_COMPAT
2616         .compat_setsockopt      = compat_tcp_setsockopt,
2617         .compat_getsockopt      = compat_tcp_getsockopt,
2618 #endif
2619 };
2620 EXPORT_SYMBOL(tcp_prot);
2621
2622
2623 static int __net_init tcp_sk_init(struct net *net)
2624 {
2625         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2626                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2627 }
2628
2629 static void __net_exit tcp_sk_exit(struct net *net)
2630 {
2631         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2632 }
2633
2634 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2635 {
2636         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2637 }
2638
2639 static struct pernet_operations __net_initdata tcp_sk_ops = {
2640        .init       = tcp_sk_init,
2641        .exit       = tcp_sk_exit,
2642        .exit_batch = tcp_sk_exit_batch,
2643 };
2644
2645 void __init tcp_v4_init(void)
2646 {
2647         inet_hashinfo_init(&tcp_hashinfo);
2648         if (register_pernet_subsys(&tcp_sk_ops))
2649                 panic("Failed to create the TCP control socket.\n");
2650 }