net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63 #include <linux/slab.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75 #include <net/secure_seq.h>
  76
  77 #include <linux/inet.h>
  78 #include <linux/ipv6.h>
  79 #include <linux/stddef.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/seq_file.h>
  82
  83 #include <linux/crypto.h>
  84 #include <linux/scatterlist.h>
  85
  86 int sysctl_tcp_tw_reuse __read_mostly;
  87 int sysctl_tcp_low_latency __read_mostly;
  88 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  89
  90
  91 #ifdef CONFIG_TCP_MD5SIG
  92 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  93                                                    __be32 addr);
  94 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  95                                __be32 daddr, __be32 saddr, struct tcphdr *th);
  96 #else
  97 static inline
  98 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  99 {
 100         return NULL;
 101 }
 102 #endif
 103
 104 struct inet_hashinfo tcp_hashinfo;
 105 EXPORT_SYMBOL(tcp_hashinfo);
 106
 107 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 108 {
 109         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 110                                           ip_hdr(skb)->saddr,
 111                                           tcp_hdr(skb)->dest,
 112                                           tcp_hdr(skb)->source);
 113 }
 114
 115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 116 {
 117         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 118         struct tcp_sock *tp = tcp_sk(sk);
 119
 120         /* With PAWS, it is safe from the viewpoint
 121            of data integrity. Even without PAWS it is safe provided sequence
 122            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 123
 124            Actually, the idea is close to VJ's one, only timestamp cache is
 125            held not per host, but per port pair and TW bucket is used as state
 126            holder.
 127
 128            If TW bucket has been already destroyed we fall back to VJ's scheme
 129            and use initial timestamp retrieved from peer table.
 130          */
 131         if (tcptw->tw_ts_recent_stamp &&
 132             (twp == NULL || (sysctl_tcp_tw_reuse &&
 133                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 134                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 135                 if (tp->write_seq == 0)
 136                         tp->write_seq = 1;
 137                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 138                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 139                 sock_hold(sktw);
 140                 return 1;
 141         }
 142
 143         return 0;
 144 }
 145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 146
 147 /* This will initiate an outgoing connection. */
 148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 149 {
 150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 151         struct inet_sock *inet = inet_sk(sk);
 152         struct tcp_sock *tp = tcp_sk(sk);
 153         __be16 orig_sport, orig_dport;
 154         __be32 daddr, nexthop;
 155         struct flowi4 *fl4;
 156         struct rtable *rt;
 157         int err;
 158         struct ip_options_rcu *inet_opt;
 159
 160         if (addr_len < sizeof(struct sockaddr_in))
 161                 return -EINVAL;
 162
 163         if (usin->sin_family != AF_INET)
 164                 return -EAFNOSUPPORT;
 165
 166         nexthop = daddr = usin->sin_addr.s_addr;
 167         inet_opt = rcu_dereference_protected(inet->inet_opt,
 168                                              sock_owned_by_user(sk));
 169         if (inet_opt && inet_opt->opt.srr) {
 170                 if (!daddr)
 171                         return -EINVAL;
 172                 nexthop = inet_opt->opt.faddr;
 173         }
 174
 175         orig_sport = inet->inet_sport;
 176         orig_dport = usin->sin_port;
 177         fl4 = &inet->cork.fl.u.ip4;
 178         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 179                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 180                               IPPROTO_TCP,
 181                               orig_sport, orig_dport, sk, true);
 182         if (IS_ERR(rt)) {
 183                 err = PTR_ERR(rt);
 184                 if (err == -ENETUNREACH)
 185                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 186                 return err;
 187         }
 188
 189         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 190                 ip_rt_put(rt);
 191                 return -ENETUNREACH;
 192         }
 193
 194         if (!inet_opt || !inet_opt->opt.srr)
 195                 daddr = fl4->daddr;
 196
 197         if (!inet->inet_saddr)
 198                 inet->inet_saddr = fl4->saddr;
 199         inet->inet_rcv_saddr = inet->inet_saddr;
 200
 201         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 202                 /* Reset inherited state */
 203                 tp->rx_opt.ts_recent       = 0;
 204                 tp->rx_opt.ts_recent_stamp = 0;
 205                 tp->write_seq              = 0;
 206         }
 207
 208         if (tcp_death_row.sysctl_tw_recycle &&
 209             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
 210                 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
 211                 /*
 212                  * VJ's idea. We save last timestamp seen from
 213                  * the destination in peer table, when entering state
 214                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 215                  * when trying new connection.
 216                  */
 217                 if (peer) {
 218                         inet_peer_refcheck(peer);
 219                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 220                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 221                                 tp->rx_opt.ts_recent = peer->tcp_ts;
 222                         }
 223                 }
 224         }
 225
 226         inet->inet_dport = usin->sin_port;
 227         inet->inet_daddr = daddr;
 228
 229         inet_csk(sk)->icsk_ext_hdr_len = 0;
 230         if (inet_opt)
 231                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 232
 233         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 234
 235         /* Socket identity is still unknown (sport may be zero).
 236          * However we set state to SYN-SENT and not releasing socket
 237          * lock select source port, enter ourselves into the hash tables and
 238          * complete initialization after this.
 239          */
 240         tcp_set_state(sk, TCP_SYN_SENT);
 241         err = inet_hash_connect(&tcp_death_row, sk);
 242         if (err)
 243                 goto failure;
 244
 245         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 246                                inet->inet_sport, inet->inet_dport, sk);
 247         if (IS_ERR(rt)) {
 248                 err = PTR_ERR(rt);
 249                 rt = NULL;
 250                 goto failure;
 251         }
 252         /* OK, now commit destination to socket.  */
 253         sk->sk_gso_type = SKB_GSO_TCPV4;
 254         sk_setup_caps(sk, &rt->dst);
 255
 256         if (!tp->write_seq)
 257                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 258                                                            inet->inet_daddr,
 259                                                            inet->inet_sport,
 260                                                            usin->sin_port);
 261
 262         inet->inet_id = tp->write_seq ^ jiffies;
 263
 264         err = tcp_connect(sk);
 265         rt = NULL;
 266         if (err)
 267                 goto failure;
 268
 269         return 0;
 270
 271 failure:
 272         /*
 273          * This unhashes the socket and releases the local port,
 274          * if necessary.
 275          */
 276         tcp_set_state(sk, TCP_CLOSE);
 277         ip_rt_put(rt);
 278         sk->sk_route_caps = 0;
 279         inet->inet_dport = 0;
 280         return err;
 281 }
 282 EXPORT_SYMBOL(tcp_v4_connect);
 283
 284 /*
 285  * This routine does path mtu discovery as defined in RFC1191.
 286  */
 287 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 288 {
 289         struct dst_entry *dst;
 290         struct inet_sock *inet = inet_sk(sk);
 291
 292         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 293          * send out by Linux are always <576bytes so they should go through
 294          * unfragmented).
 295          */
 296         if (sk->sk_state == TCP_LISTEN)
 297                 return;
 298
 299         /* We don't check in the destentry if pmtu discovery is forbidden
 300          * on this route. We just assume that no packet_to_big packets
 301          * are send back when pmtu discovery is not active.
 302          * There is a small race when the user changes this flag in the
 303          * route, but I think that's acceptable.
 304          */
 305         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 306                 return;
 307
 308         dst->ops->update_pmtu(dst, mtu);
 309
 310         /* Something is about to be wrong... Remember soft error
 311          * for the case, if this connection will not able to recover.
 312          */
 313         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 314                 sk->sk_err_soft = EMSGSIZE;
 315
 316         mtu = dst_mtu(dst);
 317
 318         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 319             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 320                 tcp_sync_mss(sk, mtu);
 321
 322                 /* Resend the TCP packet because it's
 323                  * clear that the old packet has been
 324                  * dropped. This is the new "fast" path mtu
 325                  * discovery.
 326                  */
 327                 tcp_simple_retransmit(sk);
 328         } /* else let the usual retransmit timer handle it */
 329 }
 330
 331 /*
 332  * This routine is called by the ICMP module when it gets some
 333  * sort of error condition.  If err < 0 then the socket should
 334  * be closed and the error returned to the user.  If err > 0
 335  * it's just the icmp type << 8 | icmp code.  After adjustment
 336  * header points to the first 8 bytes of the tcp header.  We need
 337  * to find the appropriate port.
 338  *
 339  * The locking strategy used here is very "optimistic". When
 340  * someone else accesses the socket the ICMP is just dropped
 341  * and for some paths there is no check at all.
 342  * A more general error queue to queue errors for later handling
 343  * is probably better.
 344  *
 345  */
 346
 347 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 348 {
 349         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 350         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 351         struct inet_connection_sock *icsk;
 352         struct tcp_sock *tp;
 353         struct inet_sock *inet;
 354         const int type = icmp_hdr(icmp_skb)->type;
 355         const int code = icmp_hdr(icmp_skb)->code;
 356         struct sock *sk;
 357         struct sk_buff *skb;
 358         __u32 seq;
 359         __u32 remaining;
 360         int err;
 361         struct net *net = dev_net(icmp_skb->dev);
 362
 363         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 364                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 365                 return;
 366         }
 367
 368         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 369                         iph->saddr, th->source, inet_iif(icmp_skb));
 370         if (!sk) {
 371                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 372                 return;
 373         }
 374         if (sk->sk_state == TCP_TIME_WAIT) {
 375                 inet_twsk_put(inet_twsk(sk));
 376                 return;
 377         }
 378
 379         bh_lock_sock(sk);
 380         /* If too many ICMPs get dropped on busy
 381          * servers this needs to be solved differently.
 382          */
 383         if (sock_owned_by_user(sk))
 384                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 385
 386         if (sk->sk_state == TCP_CLOSE)
 387                 goto out;
 388
 389         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 390                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 391                 goto out;
 392         }
 393
 394         icsk = inet_csk(sk);
 395         tp = tcp_sk(sk);
 396         seq = ntohl(th->seq);
 397         if (sk->sk_state != TCP_LISTEN &&
 398             !between(seq, tp->snd_una, tp->snd_nxt)) {
 399                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 400                 goto out;
 401         }
 402
 403         switch (type) {
 404         case ICMP_SOURCE_QUENCH:
 405                 /* Just silently ignore these. */
 406                 goto out;
 407         case ICMP_PARAMETERPROB:
 408                 err = EPROTO;
 409                 break;
 410         case ICMP_DEST_UNREACH:
 411                 if (code > NR_ICMP_UNREACH)
 412                         goto out;
 413
 414                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 415                         if (!sock_owned_by_user(sk))
 416                                 do_pmtu_discovery(sk, iph, info);
 417                         goto out;
 418                 }
 419
 420                 err = icmp_err_convert[code].errno;
 421                 /* check if icmp_skb allows revert of backoff
 422                  * (see draft-zimmermann-tcp-lcd) */
 423                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 424                         break;
 425                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 426                     !icsk->icsk_backoff)
 427                         break;
 428
 429                 if (sock_owned_by_user(sk))
 430                         break;
 431
 432                 icsk->icsk_backoff--;
 433                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
 434                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 435                 tcp_bound_rto(sk);
 436
 437                 skb = tcp_write_queue_head(sk);
 438                 BUG_ON(!skb);
 439
 440                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 441                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 442
 443                 if (remaining) {
 444                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 445                                                   remaining, TCP_RTO_MAX);
 446                 } else {
 447                         /* RTO revert clocked out retransmission.
 448                          * Will retransmit now */
 449                         tcp_retransmit_timer(sk);
 450                 }
 451
 452                 break;
 453         case ICMP_TIME_EXCEEDED:
 454                 err = EHOSTUNREACH;
 455                 break;
 456         default:
 457                 goto out;
 458         }
 459
 460         switch (sk->sk_state) {
 461                 struct request_sock *req, **prev;
 462         case TCP_LISTEN:
 463                 if (sock_owned_by_user(sk))
 464                         goto out;
 465
 466                 req = inet_csk_search_req(sk, &prev, th->dest,
 467                                           iph->daddr, iph->saddr);
 468                 if (!req)
 469                         goto out;
 470
 471                 /* ICMPs are not backlogged, hence we cannot get
 472                    an established socket here.
 473                  */
 474                 WARN_ON(req->sk);
 475
 476                 if (seq != tcp_rsk(req)->snt_isn) {
 477                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 478                         goto out;
 479                 }
 480
 481                 /*
 482                  * Still in SYN_RECV, just remove it silently.
 483                  * There is no good way to pass the error to the newly
 484                  * created socket, and POSIX does not want network
 485                  * errors returned from accept().
 486                  */
 487                 inet_csk_reqsk_queue_drop(sk, req, prev);
 488                 goto out;
 489
 490         case TCP_SYN_SENT:
 491         case TCP_SYN_RECV:  /* Cannot happen.
 492                                It can f.e. if SYNs crossed.
 493                              */
 494                 if (!sock_owned_by_user(sk)) {
 495                         sk->sk_err = err;
 496
 497                         sk->sk_error_report(sk);
 498
 499                         tcp_done(sk);
 500                 } else {
 501                         sk->sk_err_soft = err;
 502                 }
 503                 goto out;
 504         }
 505
 506         /* If we've already connected we will keep trying
 507          * until we time out, or the user gives up.
 508          *
 509          * rfc1122 4.2.3.9 allows to consider as hard errors
 510          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 511          * but it is obsoleted by pmtu discovery).
 512          *
 513          * Note, that in modern internet, where routing is unreliable
 514          * and in each dark corner broken firewalls sit, sending random
 515          * errors ordered by their masters even this two messages finally lose
 516          * their original sense (even Linux sends invalid PORT_UNREACHs)
 517          *
 518          * Now we are in compliance with RFCs.
 519          *                                                      --ANK (980905)
 520          */
 521
 522         inet = inet_sk(sk);
 523         if (!sock_owned_by_user(sk) && inet->recverr) {
 524                 sk->sk_err = err;
 525                 sk->sk_error_report(sk);
 526         } else  { /* Only an error on timeout */
 527                 sk->sk_err_soft = err;
 528         }
 529
 530 out:
 531         bh_unlock_sock(sk);
 532         sock_put(sk);
 533 }
 534
 535 static void __tcp_v4_send_check(struct sk_buff *skb,
 536                                 __be32 saddr, __be32 daddr)
 537 {
 538         struct tcphdr *th = tcp_hdr(skb);
 539
 540         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 541                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 542                 skb->csum_start = skb_transport_header(skb) - skb->head;
 543                 skb->csum_offset = offsetof(struct tcphdr, check);
 544         } else {
 545                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 546                                          csum_partial(th,
 547                                                       th->doff << 2,
 548                                                       skb->csum));
 549         }
 550 }
 551
 552 /* This routine computes an IPv4 TCP checksum. */
 553 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 554 {
 555         const struct inet_sock *inet = inet_sk(sk);
 556
 557         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 558 }
 559 EXPORT_SYMBOL(tcp_v4_send_check);
 560
 561 int tcp_v4_gso_send_check(struct sk_buff *skb)
 562 {
 563         const struct iphdr *iph;
 564         struct tcphdr *th;
 565
 566         if (!pskb_may_pull(skb, sizeof(*th)))
 567                 return -EINVAL;
 568
 569         iph = ip_hdr(skb);
 570         th = tcp_hdr(skb);
 571
 572         th->check = 0;
 573         skb->ip_summed = CHECKSUM_PARTIAL;
 574         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 575         return 0;
 576 }
 577
 578 /*
 579  *      This routine will send an RST to the other tcp.
 580  *
 581  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 582  *                    for reset.
 583  *      Answer: if a packet caused RST, it is not for a socket
 584  *              existing in our system, if it is matched to a socket,
 585  *              it is just duplicate segment or bug in other side's TCP.
 586  *              So that we build reply only basing on parameters
 587  *              arrived with segment.
 588  *      Exception: precedence violation. We do not implement it in any case.
 589  */
 590
 591 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 592 {
 593         const struct tcphdr *th = tcp_hdr(skb);
 594         struct {
 595                 struct tcphdr th;
 596 #ifdef CONFIG_TCP_MD5SIG
 597                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 598 #endif
 599         } rep;
 600         struct ip_reply_arg arg;
 601 #ifdef CONFIG_TCP_MD5SIG
 602         struct tcp_md5sig_key *key;
 603 #endif
 604         struct net *net;
 605
 606         /* Never send a reset in response to a reset. */
 607         if (th->rst)
 608                 return;
 609
 610         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 611                 return;
 612
 613         /* Swap the send and the receive. */
 614         memset(&rep, 0, sizeof(rep));
 615         rep.th.dest   = th->source;
 616         rep.th.source = th->dest;
 617         rep.th.doff   = sizeof(struct tcphdr) / 4;
 618         rep.th.rst    = 1;
 619
 620         if (th->ack) {
 621                 rep.th.seq = th->ack_seq;
 622         } else {
 623                 rep.th.ack = 1;
 624                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 625                                        skb->len - (th->doff << 2));
 626         }
 627
 628         memset(&arg, 0, sizeof(arg));
 629         arg.iov[0].iov_base = (unsigned char *)&rep;
 630         arg.iov[0].iov_len  = sizeof(rep.th);
 631
 632 #ifdef CONFIG_TCP_MD5SIG
 633         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 634         if (key) {
 635                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 636                                    (TCPOPT_NOP << 16) |
 637                                    (TCPOPT_MD5SIG << 8) |
 638                                    TCPOLEN_MD5SIG);
 639                 /* Update length and the length the header thinks exists */
 640                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 641                 rep.th.doff = arg.iov[0].iov_len / 4;
 642
 643                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 644                                      key, ip_hdr(skb)->saddr,
 645                                      ip_hdr(skb)->daddr, &rep.th);
 646         }
 647 #endif
 648         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 649                                       ip_hdr(skb)->saddr, /* XXX */
 650                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 651         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 652         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 653
 654         net = dev_net(skb_dst(skb)->dev);
 655         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 656                       &arg, arg.iov[0].iov_len);
 657
 658         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 659         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 660 }
 661
 662 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 663    outside socket context is ugly, certainly. What can I do?
 664  */
 665
 666 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 667                             u32 win, u32 ts, int oif,
 668                             struct tcp_md5sig_key *key,
 669                             int reply_flags)
 670 {
 671         const struct tcphdr *th = tcp_hdr(skb);
 672         struct {
 673                 struct tcphdr th;
 674                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 675 #ifdef CONFIG_TCP_MD5SIG
 676                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 677 #endif
 678                         ];
 679         } rep;
 680         struct ip_reply_arg arg;
 681         struct net *net = dev_net(skb_dst(skb)->dev);
 682
 683         memset(&rep.th, 0, sizeof(struct tcphdr));
 684         memset(&arg, 0, sizeof(arg));
 685
 686         arg.iov[0].iov_base = (unsigned char *)&rep;
 687         arg.iov[0].iov_len  = sizeof(rep.th);
 688         if (ts) {
 689                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 690                                    (TCPOPT_TIMESTAMP << 8) |
 691                                    TCPOLEN_TIMESTAMP);
 692                 rep.opt[1] = htonl(tcp_time_stamp);
 693                 rep.opt[2] = htonl(ts);
 694                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 695         }
 696
 697         /* Swap the send and the receive. */
 698         rep.th.dest    = th->source;
 699         rep.th.source  = th->dest;
 700         rep.th.doff    = arg.iov[0].iov_len / 4;
 701         rep.th.seq     = htonl(seq);
 702         rep.th.ack_seq = htonl(ack);
 703         rep.th.ack     = 1;
 704         rep.th.window  = htons(win);
 705
 706 #ifdef CONFIG_TCP_MD5SIG
 707         if (key) {
 708                 int offset = (ts) ? 3 : 0;
 709
 710                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 711                                           (TCPOPT_NOP << 16) |
 712                                           (TCPOPT_MD5SIG << 8) |
 713                                           TCPOLEN_MD5SIG);
 714                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 715                 rep.th.doff = arg.iov[0].iov_len/4;
 716
 717                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 718                                     key, ip_hdr(skb)->saddr,
 719                                     ip_hdr(skb)->daddr, &rep.th);
 720         }
 721 #endif
 722         arg.flags = reply_flags;
 723         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 724                                       ip_hdr(skb)->saddr, /* XXX */
 725                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 726         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 727         if (oif)
 728                 arg.bound_dev_if = oif;
 729
 730         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 731                       &arg, arg.iov[0].iov_len);
 732
 733         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 734 }
 735
 736 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 737 {
 738         struct inet_timewait_sock *tw = inet_twsk(sk);
 739         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 740
 741         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 742                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 743                         tcptw->tw_ts_recent,
 744                         tw->tw_bound_dev_if,
 745                         tcp_twsk_md5_key(tcptw),
 746                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 747                         );
 748
 749         inet_twsk_put(tw);
 750 }
 751
 752 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 753                                   struct request_sock *req)
 754 {
 755         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 756                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 757                         req->ts_recent,
 758                         0,
 759                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 760                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 761 }
 762
 763 /*
 764  *      Send a SYN-ACK after having received a SYN.
 765  *      This still operates on a request_sock only, not on a big
 766  *      socket.
 767  */
 768 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 769                               struct request_sock *req,
 770                               struct request_values *rvp)
 771 {
 772         const struct inet_request_sock *ireq = inet_rsk(req);
 773         struct flowi4 fl4;
 774         int err = -1;
 775         struct sk_buff * skb;
 776
 777         /* First, grab a route. */
 778         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 779                 return -1;
 780
 781         skb = tcp_make_synack(sk, dst, req, rvp);
 782
 783         if (skb) {
 784                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 785
 786                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 787                                             ireq->rmt_addr,
 788                                             ireq->opt);
 789                 err = net_xmit_eval(err);
 790         }
 791
 792         dst_release(dst);
 793         return err;
 794 }
 795
 796 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 797                               struct request_values *rvp)
 798 {
 799         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 800         return tcp_v4_send_synack(sk, NULL, req, rvp);
 801 }
 802
 803 /*
 804  *      IPv4 request_sock destructor.
 805  */
 806 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 807 {
 808         kfree(inet_rsk(req)->opt);
 809 }
 810
 811 /*
 812  * Return 1 if a syncookie should be sent
 813  */
 814 int tcp_syn_flood_action(struct sock *sk,
 815                          const struct sk_buff *skb,
 816                          const char *proto)
 817 {
 818         const char *msg = "Dropping request";
 819         int want_cookie = 0;
 820         struct listen_sock *lopt;
 821
 822
 823
 824 #ifdef CONFIG_SYN_COOKIES
 825         if (sysctl_tcp_syncookies) {
 826                 msg = "Sending cookies";
 827                 want_cookie = 1;
 828                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 829         } else
 830 #endif
 831                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 832
 833         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 834         if (!lopt->synflood_warned) {
 835                 lopt->synflood_warned = 1;
 836                 pr_info("%s: Possible SYN flooding on port %d. %s. "
 837                         " Check SNMP counters.\n",
 838                         proto, ntohs(tcp_hdr(skb)->dest), msg);
 839         }
 840         return want_cookie;
 841 }
 842 EXPORT_SYMBOL(tcp_syn_flood_action);
 843
 844 /*
 845  * Save and compile IPv4 options into the request_sock if needed.
 846  */
 847 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 848                                                   struct sk_buff *skb)
 849 {
 850         const struct ip_options *opt = &(IPCB(skb)->opt);
 851         struct ip_options_rcu *dopt = NULL;
 852
 853         if (opt && opt->optlen) {
 854                 int opt_size = sizeof(*dopt) + opt->optlen;
 855
 856                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 857                 if (dopt) {
 858                         if (ip_options_echo(&dopt->opt, skb)) {
 859                                 kfree(dopt);
 860                                 dopt = NULL;
 861                         }
 862                 }
 863         }
 864         return dopt;
 865 }
 866
 867 #ifdef CONFIG_TCP_MD5SIG
 868 /*
 869  * RFC2385 MD5 checksumming requires a mapping of
 870  * IP address->MD5 Key.
 871  * We need to maintain these in the sk structure.
 872  */
 873
 874 /* Find the Key structure for an address.  */
 875 static struct tcp_md5sig_key *
 876                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 877 {
 878         struct tcp_sock *tp = tcp_sk(sk);
 879         int i;
 880
 881         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 882                 return NULL;
 883         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 884                 if (tp->md5sig_info->keys4[i].addr == addr)
 885                         return &tp->md5sig_info->keys4[i].base;
 886         }
 887         return NULL;
 888 }
 889
 890 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 891                                          struct sock *addr_sk)
 892 {
 893         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 894 }
 895 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 896
 897 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 898                                                       struct request_sock *req)
 899 {
 900         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 901 }
 902
 903 /* This can be called on a newly created socket, from other files */
 904 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 905                       u8 *newkey, u8 newkeylen)
 906 {
 907         /* Add Key to the list */
 908         struct tcp_md5sig_key *key;
 909         struct tcp_sock *tp = tcp_sk(sk);
 910         struct tcp4_md5sig_key *keys;
 911
 912         key = tcp_v4_md5_do_lookup(sk, addr);
 913         if (key) {
 914                 /* Pre-existing entry - just update that one. */
 915                 kfree(key->key);
 916                 key->key = newkey;
 917                 key->keylen = newkeylen;
 918         } else {
 919                 struct tcp_md5sig_info *md5sig;
 920
 921                 if (!tp->md5sig_info) {
 922                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 923                                                   GFP_ATOMIC);
 924                         if (!tp->md5sig_info) {
 925                                 kfree(newkey);
 926                                 return -ENOMEM;
 927                         }
 928                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 929                 }
 930
 931                 md5sig = tp->md5sig_info;
 932                 if (md5sig->entries4 == 0 &&
 933                     tcp_alloc_md5sig_pool(sk) == NULL) {
 934                         kfree(newkey);
 935                         return -ENOMEM;
 936                 }
 937
 938                 if (md5sig->alloced4 == md5sig->entries4) {
 939                         keys = kmalloc((sizeof(*keys) *
 940                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 941                         if (!keys) {
 942                                 kfree(newkey);
 943                                 if (md5sig->entries4 == 0)
 944                                         tcp_free_md5sig_pool();
 945                                 return -ENOMEM;
 946                         }
 947
 948                         if (md5sig->entries4)
 949                                 memcpy(keys, md5sig->keys4,
 950                                        sizeof(*keys) * md5sig->entries4);
 951
 952                         /* Free old key list, and reference new one */
 953                         kfree(md5sig->keys4);
 954                         md5sig->keys4 = keys;
 955                         md5sig->alloced4++;
 956                 }
 957                 md5sig->entries4++;
 958                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 959                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 960                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 961         }
 962         return 0;
 963 }
 964 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 965
 966 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 967                                u8 *newkey, u8 newkeylen)
 968 {
 969         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 970                                  newkey, newkeylen);
 971 }
 972
 973 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 974 {
 975         struct tcp_sock *tp = tcp_sk(sk);
 976         int i;
 977
 978         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 979                 if (tp->md5sig_info->keys4[i].addr == addr) {
 980                         /* Free the key */
 981                         kfree(tp->md5sig_info->keys4[i].base.key);
 982                         tp->md5sig_info->entries4--;
 983
 984                         if (tp->md5sig_info->entries4 == 0) {
 985                                 kfree(tp->md5sig_info->keys4);
 986                                 tp->md5sig_info->keys4 = NULL;
 987                                 tp->md5sig_info->alloced4 = 0;
 988                                 tcp_free_md5sig_pool();
 989                         } else if (tp->md5sig_info->entries4 != i) {
 990                                 /* Need to do some manipulation */
 991                                 memmove(&tp->md5sig_info->keys4[i],
 992                                         &tp->md5sig_info->keys4[i+1],
 993                                         (tp->md5sig_info->entries4 - i) *
 994                                          sizeof(struct tcp4_md5sig_key));
 995                         }
 996                         return 0;
 997                 }
 998         }
 999         return -ENOENT;
1000 }
1001 EXPORT_SYMBOL(tcp_v4_md5_do_del);
1002
1003 static void tcp_v4_clear_md5_list(struct sock *sk)
1004 {
1005         struct tcp_sock *tp = tcp_sk(sk);
1006
1007         /* Free each key, then the set of key keys,
1008          * the crypto element, and then decrement our
1009          * hold on the last resort crypto.
1010          */
1011         if (tp->md5sig_info->entries4) {
1012                 int i;
1013                 for (i = 0; i < tp->md5sig_info->entries4; i++)
1014                         kfree(tp->md5sig_info->keys4[i].base.key);
1015                 tp->md5sig_info->entries4 = 0;
1016                 tcp_free_md5sig_pool();
1017         }
1018         if (tp->md5sig_info->keys4) {
1019                 kfree(tp->md5sig_info->keys4);
1020                 tp->md5sig_info->keys4 = NULL;
1021                 tp->md5sig_info->alloced4  = 0;
1022         }
1023 }
1024
1025 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1026                                  int optlen)
1027 {
1028         struct tcp_md5sig cmd;
1029         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1030         u8 *newkey;
1031
1032         if (optlen < sizeof(cmd))
1033                 return -EINVAL;
1034
1035         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1036                 return -EFAULT;
1037
1038         if (sin->sin_family != AF_INET)
1039                 return -EINVAL;
1040
1041         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1042                 if (!tcp_sk(sk)->md5sig_info)
1043                         return -ENOENT;
1044                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1045         }
1046
1047         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1048                 return -EINVAL;
1049
1050         if (!tcp_sk(sk)->md5sig_info) {
1051                 struct tcp_sock *tp = tcp_sk(sk);
1052                 struct tcp_md5sig_info *p;
1053
1054                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1055                 if (!p)
1056                         return -EINVAL;
1057
1058                 tp->md5sig_info = p;
1059                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1060         }
1061
1062         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1063         if (!newkey)
1064                 return -ENOMEM;
1065         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1066                                  newkey, cmd.tcpm_keylen);
1067 }
1068
1069 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1070                                         __be32 daddr, __be32 saddr, int nbytes)
1071 {
1072         struct tcp4_pseudohdr *bp;
1073         struct scatterlist sg;
1074
1075         bp = &hp->md5_blk.ip4;
1076
1077         /*
1078          * 1. the TCP pseudo-header (in the order: source IP address,
1079          * destination IP address, zero-padded protocol number, and
1080          * segment length)
1081          */
1082         bp->saddr = saddr;
1083         bp->daddr = daddr;
1084         bp->pad = 0;
1085         bp->protocol = IPPROTO_TCP;
1086         bp->len = cpu_to_be16(nbytes);
1087
1088         sg_init_one(&sg, bp, sizeof(*bp));
1089         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1090 }
1091
1092 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1093                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1094 {
1095         struct tcp_md5sig_pool *hp;
1096         struct hash_desc *desc;
1097
1098         hp = tcp_get_md5sig_pool();
1099         if (!hp)
1100                 goto clear_hash_noput;
1101         desc = &hp->md5_desc;
1102
1103         if (crypto_hash_init(desc))
1104                 goto clear_hash;
1105         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1106                 goto clear_hash;
1107         if (tcp_md5_hash_header(hp, th))
1108                 goto clear_hash;
1109         if (tcp_md5_hash_key(hp, key))
1110                 goto clear_hash;
1111         if (crypto_hash_final(desc, md5_hash))
1112                 goto clear_hash;
1113
1114         tcp_put_md5sig_pool();
1115         return 0;
1116
1117 clear_hash:
1118         tcp_put_md5sig_pool();
1119 clear_hash_noput:
1120         memset(md5_hash, 0, 16);
1121         return 1;
1122 }
1123
1124 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1125                         struct sock *sk, struct request_sock *req,
1126                         struct sk_buff *skb)
1127 {
1128         struct tcp_md5sig_pool *hp;
1129         struct hash_desc *desc;
1130         struct tcphdr *th = tcp_hdr(skb);
1131         __be32 saddr, daddr;
1132
1133         if (sk) {
1134                 saddr = inet_sk(sk)->inet_saddr;
1135                 daddr = inet_sk(sk)->inet_daddr;
1136         } else if (req) {
1137                 saddr = inet_rsk(req)->loc_addr;
1138                 daddr = inet_rsk(req)->rmt_addr;
1139         } else {
1140                 const struct iphdr *iph = ip_hdr(skb);
1141                 saddr = iph->saddr;
1142                 daddr = iph->daddr;
1143         }
1144
1145         hp = tcp_get_md5sig_pool();
1146         if (!hp)
1147                 goto clear_hash_noput;
1148         desc = &hp->md5_desc;
1149
1150         if (crypto_hash_init(desc))
1151                 goto clear_hash;
1152
1153         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1154                 goto clear_hash;
1155         if (tcp_md5_hash_header(hp, th))
1156                 goto clear_hash;
1157         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1158                 goto clear_hash;
1159         if (tcp_md5_hash_key(hp, key))
1160                 goto clear_hash;
1161         if (crypto_hash_final(desc, md5_hash))
1162                 goto clear_hash;
1163
1164         tcp_put_md5sig_pool();
1165         return 0;
1166
1167 clear_hash:
1168         tcp_put_md5sig_pool();
1169 clear_hash_noput:
1170         memset(md5_hash, 0, 16);
1171         return 1;
1172 }
1173 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1174
1175 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1176 {
1177         /*
1178          * This gets called for each TCP segment that arrives
1179          * so we want to be efficient.
1180          * We have 3 drop cases:
1181          * o No MD5 hash and one expected.
1182          * o MD5 hash and we're not expecting one.
1183          * o MD5 hash and its wrong.
1184          */
1185         const __u8 *hash_location = NULL;
1186         struct tcp_md5sig_key *hash_expected;
1187         const struct iphdr *iph = ip_hdr(skb);
1188         const struct tcphdr *th = tcp_hdr(skb);
1189         int genhash;
1190         unsigned char newhash[16];
1191
1192         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1193         hash_location = tcp_parse_md5sig_option(th);
1194
1195         /* We've parsed the options - do we have a hash? */
1196         if (!hash_expected && !hash_location)
1197                 return 0;
1198
1199         if (hash_expected && !hash_location) {
1200                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1201                 return 1;
1202         }
1203
1204         if (!hash_expected && hash_location) {
1205                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1206                 return 1;
1207         }
1208
1209         /* Okay, so this is hash_expected and hash_location -
1210          * so we need to calculate the checksum.
1211          */
1212         genhash = tcp_v4_md5_hash_skb(newhash,
1213                                       hash_expected,
1214                                       NULL, NULL, skb);
1215
1216         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1217                 if (net_ratelimit()) {
1218                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1219                                &iph->saddr, ntohs(th->source),
1220                                &iph->daddr, ntohs(th->dest),
1221                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1222                 }
1223                 return 1;
1224         }
1225         return 0;
1226 }
1227
1228 #endif
1229
1230 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1231         .family         =       PF_INET,
1232         .obj_size       =       sizeof(struct tcp_request_sock),
1233         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1234         .send_ack       =       tcp_v4_reqsk_send_ack,
1235         .destructor     =       tcp_v4_reqsk_destructor,
1236         .send_reset     =       tcp_v4_send_reset,
1237         .syn_ack_timeout =      tcp_syn_ack_timeout,
1238 };
1239
1240 #ifdef CONFIG_TCP_MD5SIG
1241 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1242         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1243         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1244 };
1245 #endif
1246
1247 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1248 {
1249         struct tcp_extend_values tmp_ext;
1250         struct tcp_options_received tmp_opt;
1251         const u8 *hash_location;
1252         struct request_sock *req;
1253         struct inet_request_sock *ireq;
1254         struct tcp_sock *tp = tcp_sk(sk);
1255         struct dst_entry *dst = NULL;
1256         __be32 saddr = ip_hdr(skb)->saddr;
1257         __be32 daddr = ip_hdr(skb)->daddr;
1258         __u32 isn = TCP_SKB_CB(skb)->when;
1259         int want_cookie = 0;
1260
1261         /* Never answer to SYNs send to broadcast or multicast */
1262         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1263                 goto drop;
1264
1265         /* TW buckets are converted to open requests without
1266          * limitations, they conserve resources and peer is
1267          * evidently real one.
1268          */
1269         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1270                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1271                 if (!want_cookie)
1272                         goto drop;
1273         }
1274
1275         /* Accept backlog is full. If we have already queued enough
1276          * of warm entries in syn queue, drop request. It is better than
1277          * clogging syn queue with openreqs with exponentially increasing
1278          * timeout.
1279          */
1280         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1281                 goto drop;
1282
1283         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1284         if (!req)
1285                 goto drop;
1286
1287 #ifdef CONFIG_TCP_MD5SIG
1288         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1289 #endif
1290
1291         tcp_clear_options(&tmp_opt);
1292         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1293         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1294         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1295
1296         if (tmp_opt.cookie_plus > 0 &&
1297             tmp_opt.saw_tstamp &&
1298             !tp->rx_opt.cookie_out_never &&
1299             (sysctl_tcp_cookie_size > 0 ||
1300              (tp->cookie_values != NULL &&
1301               tp->cookie_values->cookie_desired > 0))) {
1302                 u8 *c;
1303                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1304                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1305
1306                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1307                         goto drop_and_release;
1308
1309                 /* Secret recipe starts with IP addresses */
1310                 *mess++ ^= (__force u32)daddr;
1311                 *mess++ ^= (__force u32)saddr;
1312
1313                 /* plus variable length Initiator Cookie */
1314                 c = (u8 *)mess;
1315                 while (l-- > 0)
1316                         *c++ ^= *hash_location++;
1317
1318                 want_cookie = 0;        /* not our kind of cookie */
1319                 tmp_ext.cookie_out_never = 0; /* false */
1320                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1321         } else if (!tp->rx_opt.cookie_in_always) {
1322                 /* redundant indications, but ensure initialization. */
1323                 tmp_ext.cookie_out_never = 1; /* true */
1324                 tmp_ext.cookie_plus = 0;
1325         } else {
1326                 goto drop_and_release;
1327         }
1328         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1329
1330         if (want_cookie && !tmp_opt.saw_tstamp)
1331                 tcp_clear_options(&tmp_opt);
1332
1333         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1334         tcp_openreq_init(req, &tmp_opt, skb);
1335
1336         ireq = inet_rsk(req);
1337         ireq->loc_addr = daddr;
1338         ireq->rmt_addr = saddr;
1339         ireq->no_srccheck = inet_sk(sk)->transparent;
1340         ireq->opt = tcp_v4_save_options(sk, skb);
1341
1342         if (security_inet_conn_request(sk, skb, req))
1343                 goto drop_and_free;
1344
1345         if (!want_cookie || tmp_opt.tstamp_ok)
1346                 TCP_ECN_create_request(req, tcp_hdr(skb));
1347
1348         if (want_cookie) {
1349                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1350                 req->cookie_ts = tmp_opt.tstamp_ok;
1351         } else if (!isn) {
1352                 struct inet_peer *peer = NULL;
1353                 struct flowi4 fl4;
1354
1355                 /* VJ's idea. We save last timestamp seen
1356                  * from the destination in peer table, when entering
1357                  * state TIME-WAIT, and check against it before
1358                  * accepting new connection request.
1359                  *
1360                  * If "isn" is not zero, this request hit alive
1361                  * timewait bucket, so that all the necessary checks
1362                  * are made in the function processing timewait state.
1363                  */
1364                 if (tmp_opt.saw_tstamp &&
1365                     tcp_death_row.sysctl_tw_recycle &&
1366                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1367                     fl4.daddr == saddr &&
1368                     (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1369                         inet_peer_refcheck(peer);
1370                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1371                             (s32)(peer->tcp_ts - req->ts_recent) >
1372                                                         TCP_PAWS_WINDOW) {
1373                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1374                                 goto drop_and_release;
1375                         }
1376                 }
1377                 /* Kill the following clause, if you dislike this way. */
1378                 else if (!sysctl_tcp_syncookies &&
1379                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1380                           (sysctl_max_syn_backlog >> 2)) &&
1381                          (!peer || !peer->tcp_ts_stamp) &&
1382                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1383                         /* Without syncookies last quarter of
1384                          * backlog is filled with destinations,
1385                          * proven to be alive.
1386                          * It means that we continue to communicate
1387                          * to destinations, already remembered
1388                          * to the moment of synflood.
1389                          */
1390                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1391                                        &saddr, ntohs(tcp_hdr(skb)->source));
1392                         goto drop_and_release;
1393                 }
1394
1395                 isn = tcp_v4_init_sequence(skb);
1396         }
1397         tcp_rsk(req)->snt_isn = isn;
1398         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1399
1400         if (tcp_v4_send_synack(sk, dst, req,
1401                                (struct request_values *)&tmp_ext) ||
1402             want_cookie)
1403                 goto drop_and_free;
1404
1405         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1406         return 0;
1407
1408 drop_and_release:
1409         dst_release(dst);
1410 drop_and_free:
1411         reqsk_free(req);
1412 drop:
1413         return 0;
1414 }
1415 EXPORT_SYMBOL(tcp_v4_conn_request);
1416
1417
1418 /*
1419  * The three way handshake has completed - we got a valid synack -
1420  * now create the new socket.
1421  */
1422 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1423                                   struct request_sock *req,
1424                                   struct dst_entry *dst)
1425 {
1426         struct inet_request_sock *ireq;
1427         struct inet_sock *newinet;
1428         struct tcp_sock *newtp;
1429         struct sock *newsk;
1430 #ifdef CONFIG_TCP_MD5SIG
1431         struct tcp_md5sig_key *key;
1432 #endif
1433         struct ip_options_rcu *inet_opt;
1434
1435         if (sk_acceptq_is_full(sk))
1436                 goto exit_overflow;
1437
1438         newsk = tcp_create_openreq_child(sk, req, skb);
1439         if (!newsk)
1440                 goto exit_nonewsk;
1441
1442         newsk->sk_gso_type = SKB_GSO_TCPV4;
1443
1444         newtp                 = tcp_sk(newsk);
1445         newinet               = inet_sk(newsk);
1446         ireq                  = inet_rsk(req);
1447         newinet->inet_daddr   = ireq->rmt_addr;
1448         newinet->inet_rcv_saddr = ireq->loc_addr;
1449         newinet->inet_saddr           = ireq->loc_addr;
1450         inet_opt              = ireq->opt;
1451         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1452         ireq->opt             = NULL;
1453         newinet->mc_index     = inet_iif(skb);
1454         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1455         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1456         if (inet_opt)
1457                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1458         newinet->inet_id = newtp->write_seq ^ jiffies;
1459
1460         if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1461                 goto put_and_exit;
1462
1463         sk_setup_caps(newsk, dst);
1464
1465         tcp_mtup_init(newsk);
1466         tcp_sync_mss(newsk, dst_mtu(dst));
1467         newtp->advmss = dst_metric_advmss(dst);
1468         if (tcp_sk(sk)->rx_opt.user_mss &&
1469             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1470                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1471
1472         tcp_initialize_rcv_mss(newsk);
1473         if (tcp_rsk(req)->snt_synack)
1474                 tcp_valid_rtt_meas(newsk,
1475                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1476         newtp->total_retrans = req->retrans;
1477
1478 #ifdef CONFIG_TCP_MD5SIG
1479         /* Copy over the MD5 key from the original socket */
1480         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1481         if (key != NULL) {
1482                 /*
1483                  * We're using one, so create a matching key
1484                  * on the newsk structure. If we fail to get
1485                  * memory, then we end up not copying the key
1486                  * across. Shucks.
1487                  */
1488                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1489                 if (newkey != NULL)
1490                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1491                                           newkey, key->keylen);
1492                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1493         }
1494 #endif
1495
1496         if (__inet_inherit_port(sk, newsk) < 0)
1497                 goto put_and_exit;
1498         __inet_hash_nolisten(newsk, NULL);
1499
1500         return newsk;
1501
1502 exit_overflow:
1503         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1504 exit_nonewsk:
1505         dst_release(dst);
1506 exit:
1507         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1508         return NULL;
1509 put_and_exit:
1510         sock_put(newsk);
1511         goto exit;
1512 }
1513 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1514
1515 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1516 {
1517         struct tcphdr *th = tcp_hdr(skb);
1518         const struct iphdr *iph = ip_hdr(skb);
1519         struct sock *nsk;
1520         struct request_sock **prev;
1521         /* Find possible connection requests. */
1522         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1523                                                        iph->saddr, iph->daddr);
1524         if (req)
1525                 return tcp_check_req(sk, skb, req, prev);
1526
1527         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1528                         th->source, iph->daddr, th->dest, inet_iif(skb));
1529
1530         if (nsk) {
1531                 if (nsk->sk_state != TCP_TIME_WAIT) {
1532                         bh_lock_sock(nsk);
1533                         return nsk;
1534                 }
1535                 inet_twsk_put(inet_twsk(nsk));
1536                 return NULL;
1537         }
1538
1539 #ifdef CONFIG_SYN_COOKIES
1540         if (!th->syn)
1541                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1542 #endif
1543         return sk;
1544 }
1545
1546 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1547 {
1548         const struct iphdr *iph = ip_hdr(skb);
1549
1550         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1551                 if (!tcp_v4_check(skb->len, iph->saddr,
1552                                   iph->daddr, skb->csum)) {
1553                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1554                         return 0;
1555                 }
1556         }
1557
1558         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1559                                        skb->len, IPPROTO_TCP, 0);
1560
1561         if (skb->len <= 76) {
1562                 return __skb_checksum_complete(skb);
1563         }
1564         return 0;
1565 }
1566
1567
1568 /* The socket must have its spinlock held when we get
1569  * here.
1570  *
1571  * We have a potential double-lock case here, so even when
1572  * doing backlog processing we use the BH locking scheme.
1573  * This is because we cannot sleep with the original spinlock
1574  * held.
      *
      * Dispatches one received segment according to sk->sk_state:
      *  o TCP_ESTABLISHED goes straight to tcp_rcv_established().
      *  o Otherwise the segment is length- and checksum-checked first.
      *    For TCP_LISTEN, tcp_v4_hnd_req() selects the socket: NULL drops
      *    the segment, a socket other than sk is passed to
      *    tcp_child_process(), and sk itself falls through.
      *  o The remaining cases are handled by tcp_rcv_state_process().
      *
      * A nonzero result from any of those handlers sends a reset through
      * tcp_v4_send_reset() and frees the skb.  Segments failing the
      * length/checksum test bump TCP_MIB_INERRS and are freed.
      *
      * Always returns 0.
1575  */
1576 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1577 {
1578         struct sock *rsk;
1579 #ifdef CONFIG_TCP_MD5SIG
1580         /*
1581          * We really want to reject the packet as early as possible
1582          * if:
1583          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1584          *  o There is an MD5 option and we're not expecting one
1585          */
1586         if (tcp_v4_inbound_md5_hash(sk, skb))
1587                 goto discard;
1588 #endif
1589
1590         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1591                 sock_rps_save_rxhash(sk, skb);
1592                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1593                         rsk = sk;
1594                         goto reset;
1595                 }
1596                 return 0;
1597         }
1598
             /* Slow path: header length and checksum are verified up front. */
1599         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1600                 goto csum_err;
1601
1602         if (sk->sk_state == TCP_LISTEN) {
1603                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1604                 if (!nsk)
1605                         goto discard;
1606
                     /* A socket other than the listener was selected. */
1607                 if (nsk != sk) {
1608                         sock_rps_save_rxhash(nsk, skb);
1609                         if (tcp_child_process(sk, nsk, skb)) {
1610                                 rsk = nsk;
1611                                 goto reset;
1612                         }
1613                         return 0;
1614                 }
1615         } else
1616                 sock_rps_save_rxhash(sk, skb);
1617
1618         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1619                 rsk = sk;
1620                 goto reset;
1621         }
1622         return 0;
1623
1624 reset:
             /* rsk is the socket whose handler reported the failure. */
1625         tcp_v4_send_reset(rsk, skb);
1626 discard:
1627         kfree_skb(skb);
1628         /* Be careful here. If this function gets more complicated and
1629          * gcc suffers from register pressure on the x86, sk (in %ebx)
1630          * might be destroyed here. This current version compiles correctly,
1631          * but you have been warned.
1632          */
1633         return 0;
1634
1635 csum_err:
1636         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1637         goto discard;
1638 }
1639 EXPORT_SYMBOL(tcp_v4_do_rcv);
1640
1641 /*
1642  *      From tcp_input.c
1643  */
1644
/*
 * Entry point for a received IPv4 TCP segment.
 *
 * Validates the TCP header, fills in TCP_SKB_CB(skb), looks up the
 * owning socket and hands the segment to it: directly through
 * tcp_v4_do_rcv(), through tcp_prequeue(), or through the socket backlog
 * when sock_owned_by_user() is true.  Segments without a socket are
 * answered with a reset unless they are too short or fail the checksum;
 * segments matching a TIME_WAIT socket are handled under do_time_wait.
 *
 * Returns the value of tcp_v4_do_rcv() when it was called from here and
 * 0 on every other path.
 */
1645 int tcp_v4_rcv(struct sk_buff *skb)
1646 {
1647         const struct iphdr *iph;
1648         const struct tcphdr *th;
1649         struct sock *sk;
1650         int ret;
1651         struct net *net = dev_net(skb->dev);
1652
             /* Only segments addressed to this host are processed. */
1653         if (skb->pkt_type != PACKET_HOST)
1654                 goto discard_it;
1655
1656         /* Count it even if it's bad */
1657         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1658
1659         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1660                 goto discard_it;
1661
1662         th = tcp_hdr(skb);
1663
             /* A data offset smaller than the fixed header is malformed. */
1664         if (th->doff < sizeof(struct tcphdr) / 4)
1665                 goto bad_packet;
1666         if (!pskb_may_pull(skb, th->doff * 4))
1667                 goto discard_it;
1668
1669         /* An explanation is required here, I think.
1670          * Packet length and doff are validated by header prediction,
1671          * provided case of th->doff==0 is eliminated.
1672          * So, we defer the checks. */
1673         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1674                 goto bad_packet;
1675
             /*
              * Header pointers are fetched again after the pulls above
              * (NOTE(review): presumably because pskb_may_pull() may move
              * the header data - confirm).
              */
1676         th = tcp_hdr(skb);
1677         iph = ip_hdr(skb);
             /* end_seq counts SYN and FIN as one sequence number each. */
1678         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1679         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1680                                     skb->len - th->doff * 4);
1681         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1682         TCP_SKB_CB(skb)->when    = 0;
1683         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1684         TCP_SKB_CB(skb)->sacked  = 0;
1685
1686         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1687         if (!sk)
1688                 goto no_tcp_socket;
1689
             /* Also re-entered from do_time_wait with sk set to a listener. */
1690 process:
1691         if (sk->sk_state == TCP_TIME_WAIT)
1692                 goto do_time_wait;
1693
             /* Drop segments whose TTL is below the socket's min_ttl. */
1694         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1695                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1696                 goto discard_and_relse;
1697         }
1698
1699         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1700                 goto discard_and_relse;
1701         nf_reset(skb);
1702
1703         if (sk_filter(sk, skb))
1704                 goto discard_and_relse;
1705
1706         skb->dev = NULL;
1707
1708         bh_lock_sock_nested(sk);
1709         ret = 0;
             /*
              * Not owned by user context: process now, unless tcp_prequeue()
              * takes the skb.  Otherwise the skb goes to the backlog.
              */
1710         if (!sock_owned_by_user(sk)) {
1711 #ifdef CONFIG_NET_DMA
1712                 struct tcp_sock *tp = tcp_sk(sk);
1713                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1714                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1715                 if (tp->ucopy.dma_chan)
1716                         ret = tcp_v4_do_rcv(sk, skb);
1717                 else
1718 #endif
1719                 {
1720                         if (!tcp_prequeue(sk, skb))
1721                                 ret = tcp_v4_do_rcv(sk, skb);
1722                 }
1723         } else if (unlikely(sk_add_backlog(sk, skb))) {
                     /* Backlog refused the skb: unlock, count and drop it. */
1724                 bh_unlock_sock(sk);
1725                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1726                 goto discard_and_relse;
1727         }
1728         bh_unlock_sock(sk);
1729
1730         sock_put(sk);
1731
1732         return ret;
1733
1734 no_tcp_socket:
1735         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1736                 goto discard_it;
1737
             /*
              * Short or corrupt segments are only counted; anything else is
              * answered with a reset.  bad_packet is also the target of the
              * header checks near the top of the function.
              */
1738         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1739 bad_packet:
1740                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1741         } else {
1742                 tcp_v4_send_reset(NULL, skb);
1743         }
1744
1745 discard_it:
1746         /* Discard frame. */
1747         kfree_skb(skb);
1748         return 0;
1749
1750 discard_and_relse:
1751         sock_put(sk);
1752         goto discard_it;
1753
1754 do_time_wait:
             /* Every exit from here that discards also puts the tw socket. */
1755         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1756                 inet_twsk_put(inet_twsk(sk));
1757                 goto discard_it;
1758         }
1759
1760         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1761                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1762                 inet_twsk_put(inet_twsk(sk));
1763                 goto discard_it;
1764         }
1765         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1766         case TCP_TW_SYN: {
1767                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1768                                                         &tcp_hashinfo,
1769                                                         iph->daddr, th->dest,
1770                                                         inet_iif(skb));
                     /*
                      * A listener exists: retire the TIME_WAIT socket and
                      * process the segment against the listener instead.
                      */
1771                 if (sk2) {
1772                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1773                         inet_twsk_put(inet_twsk(sk));
1774                         sk = sk2;
1775                         goto process;
1776                 }
1777                 /* Fall through to ACK */
1778         }
1779         case TCP_TW_ACK:
1780                 tcp_v4_timewait_ack(sk, skb);
1781                 break;
1782         case TCP_TW_RST:
1783                 goto no_tcp_socket;
1784         case TCP_TW_SUCCESS:;
1785         }
1786         goto discard_it;
1787 }
1788
/*
 * Look up the inet_peer entry for the remote address of @sk.
 *
 * When the socket has a cached route and the corked flow destination
 * equals inet_daddr, the peer attached to that route is returned (one is
 * bound first if the route has none) and *release_it is set to false.
 * In every other case the peer comes from inet_getpeer_v4() and
 * *release_it is set to true.
 *
 * NOTE(review): *release_it presumably tells the caller whether it has
 * to drop the returned peer itself - confirm against the callers.
 */
1789 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1790 {
1791         struct rtable *route = (struct rtable *) __sk_dst_get(sk);
1792         struct inet_sock *isk = inet_sk(sk);
1793         struct inet_peer *found;
1794
1795         if (route && isk->cork.fl.u.ip4.daddr == isk->inet_daddr) {
1796                 if (!route->peer)
1797                         rt_bind_peer(route, isk->inet_daddr, 1);
1798                 found = route->peer;
1799                 *release_it = false;
1800                 return found;
1801         }
1802
1803         found = inet_getpeer_v4(isk->inet_daddr, 1);
1804         *release_it = true;
1805         return found;
1806 }
1808 EXPORT_SYMBOL(tcp_v4_get_peer);
1809
/*
 * Fetch the inet_peer entry for the destination address stored in the
 * time-wait socket behind @sk, via inet_getpeer_v4() with its second
 * argument set to 1.
 */
1810 void *tcp_v4_tw_get_peer(struct sock *sk)
1811 {
1812         return inet_getpeer_v4(inet_twsk(sk)->tw_daddr, 1);
1813 }
1816 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1817
/*
 * Time-wait socket operations for TCP over IPv4: the object size of a
 * tcp_timewait_sock plus the unique, destructor and getpeer callbacks.
 */
1818 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1819         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1820         .twsk_unique    = tcp_twsk_unique,
1821         .twsk_destructor= tcp_twsk_destructor,
1822         .twsk_getpeer   = tcp_v4_tw_get_peer,
1823 };
1824
/*
 * IPv4 flavour of the address-family operations of a connection socket.
 * tcp_v4_init_sock() installs this table as icsk->icsk_af_ops.  The
 * compat socket-option handlers are only present with CONFIG_COMPAT.
 */
1825 const struct inet_connection_sock_af_ops ipv4_specific = {
1826         .queue_xmit        = ip_queue_xmit,
1827         .send_check        = tcp_v4_send_check,
1828         .rebuild_header    = inet_sk_rebuild_header,
1829         .conn_request      = tcp_v4_conn_request,
1830         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1831         .get_peer          = tcp_v4_get_peer,
1832         .net_header_len    = sizeof(struct iphdr),
1833         .setsockopt        = ip_setsockopt,
1834         .getsockopt        = ip_getsockopt,
1835         .addr2sockaddr     = inet_csk_addr2sockaddr,
1836         .sockaddr_len      = sizeof(struct sockaddr_in),
1837         .bind_conflict     = inet_csk_bind_conflict,
1838 #ifdef CONFIG_COMPAT
1839         .compat_setsockopt = compat_ip_setsockopt,
1840         .compat_getsockopt = compat_ip_getsockopt,
1841 #endif
1842 };
1843 EXPORT_SYMBOL(ipv4_specific);
1844
1845 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 handlers for the TCP MD5 signature option (key lookup, hashing,
 * key add and key parsing).  Only built with CONFIG_TCP_MD5SIG;
 * tcp_v4_init_sock() installs it as tp->af_specific.
 */
1846 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1847         .md5_lookup             = tcp_v4_md5_lookup,
1848         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1849         .md5_add                = tcp_v4_md5_add_func,
1850         .md5_parse              = tcp_v4_parse_md5_keys,
1851 };
1852 #endif
1853
1854 /* NOTE: A lot of things set to zero explicitly by call to
1855  *       sk_alloc() so need not be done here.
      *
      * Initialises the TCP/IPv4 specific state of a new socket: the
      * out-of-order queue, timers and prequeue, the initial RTO and mdev,
      * congestion window defaults, the IPv4 af_ops, optional cookie
      * storage and the default send/receive buffer sizes.  Finally the
      * socket is accounted in tcp_sockets_allocated.
      *
      * Always returns 0; a failed cookie_values allocation is not treated
      * as an error and simply leaves tp->cookie_values NULL.
1856  */
1857 static int tcp_v4_init_sock(struct sock *sk)
1858 {
1859         struct inet_connection_sock *icsk = inet_csk(sk);
1860         struct tcp_sock *tp = tcp_sk(sk);
1861
1862         skb_queue_head_init(&tp->out_of_order_queue);
1863         tcp_init_xmit_timers(sk);
1864         tcp_prequeue_init(tp);
1865
1866         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1867         tp->mdev = TCP_TIMEOUT_INIT;
1868
1869         /* So many TCP implementations out there (incorrectly) count the
1870          * initial SYN frame in their delayed-ACK and congestion control
1871          * algorithms that we must have the following bandaid to talk
1872          * efficiently to them.  -DaveM
1873          */
1874         tp->snd_cwnd = TCP_INIT_CWND;
1875
1876         /* See draft-stevens-tcpca-spec-01 for discussion of the
1877          * initialization of these values.
1878          */
1879         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1880         tp->snd_cwnd_clamp = ~0;
1881         tp->mss_cache = TCP_MSS_DEFAULT;
1882
1883         tp->reordering = sysctl_tcp_reordering;
1884         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1885
1886         sk->sk_state = TCP_CLOSE;
1887
1888         sk->sk_write_space = sk_stream_write_space;
1889         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1890
             /* Hook up the IPv4 specific operation tables. */
1891         icsk->icsk_af_ops = &ipv4_specific;
1892         icsk->icsk_sync_mss = tcp_sync_mss;
1893 #ifdef CONFIG_TCP_MD5SIG
1894         tp->af_specific = &tcp_sock_ipv4_specific;
1895 #endif
1896
1897         /* TCP Cookie Transactions */
1898         if (sysctl_tcp_cookie_size > 0) {
1899                 /* Default, cookies without s_data_payload. */
1900                 tp->cookie_values =
1901                         kzalloc(sizeof(*tp->cookie_values),
1902                                 sk->sk_allocation);
1903                 if (tp->cookie_values != NULL)
1904                         kref_init(&tp->cookie_values->kref);
1905         }
1906         /* Presumed zeroed, in order of appearance:
1907          *      cookie_in_always, cookie_out_never,
1908          *      s_data_constant, s_data_in, s_data_out
1909          */
             /* Buffer sizes start from the middle tcp_wmem/tcp_rmem entries. */
1910         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1911         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1912
             /* The socket count is bumped with bottom halves disabled. */
1913         local_bh_disable();
1914         percpu_counter_inc(&tcp_sockets_allocated);
1915         local_bh_enable();
1916
1917         return 0;
1918 }
1919
/*
 * Release the TCP specific resources held by @sk: transmit timers,
 * congestion control state, queued skbs (write queue, out-of-order
 * queue, prequeue and, with CONFIG_NET_DMA, the async wait queue), the
 * MD5 key list, the bind bucket, the cached sendmsg page and the cookie
 * values reference.  Finally the socket is removed from the
 * tcp_sockets_allocated count.
 */
1920 void tcp_v4_destroy_sock(struct sock *sk)
1921 {
1922         struct tcp_sock *tp = tcp_sk(sk);
1923
1924         tcp_clear_xmit_timers(sk);
1925
1926         tcp_cleanup_congestion_control(sk);
1927
1928         /* Clean up the write buffer. */
1929         tcp_write_queue_purge(sk);
1930
1931         /* Cleans up our, hopefully empty, out_of_order_queue. */
1932         __skb_queue_purge(&tp->out_of_order_queue);
1933
1934 #ifdef CONFIG_TCP_MD5SIG
1935         /* Clean up the MD5 key list, if any */
1936         if (tp->md5sig_info) {
1937                 tcp_v4_clear_md5_list(sk);
1938                 kfree(tp->md5sig_info);
1939                 tp->md5sig_info = NULL;
1940         }
1941 #endif
1942
1943 #ifdef CONFIG_NET_DMA
1944         /* Cleans up our sk_async_wait_queue */
1945         __skb_queue_purge(&sk->sk_async_wait_queue);
1946 #endif
1947
1948         /* Clean prequeue, it must be empty really */
1949         __skb_queue_purge(&tp->ucopy.prequeue);
1950
1951         /* Clean up a referenced TCP bind bucket. */
1952         if (inet_csk(sk)->icsk_bind_hash)
1953                 inet_put_port(sk);
1954
1955         /*
1956          * If sendmsg cached page exists, toss it.
1957          */
1958         if (sk->sk_sndmsg_page) {
1959                 __free_page(sk->sk_sndmsg_page);
1960                 sk->sk_sndmsg_page = NULL;
1961         }
1962
1963         /* TCP Cookie Transactions */
             /* Drop the kref; the pointer is cleared either way. */
1964         if (tp->cookie_values != NULL) {
1965                 kref_put(&tp->cookie_values->kref,
1966                          tcp_cookie_values_release);
1967                 tp->cookie_values = NULL;
1968         }
1969
1970         percpu_counter_dec(&tcp_sockets_allocated);
1971 }
1972 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1973
1974 #ifdef CONFIG_PROC_FS
1975 /* Proc filesystem TCP sock list dumping. */
1976
/*
 * Return the first time-wait socket on @head, or NULL when the chain is
 * empty.
 *
 * NOTE(review): this uses list_entry() on an hlist_nulls node while
 * tw_next() uses hlist_nulls_entry(); presumably both reduce to the same
 * container lookup - confirm.
 */
1977 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1978 {
1979         return hlist_nulls_empty(head) ? NULL :
1980                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1981 }
1982
/*
 * Return the time-wait socket that follows @tw on its chain, or NULL
 * when the next pointer is a nulls marker (end of chain).
 */
1983 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1984 {
1985         return !is_a_nulls(tw->tw_node.next) ?
1986                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1987 }
1988
1989 /*
1990  * Get next listener socket following cur.  If cur is NULL, get first socket
1991  * starting from bucket given in st->bucket; when st->bucket is zero the
1992  * very first socket in the hash table is returned.
      *
      * The walk yields listening sockets of st->family in the seq file's
      * network namespace and, for every listener with a non-empty request
      * queue, the queued requests whose family matches (st->state is then
      * TCP_SEQ_STATE_OPENREQ and cur is a struct request_sock).
      *
      * Locking on return:
      *  o non-NULL result: the lock of listening bucket st->bucket is held;
      *    in TCP_SEQ_STATE_OPENREQ the syn_wait_lock of st->syn_wait_sk is
      *    read-held as well.
      *  o NULL result: every bucket was visited and no lock is held.
1993  */
1994 static void *listening_get_next(struct seq_file *seq, void *cur)
1995 {
1996         struct inet_connection_sock *icsk;
1997         struct hlist_nulls_node *node;
1998         struct sock *sk = cur;
1999         struct inet_listen_hashbucket *ilb;
2000         struct tcp_iter_state *st = seq->private;
2001         struct net *net = seq_file_net(seq);
2002
             /* Fresh start: lock the bucket and begin at its head. */
2003         if (!sk) {
2004                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2005                 spin_lock_bh(&ilb->lock);
2006                 sk = sk_nulls_head(&ilb->head);
2007                 st->offset = 0;
2008                 goto get_sk;
2009         }
             /* Continuing: the bucket lock is still held from the last call. */
2010         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2011         ++st->num;
2012         ++st->offset;
2013
2014         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2015                 struct request_sock *req = cur;
2016
2017                 icsk = inet_csk(st->syn_wait_sk);
2018                 req = req->dl_next;
                     /* Walk the rest of this chain, then the later syn_table slots. */
2019                 while (1) {
2020                         while (req) {
2021                                 if (req->rsk_ops->family == st->family) {
2022                                         cur = req;
2023                                         goto out;
2024                                 }
2025                                 req = req->dl_next;
2026                         }
2027                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2028                                 break;
2029 get_req:
2030                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2031                 }
                     /* Requests exhausted: go back to walking listeners. */
2032                 sk        = sk_nulls_next(st->syn_wait_sk);
2033                 st->state = TCP_SEQ_STATE_LISTENING;
2034                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2035         } else {
                     /* cur was a listener: visit its pending requests first. */
2036                 icsk = inet_csk(sk);
2037                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2038                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2039                         goto start_req;
2040                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2041                 sk = sk_nulls_next(sk);
2042         }
2043 get_sk:
2044         sk_nulls_for_each_from(sk, node) {
2045                 if (!net_eq(sock_net(sk), net))
2046                         continue;
2047                 if (sk->sk_family == st->family) {
2048                         cur = sk;
2049                         goto out;
2050                 }
                     /*
                      * Listener of another family: it is not reported itself,
                      * but its request queue is still scanned.
                      */
2051                 icsk = inet_csk(sk);
2052                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2053                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2054 start_req:
2055                         st->uid         = sock_i_uid(sk);
2056                         st->syn_wait_sk = sk;
2057                         st->state       = TCP_SEQ_STATE_OPENREQ;
2058                         st->sbucket     = 0;
2059                         goto get_req;
2060                 }
2061                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2062         }
             /* Bucket exhausted: release it and move on to the next one. */
2063         spin_unlock_bh(&ilb->lock);
2064         st->offset = 0;
2065         if (++st->bucket < INET_LHTABLE_SIZE) {
2066                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2067                 spin_lock_bh(&ilb->lock);
2068                 sk = sk_nulls_head(&ilb->head);
2069                 goto get_sk;
2070         }
2071         cur = NULL;
2072 out:
2073         return cur;
2074 }
2075
/*
 * Position the listening walk on entry number *pos, counted from the
 * first listening bucket.  *pos is decremented once for every entry
 * stepped over, so on a NULL return it holds the number of entries that
 * still have to be skipped.  Returns the entry reached, or NULL when the
 * listening table ran out first.
 */
2076 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2077 {
2078         struct tcp_iter_state *iter = seq->private;
2079         void *entry;
2080
2081         iter->offset = 0;
2082         iter->bucket = 0;
2083
2084         for (entry = listening_get_next(seq, NULL); entry && *pos; --*pos)
2085                 entry = listening_get_next(seq, entry);
2086
2087         return entry;
2088 }
2091
/*
 * Nonzero when both the established chain and the time-wait chain of
 * ehash bucket st->bucket are empty.  No lock is taken here.
 */
2092 static inline int empty_bucket(struct tcp_iter_state *st)
2093 {
2094         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2095                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2096 }
2097
2098 /*
2099  * Get first established socket starting from bucket given in st->bucket.
2100  * If st->bucket is zero, the very first socket in the hash is returned.
      *
      * Only sockets of st->family in the seq file's network namespace are
      * considered.  Within a bucket the established chain is searched
      * before the time-wait chain; a hit on the latter leaves st->state at
      * TCP_SEQ_STATE_TIME_WAIT and returns an inet_timewait_sock.
      *
      * On a non-NULL return the lock of bucket st->bucket is held.  NULL
      * means no bucket held a match; no lock is held then.
2101  */
2102 static void *established_get_first(struct seq_file *seq)
2103 {
2104         struct tcp_iter_state *st = seq->private;
2105         struct net *net = seq_file_net(seq);
2106         void *rc = NULL;
2107
2108         st->offset = 0;
2109         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2110                 struct sock *sk;
2111                 struct hlist_nulls_node *node;
2112                 struct inet_timewait_sock *tw;
2113                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2114
2115                 /* Lockless fast path for the common case of empty buckets */
2116                 if (empty_bucket(st))
2117                         continue;
2118
2119                 spin_lock_bh(lock);
2120                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2121                         if (sk->sk_family != st->family ||
2122                             !net_eq(sock_net(sk), net)) {
2123                                 continue;
2124                         }
2125                         rc = sk;
2126                         goto out;
2127                 }
                     /* Nothing on the established chain: try the time-wait chain. */
2128                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2129                 inet_twsk_for_each(tw, node,
2130                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2131                         if (tw->tw_family != st->family ||
2132                             !net_eq(twsk_net(tw), net)) {
2133                                 continue;
2134                         }
2135                         rc = tw;
2136                         goto out;
2137                 }
                     /* No match in this bucket: unlock and restore the state. */
2138                 spin_unlock_bh(lock);
2139                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2140         }
2141 out:
2142         return rc;
2143 }
2144
/*
 * Advance from @cur to the next established or time-wait socket of
 * st->family in the seq file's network namespace.
 *
 * @cur is interpreted according to st->state: an inet_timewait_sock in
 * TCP_SEQ_STATE_TIME_WAIT, a sock otherwise.  Within a bucket the
 * established chain is walked first and then the time-wait chain; when
 * the latter is exhausted the bucket lock is dropped and the lock of
 * the next non-empty bucket is taken.
 *
 * Increments st->num and st->offset.  A non-NULL return leaves the lock
 * of bucket st->bucket held; NULL means the table is exhausted and no
 * lock is held.
 */
2145 static void *established_get_next(struct seq_file *seq, void *cur)
2146 {
2147         struct sock *sk = cur;
2148         struct inet_timewait_sock *tw;
2149         struct hlist_nulls_node *node;
2150         struct tcp_iter_state *st = seq->private;
2151         struct net *net = seq_file_net(seq);
2152
2153         ++st->num;
2154         ++st->offset;
2155
2156         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2157                 tw = cur;
2158                 tw = tw_next(tw);
2159 get_tw:
                     /* Skip time-wait entries of other families or namespaces. */
2160                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2161                         tw = tw_next(tw);
2162                 }
2163                 if (tw) {
2164                         cur = tw;
2165                         goto out;
2166                 }
                     /* Bucket finished: release it before looking further. */
2167                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2168                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2169
2170                 /* Look for next non empty bucket */
2171                 st->offset = 0;
2172                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2173                                 empty_bucket(st))
2174                         ;
2175                 if (st->bucket > tcp_hashinfo.ehash_mask)
2176                         return NULL;
2177
2178                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2179                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2180         } else
2181                 sk = sk_nulls_next(sk);
2182
2183         sk_nulls_for_each_from(sk, node) {
2184                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2185                         goto found;
2186         }
2187
             /* Established chain exhausted: switch to the time-wait chain. */
2188         st->state = TCP_SEQ_STATE_TIME_WAIT;
2189         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2190         goto get_tw;
2191 found:
2192         cur = sk;
2193 out:
2194         return cur;
2195 }
2196
/*
 * Return entry number @pos of the established/time-wait walk, counted
 * from bucket 0, or NULL when the walk ends before reaching it.
 */
2197 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2198 {
2199         struct tcp_iter_state *iter = seq->private;
2200         void *entry;
2201
2202         iter->bucket = 0;
2203
2204         for (entry = established_get_first(seq); entry && pos; --pos)
2205                 entry = established_get_next(seq, entry);
2206
2207         return entry;
2208 }
2211
/*
 * Return entry number @pos of the combined walk: listening sockets (and
 * their pending requests) first, established and time-wait sockets
 * afterwards.  Whatever part of @pos the listening walk did not consume
 * is applied to the established walk.  Returns NULL past the end.
 */
2212 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2213 {
2214         struct tcp_iter_state *iter = seq->private;
2215         void *entry;
2216
2217         iter->state = TCP_SEQ_STATE_LISTENING;
2218         entry = listening_get_idx(seq, &pos);
2219         if (entry)
2220                 return entry;
2221
2222         iter->state = TCP_SEQ_STATE_ESTABLISHED;
2223         return established_get_idx(seq, pos);
2224 }
2227
/*
 * Try to resume the walk at the position remembered in the iterator
 * state (st->state, st->bucket, st->offset) instead of starting over at
 * the first bucket.
 *
 * The walk restarts at the head of st->bucket and steps forward by the
 * saved offset.  If the listening table yields nothing, the search
 * continues in the established table from bucket 0.  st->num is
 * restored afterwards because the stepping helpers increment it.
 *
 * Returns the entry reached, or NULL when none could be found.
 *
 * NOTE(review): `offset` is not reset before the fall-through, so
 * whatever is left of it after the listening walk is also applied to the
 * established walk - confirm that this is intended.
 */
2228 static void *tcp_seek_last_pos(struct seq_file *seq)
2229 {
2230         struct tcp_iter_state *st = seq->private;
2231         int offset = st->offset;
2232         int orig_num = st->num;
2233         void *rc = NULL;
2234
2235         switch (st->state) {
2236         case TCP_SEQ_STATE_OPENREQ:
2237         case TCP_SEQ_STATE_LISTENING:
2238                 if (st->bucket >= INET_LHTABLE_SIZE)
2239                         break;
2240                 st->state = TCP_SEQ_STATE_LISTENING;
2241                 rc = listening_get_next(seq, NULL);
2242                 while (offset-- && rc)
2243                         rc = listening_get_next(seq, rc);
2244                 if (rc)
2245                         break;
2246                 st->bucket = 0;
2247                 /* Fallthrough */
2248         case TCP_SEQ_STATE_ESTABLISHED:
2249         case TCP_SEQ_STATE_TIME_WAIT:
2250                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2251                 if (st->bucket > tcp_hashinfo.ehash_mask)
2252                         break;
2253                 rc = established_get_first(seq);
2254                 while (offset-- && rc)
2255                         rc = established_get_next(seq, rc);
2256         }
2257
2258         st->num = orig_num;
2259
2260         return rc;
2261 }
2262
/*
 * seq_file start callback (installed by tcp_proc_register()).
 *
 * When *pos is non-zero and equals the position recorded by the previous
 * call, the walk is resumed through tcp_seek_last_pos().  If that is not
 * applicable or finds nothing, the iterator state is reset and the walk
 * restarts from the beginning: SEQ_START_TOKEN for position 0, entry
 * *pos - 1 otherwise.  *pos is recorded in st->last_pos in all cases.
 */
2263 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2264 {
2265         struct tcp_iter_state *iter = seq->private;
2266         void *entry = NULL;
2267
2268         if (*pos && *pos == iter->last_pos)
2269                 entry = tcp_seek_last_pos(seq);
2270
2271         if (!entry) {
2272                 iter->state = TCP_SEQ_STATE_LISTENING;
2273                 iter->num = 0;
2274                 iter->bucket = 0;
2275                 iter->offset = 0;
2276                 entry = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2277         }
2278
2279         iter->last_pos = *pos;
2280         return entry;
2281 }
2284
/*
 * seq_file next callback: return the entry following @v.
 *
 * After SEQ_START_TOKEN the first real entry is fetched.  In the
 * listening/open-request states the listening walk is advanced and, once
 * it is exhausted, the walk switches to the first established entry.
 * In the established/time-wait states the established walk is advanced.
 *
 * *pos is incremented and recorded in st->last_pos on every call.
 * Returns NULL at the end of the walk.
 */
2285 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2286 {
2287         struct tcp_iter_state *st = seq->private;
2288         void *rc = NULL;
2289
2290         if (v == SEQ_START_TOKEN) {
2291                 rc = tcp_get_idx(seq, 0);
2292                 goto out;
2293         }
2294
2295         switch (st->state) {
2296         case TCP_SEQ_STATE_OPENREQ:
2297         case TCP_SEQ_STATE_LISTENING:
2298                 rc = listening_get_next(seq, v);
                     /* Listening table done: continue with established sockets. */
2299                 if (!rc) {
2300                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2301                         st->bucket = 0;
2302                         st->offset = 0;
2303                         rc        = established_get_first(seq);
2304                 }
2305                 break;
2306         case TCP_SEQ_STATE_ESTABLISHED:
2307         case TCP_SEQ_STATE_TIME_WAIT:
2308                 rc = established_get_next(seq, v);
2309                 break;
2310         }
2311 out:
2312         ++*pos;
2313         st->last_pos = *pos;
2314         return rc;
2315 }
2316
/*
 * seq_file stop callback: release whatever locks the walk still holds
 * for the current entry @v.
 *
 *  o TCP_SEQ_STATE_OPENREQ: the syn_wait_lock of st->syn_wait_sk (when
 *    @v is non-NULL), then the listening bucket lock as below.
 *  o TCP_SEQ_STATE_LISTENING: the lock of listening bucket st->bucket,
 *    unless @v is SEQ_START_TOKEN.
 *  o TCP_SEQ_STATE_TIME_WAIT / TCP_SEQ_STATE_ESTABLISHED: the lock of
 *    ehash bucket st->bucket, when @v is non-NULL.
 */
2317 static void tcp_seq_stop(struct seq_file *seq, void *v)
2318 {
2319         struct tcp_iter_state *st = seq->private;
2320
2321         switch (st->state) {
2322         case TCP_SEQ_STATE_OPENREQ:
2323                 if (v) {
2324                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2325                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2326                 }
                     /* No break: execution continues into the LISTENING case. */
2327         case TCP_SEQ_STATE_LISTENING:
2328                 if (v != SEQ_START_TOKEN)
2329                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2330                 break;
2331         case TCP_SEQ_STATE_TIME_WAIT:
2332         case TCP_SEQ_STATE_ESTABLISHED:
2333                 if (v)
2334                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2335                 break;
2336         }
2337 }
2338
/*
 * open() handler of the TCP proc files.
 *
 * Opens a per-namespace seq file using the seq_ops of the afinfo that is
 * stored in the proc entry, with a struct tcp_iter_state as private
 * area, then records the address family and clears last_pos in it.
 *
 * Returns 0 on success or the negative value from seq_open_net().
 */
2339 static int tcp_seq_open(struct inode *inode, struct file *file)
2340 {
2341         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2342         struct tcp_iter_state *iter;
2343         struct seq_file *seq;
2344         int rc = seq_open_net(inode, file, &afinfo->seq_ops,
2345                               sizeof(*iter));
2346
2347         if (rc < 0)
2348                 return rc;
2349         seq = file->private_data;
2350         iter = seq->private;
2351         iter->family = afinfo->family;
2352         iter->last_pos = 0;
2353         return 0;
2354 }
2355
/*
 * Create the proc entry afinfo->name in @net's proc_net directory.
 *
 * The file and seq operations embedded in @afinfo are filled in first;
 * @afinfo itself becomes the private data of the entry (tcp_seq_open()
 * reads it back).
 *
 * Returns 0 on success or -ENOMEM when the entry cannot be created.
 */
2356 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2357 {
2358         afinfo->seq_ops.start = tcp_seq_start;
2359         afinfo->seq_ops.next = tcp_seq_next;
2360         afinfo->seq_ops.stop = tcp_seq_stop;
2361
2362         afinfo->seq_fops.open = tcp_seq_open;
2363         afinfo->seq_fops.read = seq_read;
2364         afinfo->seq_fops.llseek = seq_lseek;
2365         afinfo->seq_fops.release = seq_release_net;
2366
2367         if (!proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2368                               &afinfo->seq_fops, afinfo))
2369                 return -ENOMEM;
2370
2371         return 0;
2372 }
2376 EXPORT_SYMBOL(tcp_proc_register);
2377
/*
 * Remove the proc entry named afinfo->name from @net (the counterpart of
 * tcp_proc_register()).
 */
2378 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2379 {
2380         proc_net_remove(net, afinfo->name);
2381 }
2382 EXPORT_SYMBOL(tcp_proc_unregister);
2383
/*
 * Emit one line describing the pending connection request @req of
 * listener @sk into seq file @f.
 *
 * @i is printed as the entry number and @uid as the owner.  The state
 * column is always TCP_SYN_RECV, both queue columns are 0, and the timer
 * column shows an active timer (1) with the time left until
 * req->expires.
 *
 * NOTE(review): the trailing %n presumably stores the number of
 * characters emitted into *len - confirm against seq_printf().
 */
2384 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2385                          struct seq_file *f, int i, int uid, int *len)
2386 {
2387         const struct inet_request_sock *ireq = inet_rsk(req);
2388         int ttd = req->expires - jiffies;
2389
2390         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2391                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2392                 i,
2393                 ireq->loc_addr,
2394                 ntohs(inet_sk(sk)->inet_sport),
2395                 ireq->rmt_addr,
2396                 ntohs(ireq->rmt_port),
2397                 TCP_SYN_RECV,
2398                 0, 0, /* could print option size, but that is af dependent. */
2399                 1,    /* timers active (only the expire timer) */
2400                 jiffies_to_clock_t(ttd),
2401                 req->retrans,
2402                 uid,
2403                 0,  /* non standard timer */
2404                 0, /* open_requests have no inode */
2405                 atomic_read(&sk->sk_refcnt),
2406                 req,
2407                 len);
2408 }
2409
/*
 * Emit one line describing TCP socket @sk into seq file @f; @i is
 * printed as the entry number.
 *
 * The timer column is encoded as:
 *   1 - retransmit timer pending (ICSK_TIME_RETRANS)
 *   4 - zero window probe timer pending (ICSK_TIME_PROBE0)
 *   2 - sk->sk_timer pending
 *   0 - no timer pending
 *
 * The receive queue column is sk_ack_backlog for listening sockets and
 * rcv_nxt - copied_seq (clamped at 0) for all others.  The last column
 * is -1 while tcp_in_initial_slowstart() holds, snd_ssthresh otherwise.
 *
 * NOTE(review): the trailing %n presumably stores the number of
 * characters emitted into *len - confirm against seq_printf().
 */
2410 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2411 {
2412         int timer_active;
2413         unsigned long timer_expires;
2414         const struct tcp_sock *tp = tcp_sk(sk);
2415         const struct inet_connection_sock *icsk = inet_csk(sk);
2416         const struct inet_sock *inet = inet_sk(sk);
2417         __be32 dest = inet->inet_daddr;
2418         __be32 src = inet->inet_rcv_saddr;
2419         __u16 destp = ntohs(inet->inet_dport);
2420         __u16 srcp = ntohs(inet->inet_sport);
2421         int rx_queue;
2422
2423         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2424                 timer_active    = 1;
2425                 timer_expires   = icsk->icsk_timeout;
2426         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2427                 timer_active    = 4;
2428                 timer_expires   = icsk->icsk_timeout;
2429         } else if (timer_pending(&sk->sk_timer)) {
2430                 timer_active    = 2;
2431                 timer_expires   = sk->sk_timer.expires;
2432         } else {
                     /* No timer: "expires now", so the printed delta is 0. */
2433                 timer_active    = 0;
2434                 timer_expires = jiffies;
2435         }
2436
2437         if (sk->sk_state == TCP_LISTEN)
2438                 rx_queue = sk->sk_ack_backlog;
2439         else
2440                 /*
2441                  * because we dont lock socket, we might find a transient negative value
2442                  */
2443                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2444
2445         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2446                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2447                 i, src, srcp, dest, destp, sk->sk_state,
2448                 tp->write_seq - tp->snd_una,
2449                 rx_queue,
2450                 timer_active,
2451                 jiffies_to_clock_t(timer_expires - jiffies),
2452                 icsk->icsk_retransmits,
2453                 sock_i_uid(sk),
2454                 icsk->icsk_probes_out,
2455                 sock_i_ino(sk),
2456                 atomic_read(&sk->sk_refcnt), sk,
2457                 jiffies_to_clock_t(icsk->icsk_rto),
2458                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2459                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2460                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2461                 tp->snd_cwnd,
2462                 len);
2463 }
2464
2465 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2466                                struct seq_file *f, int i, int *len)
2467 {
2468         __be32 dest, src;
2469         __u16 destp, srcp;
2470         int ttd = tw->tw_ttd - jiffies;
2471
2472         if (ttd < 0)
2473                 ttd = 0;
2474
2475         dest  = tw->tw_daddr;
2476         src   = tw->tw_rcv_saddr;
2477         destp = ntohs(tw->tw_dport);
2478         srcp  = ntohs(tw->tw_sport);
2479
2480         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2481                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2482                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2483                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2484                 atomic_read(&tw->tw_refcnt), tw, len);
2485 }
2486
2487 #define TMPSZ 150
2488
2489 static int tcp4_seq_show(struct seq_file *seq, void *v)
2490 {
2491         struct tcp_iter_state *st;
2492         int len;
2493
2494         if (v == SEQ_START_TOKEN) {
2495                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2496                            "  sl  local_address rem_address   st tx_queue "
2497                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2498                            "inode");
2499                 goto out;
2500         }
2501         st = seq->private;
2502
2503         switch (st->state) {
2504         case TCP_SEQ_STATE_LISTENING:
2505         case TCP_SEQ_STATE_ESTABLISHED:
2506                 get_tcp4_sock(v, seq, st->num, &len);
2507                 break;
2508         case TCP_SEQ_STATE_OPENREQ:
2509                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2510                 break;
2511         case TCP_SEQ_STATE_TIME_WAIT:
2512                 get_timewait4_sock(v, seq, st->num, &len);
2513                 break;
2514         }
2515         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2516 out:
2517         return 0;
2518 }
2519
2520 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2521         .name           = "tcp",
2522         .family         = AF_INET,
2523         .seq_fops       = {
2524                 .owner          = THIS_MODULE,
2525         },
2526         .seq_ops        = {
2527                 .show           = tcp4_seq_show,
2528         },
2529 };
2530
2531 static int __net_init tcp4_proc_init_net(struct net *net)
2532 {
2533         return tcp_proc_register(net, &tcp4_seq_afinfo);
2534 }
2535
2536 static void __net_exit tcp4_proc_exit_net(struct net *net)
2537 {
2538         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2539 }
2540
2541 static struct pernet_operations tcp4_net_ops = {
2542         .init = tcp4_proc_init_net,
2543         .exit = tcp4_proc_exit_net,
2544 };
2545
2546 int __init tcp4_proc_init(void)
2547 {
2548         return register_pernet_subsys(&tcp4_net_ops);
2549 }
2550
2551 void tcp4_proc_exit(void)
2552 {
2553         unregister_pernet_subsys(&tcp4_net_ops);
2554 }
2555 #endif /* CONFIG_PROC_FS */
2556
2557 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2558 {
2559         const struct iphdr *iph = skb_gro_network_header(skb);
2560
2561         switch (skb->ip_summed) {
2562         case CHECKSUM_COMPLETE:
2563                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2564                                   skb->csum)) {
2565                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2566                         break;
2567                 }
2568
2569                 /* fall through */
2570         case CHECKSUM_NONE:
2571                 NAPI_GRO_CB(skb)->flush = 1;
2572                 return NULL;
2573         }
2574
2575         return tcp_gro_receive(head, skb);
2576 }
2577
2578 int tcp4_gro_complete(struct sk_buff *skb)
2579 {
2580         const struct iphdr *iph = ip_hdr(skb);
2581         struct tcphdr *th = tcp_hdr(skb);
2582
2583         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2584                                   iph->saddr, iph->daddr, 0);
2585         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2586
2587         return tcp_gro_complete(skb);
2588 }
2589
2590 struct proto tcp_prot = {
2591         .name                   = "TCP",
2592         .owner                  = THIS_MODULE,
2593         .close                  = tcp_close,
2594         .connect                = tcp_v4_connect,
2595         .disconnect             = tcp_disconnect,
2596         .accept                 = inet_csk_accept,
2597         .ioctl                  = tcp_ioctl,
2598         .init                   = tcp_v4_init_sock,
2599         .destroy                = tcp_v4_destroy_sock,
2600         .shutdown               = tcp_shutdown,
2601         .setsockopt             = tcp_setsockopt,
2602         .getsockopt             = tcp_getsockopt,
2603         .recvmsg                = tcp_recvmsg,
2604         .sendmsg                = tcp_sendmsg,
2605         .sendpage               = tcp_sendpage,
2606         .backlog_rcv            = tcp_v4_do_rcv,
2607         .hash                   = inet_hash,
2608         .unhash                 = inet_unhash,
2609         .get_port               = inet_csk_get_port,
2610         .enter_memory_pressure  = tcp_enter_memory_pressure,
2611         .sockets_allocated      = &tcp_sockets_allocated,
2612         .orphan_count           = &tcp_orphan_count,
2613         .memory_allocated       = &tcp_memory_allocated,
2614         .memory_pressure        = &tcp_memory_pressure,
2615         .sysctl_mem             = sysctl_tcp_mem,
2616         .sysctl_wmem            = sysctl_tcp_wmem,
2617         .sysctl_rmem            = sysctl_tcp_rmem,
2618         .max_header             = MAX_TCP_HEADER,
2619         .obj_size               = sizeof(struct tcp_sock),
2620         .slab_flags             = SLAB_DESTROY_BY_RCU,
2621         .twsk_prot              = &tcp_timewait_sock_ops,
2622         .rsk_prot               = &tcp_request_sock_ops,
2623         .h.hashinfo             = &tcp_hashinfo,
2624         .no_autobind            = true,
2625 #ifdef CONFIG_COMPAT
2626         .compat_setsockopt      = compat_tcp_setsockopt,
2627         .compat_getsockopt      = compat_tcp_getsockopt,
2628 #endif
2629 };
2630 EXPORT_SYMBOL(tcp_prot);
2631
2632
2633 static int __net_init tcp_sk_init(struct net *net)
2634 {
2635         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2636                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2637 }
2638
2639 static void __net_exit tcp_sk_exit(struct net *net)
2640 {
2641         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2642 }
2643
2644 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2645 {
2646         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2647 }
2648
2649 static struct pernet_operations __net_initdata tcp_sk_ops = {
2650        .init       = tcp_sk_init,
2651        .exit       = tcp_sk_exit,
2652        .exit_batch = tcp_sk_exit_batch,
2653 };
2654
2655 void __init tcp_v4_init(void)
2656 {
2657         inet_hashinfo_init(&tcp_hashinfo);
2658         if (register_pernet_subsys(&tcp_sk_ops))
2659                 panic("Failed to create the TCP control socket.\n");
2660 }