net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63 #include <linux/slab.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75
  76 #include <linux/inet.h>
  77 #include <linux/ipv6.h>
  78 #include <linux/stddef.h>
  79 #include <linux/proc_fs.h>
  80 #include <linux/seq_file.h>
  81
  82 #include <linux/crypto.h>
  83 #include <linux/scatterlist.h>
  84
/* Read by tcp_twsk_unique(): when a twp is supplied, a TIME-WAIT bucket is
 * only handed over if this is non-zero and the bucket's last timestamp is
 * more than one second old.
 */
int sysctl_tcp_tw_reuse __read_mostly;
/* Exported for use outside this file. */
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);
  88
  89
/*
 * MD5 signature helpers.
 *
 * With CONFIG_TCP_MD5SIG the two helpers are only declared here; without
 * it the key lookup is replaced by a stub that always returns NULL, so
 * callers can invoke tcp_v4_md5_do_lookup() without an #ifdef of their own.
 */
#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        /* MD5 support compiled out: there is never a key. */
        return NULL;
}
#endif
 102
/* Socket lookup state for TCP; handed to inet_lookup() by tcp_v4_err(). */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
 105
 106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 107 {
 108         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 109                                           ip_hdr(skb)->saddr,
 110                                           tcp_hdr(skb)->dest,
 111                                           tcp_hdr(skb)->source);
 112 }
 113
 114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 115 {
 116         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 117         struct tcp_sock *tp = tcp_sk(sk);
 118
 119         /* With PAWS, it is safe from the viewpoint
 120            of data integrity. Even without PAWS it is safe provided sequence
 121            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 122
 123            Actually, the idea is close to VJ's one, only timestamp cache is
 124            held not per host, but per port pair and TW bucket is used as state
 125            holder.
 126
 127            If TW bucket has been already destroyed we fall back to VJ's scheme
 128            and use initial timestamp retrieved from peer table.
 129          */
 130         if (tcptw->tw_ts_recent_stamp &&
 131             (twp == NULL || (sysctl_tcp_tw_reuse &&
 132                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 133                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 134                 if (tp->write_seq == 0)
 135                         tp->write_seq = 1;
 136                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 137                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 138                 sock_hold(sktw);
 139                 return 1;
 140         }
 141
 142         return 0;
 143 }
 144 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 145
 146 /* This will initiate an outgoing connection. */
 147 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 148 {
 149         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 150         struct inet_sock *inet = inet_sk(sk);
 151         struct tcp_sock *tp = tcp_sk(sk);
 152         __be16 orig_sport, orig_dport;
 153         __be32 daddr, nexthop;
 154         struct flowi4 fl4;
 155         struct rtable *rt;
 156         int err;
 157         struct ip_options_rcu *inet_opt;
 158
 159         if (addr_len < sizeof(struct sockaddr_in))
 160                 return -EINVAL;
 161
 162         if (usin->sin_family != AF_INET)
 163                 return -EAFNOSUPPORT;
 164
 165         nexthop = daddr = usin->sin_addr.s_addr;
 166         inet_opt = rcu_dereference_protected(inet->inet_opt,
 167                                              sock_owned_by_user(sk));
 168         if (inet_opt && inet_opt->opt.srr) {
 169                 if (!daddr)
 170                         return -EINVAL;
 171                 nexthop = inet_opt->opt.faddr;
 172         }
 173
 174         orig_sport = inet->inet_sport;
 175         orig_dport = usin->sin_port;
 176         rt = ip_route_connect(&fl4, nexthop, inet->inet_saddr,
 177                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 178                               IPPROTO_TCP,
 179                               orig_sport, orig_dport, sk, true);
 180         if (IS_ERR(rt)) {
 181                 err = PTR_ERR(rt);
 182                 if (err == -ENETUNREACH)
 183                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 184                 return err;
 185         }
 186
 187         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 188                 ip_rt_put(rt);
 189                 return -ENETUNREACH;
 190         }
 191
 192         if (!inet_opt || !inet_opt->opt.srr)
 193                 daddr = rt->rt_dst;
 194
 195         if (!inet->inet_saddr)
 196                 inet->inet_saddr = rt->rt_src;
 197         inet->inet_rcv_saddr = inet->inet_saddr;
 198
 199         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 200                 /* Reset inherited state */
 201                 tp->rx_opt.ts_recent       = 0;
 202                 tp->rx_opt.ts_recent_stamp = 0;
 203                 tp->write_seq              = 0;
 204         }
 205
 206         if (tcp_death_row.sysctl_tw_recycle &&
 207             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 208                 struct inet_peer *peer = rt_get_peer(rt);
 209                 /*
 210                  * VJ's idea. We save last timestamp seen from
 211                  * the destination in peer table, when entering state
 212                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 213                  * when trying new connection.
 214                  */
 215                 if (peer) {
 216                         inet_peer_refcheck(peer);
 217                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 218                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 219                                 tp->rx_opt.ts_recent = peer->tcp_ts;
 220                         }
 221                 }
 222         }
 223
 224         inet->inet_dport = usin->sin_port;
 225         inet->inet_daddr = daddr;
 226
 227         inet_csk(sk)->icsk_ext_hdr_len = 0;
 228         if (inet_opt)
 229                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 230
 231         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 232
 233         /* Socket identity is still unknown (sport may be zero).
 234          * However we set state to SYN-SENT and not releasing socket
 235          * lock select source port, enter ourselves into the hash tables and
 236          * complete initialization after this.
 237          */
 238         tcp_set_state(sk, TCP_SYN_SENT);
 239         err = inet_hash_connect(&tcp_death_row, sk);
 240         if (err)
 241                 goto failure;
 242
 243         rt = ip_route_newports(&fl4, rt, orig_sport, orig_dport,
 244                                inet->inet_sport, inet->inet_dport, sk);
 245         if (IS_ERR(rt)) {
 246                 err = PTR_ERR(rt);
 247                 rt = NULL;
 248                 goto failure;
 249         }
 250         /* OK, now commit destination to socket.  */
 251         sk->sk_gso_type = SKB_GSO_TCPV4;
 252         sk_setup_caps(sk, &rt->dst);
 253
 254         if (!tp->write_seq)
 255                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 256                                                            inet->inet_daddr,
 257                                                            inet->inet_sport,
 258                                                            usin->sin_port);
 259
 260         inet->inet_id = tp->write_seq ^ jiffies;
 261
 262         err = tcp_connect(sk);
 263         rt = NULL;
 264         if (err)
 265                 goto failure;
 266
 267         return 0;
 268
 269 failure:
 270         /*
 271          * This unhashes the socket and releases the local port,
 272          * if necessary.
 273          */
 274         tcp_set_state(sk, TCP_CLOSE);
 275         ip_rt_put(rt);
 276         sk->sk_route_caps = 0;
 277         inet->inet_dport = 0;
 278         return err;
 279 }
 280 EXPORT_SYMBOL(tcp_v4_connect);
 281
 282 /*
 283  * This routine does path mtu discovery as defined in RFC1191.
 284  */
 285 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 286 {
 287         struct dst_entry *dst;
 288         struct inet_sock *inet = inet_sk(sk);
 289
 290         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 291          * send out by Linux are always <576bytes so they should go through
 292          * unfragmented).
 293          */
 294         if (sk->sk_state == TCP_LISTEN)
 295                 return;
 296
 297         /* We don't check in the destentry if pmtu discovery is forbidden
 298          * on this route. We just assume that no packet_to_big packets
 299          * are send back when pmtu discovery is not active.
 300          * There is a small race when the user changes this flag in the
 301          * route, but I think that's acceptable.
 302          */
 303         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 304                 return;
 305
 306         dst->ops->update_pmtu(dst, mtu);
 307
 308         /* Something is about to be wrong... Remember soft error
 309          * for the case, if this connection will not able to recover.
 310          */
 311         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 312                 sk->sk_err_soft = EMSGSIZE;
 313
 314         mtu = dst_mtu(dst);
 315
 316         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 317             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 318                 tcp_sync_mss(sk, mtu);
 319
 320                 /* Resend the TCP packet because it's
 321                  * clear that the old packet has been
 322                  * dropped. This is the new "fast" path mtu
 323                  * discovery.
 324                  */
 325                 tcp_simple_retransmit(sk);
 326         } /* else let the usual retransmit timer handle it */
 327 }
 328
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.
 *
 * @icmp_skb: the ICMP packet; its data starts at the IP header of the
 *            packet that triggered the error, followed by the first
 *            8 bytes of the TCP header, which are used to find the
 *            matching socket.
 * @info:     handed to do_pmtu_discovery() as the MTU for
 *            ICMP_FRAG_NEEDED; otherwise unused here.
 *
 * NOTE(review): older versions of this comment described an "err"
 * argument (err < 0: close the socket and return the error to the user,
 * err > 0: icmp type << 8 | icmp code).  The function has no such
 * parameter; the ICMP type and code are read from icmp_hdr(icmp_skb).
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        __u32 seq;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        /* Need the embedded IP header plus 8 bytes of TCP header. */
        if (icmp_skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }

        /* Look the socket up from the embedded headers: their destination
         * address/port go first, their source address/port second.
         */
        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
                        iph->saddr, th->source, inet_iif(icmp_skb));
        if (!sk) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }
        /* TIME-WAIT buckets get no error handling: drop the reference. */
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        /* Only counted here; each path below re-checks ownership itself. */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        /* Except for listeners, the quoted sequence number must lie
         * between snd_una and snd_nxt.
         */
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        /* Translate the ICMP type/code into an errno in err, or bail out
         * for messages that carry no error for the socket.
         */
        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                /* Only when the ICMP quotes the oldest unacknowledged
                 * segment and we are currently retransmitting with backoff.
                 */
                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                /* Undo one backoff step and recompute the RTO from it. */
                icsk->icsk_backoff--;
                inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
                                         icsk->icsk_backoff;
                tcp_bound_rto(sk);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                /* Time left of the new RTO since the head skb's timestamp;
                 * the min() keeps the subtraction from going below zero.
                 */
                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
                                tcp_time_stamp - TCP_SKB_CB(skb)->when);

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        /* Deliver the error according to the connection state.  States not
         * listed fall out of the switch to the generic handling below.
         */
        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                WARN_ON(req->sk);

                /* The ICMP must quote exactly the ISN we sent. */
                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can f.e. if SYNs crossed.
                             */
                /* Hard error while connecting: report it and finish the
                 * socket, unless the user holds it, then only remember it.
                 */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even this two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        /* Reported immediately only if the socket asked for it (recverr). */
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        /* Every path that took bh_lock_sock() leaves through here. */
        bh_unlock_sock(sk);
        sock_put(sk);
}
 532
 533 static void __tcp_v4_send_check(struct sk_buff *skb,
 534                                 __be32 saddr, __be32 daddr)
 535 {
 536         struct tcphdr *th = tcp_hdr(skb);
 537
 538         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 539                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 540                 skb->csum_start = skb_transport_header(skb) - skb->head;
 541                 skb->csum_offset = offsetof(struct tcphdr, check);
 542         } else {
 543                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 544                                          csum_partial(th,
 545                                                       th->doff << 2,
 546                                                       skb->csum));
 547         }
 548 }
 549
 550 /* This routine computes an IPv4 TCP checksum. */
 551 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 552 {
 553         struct inet_sock *inet = inet_sk(sk);
 554
 555         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 556 }
 557 EXPORT_SYMBOL(tcp_v4_send_check);
 558
 559 int tcp_v4_gso_send_check(struct sk_buff *skb)
 560 {
 561         const struct iphdr *iph;
 562         struct tcphdr *th;
 563
 564         if (!pskb_may_pull(skb, sizeof(*th)))
 565                 return -EINVAL;
 566
 567         iph = ip_hdr(skb);
 568         th = tcp_hdr(skb);
 569
 570         th->check = 0;
 571         skb->ip_summed = CHECKSUM_PARTIAL;
 572         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 573         return 0;
 574 }
 575
 576 /*
 577  *      This routine will send an RST to the other tcp.
 578  *
 579  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 580  *                    for reset.
 581  *      Answer: if a packet caused RST, it is not for a socket
 582  *              existing in our system, if it is matched to a socket,
 583  *              it is just duplicate segment or bug in other side's TCP.
 584  *              So that we build reply only basing on parameters
 585  *              arrived with segment.
 586  *      Exception: precedence violation. We do not implement it in any case.
 587  */
 588
 589 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 590 {
 591         struct tcphdr *th = tcp_hdr(skb);
 592         struct {
 593                 struct tcphdr th;
 594 #ifdef CONFIG_TCP_MD5SIG
 595                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 596 #endif
 597         } rep;
 598         struct ip_reply_arg arg;
 599 #ifdef CONFIG_TCP_MD5SIG
 600         struct tcp_md5sig_key *key;
 601 #endif
 602         struct net *net;
 603
 604         /* Never send a reset in response to a reset. */
 605         if (th->rst)
 606                 return;
 607
 608         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 609                 return;
 610
 611         /* Swap the send and the receive. */
 612         memset(&rep, 0, sizeof(rep));
 613         rep.th.dest   = th->source;
 614         rep.th.source = th->dest;
 615         rep.th.doff   = sizeof(struct tcphdr) / 4;
 616         rep.th.rst    = 1;
 617
 618         if (th->ack) {
 619                 rep.th.seq = th->ack_seq;
 620         } else {
 621                 rep.th.ack = 1;
 622                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 623                                        skb->len - (th->doff << 2));
 624         }
 625
 626         memset(&arg, 0, sizeof(arg));
 627         arg.iov[0].iov_base = (unsigned char *)&rep;
 628         arg.iov[0].iov_len  = sizeof(rep.th);
 629
 630 #ifdef CONFIG_TCP_MD5SIG
 631         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 632         if (key) {
 633                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 634                                    (TCPOPT_NOP << 16) |
 635                                    (TCPOPT_MD5SIG << 8) |
 636                                    TCPOLEN_MD5SIG);
 637                 /* Update length and the length the header thinks exists */
 638                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 639                 rep.th.doff = arg.iov[0].iov_len / 4;
 640
 641                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 642                                      key, ip_hdr(skb)->saddr,
 643                                      ip_hdr(skb)->daddr, &rep.th);
 644         }
 645 #endif
 646         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 647                                       ip_hdr(skb)->saddr, /* XXX */
 648                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 649         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 650         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 651
 652         net = dev_net(skb_dst(skb)->dev);
 653         ip_send_reply(net->ipv4.tcp_sock, skb,
 654                       &arg, arg.iov[0].iov_len);
 655
 656         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 657         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 658 }
 659
 660 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 661    outside socket context is ugly, certainly. What can I do?
 662  */
 663
 664 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 665                             u32 win, u32 ts, int oif,
 666                             struct tcp_md5sig_key *key,
 667                             int reply_flags)
 668 {
 669         struct tcphdr *th = tcp_hdr(skb);
 670         struct {
 671                 struct tcphdr th;
 672                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 673 #ifdef CONFIG_TCP_MD5SIG
 674                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 675 #endif
 676                         ];
 677         } rep;
 678         struct ip_reply_arg arg;
 679         struct net *net = dev_net(skb_dst(skb)->dev);
 680
 681         memset(&rep.th, 0, sizeof(struct tcphdr));
 682         memset(&arg, 0, sizeof(arg));
 683
 684         arg.iov[0].iov_base = (unsigned char *)&rep;
 685         arg.iov[0].iov_len  = sizeof(rep.th);
 686         if (ts) {
 687                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 688                                    (TCPOPT_TIMESTAMP << 8) |
 689                                    TCPOLEN_TIMESTAMP);
 690                 rep.opt[1] = htonl(tcp_time_stamp);
 691                 rep.opt[2] = htonl(ts);
 692                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 693         }
 694
 695         /* Swap the send and the receive. */
 696         rep.th.dest    = th->source;
 697         rep.th.source  = th->dest;
 698         rep.th.doff    = arg.iov[0].iov_len / 4;
 699         rep.th.seq     = htonl(seq);
 700         rep.th.ack_seq = htonl(ack);
 701         rep.th.ack     = 1;
 702         rep.th.window  = htons(win);
 703
 704 #ifdef CONFIG_TCP_MD5SIG
 705         if (key) {
 706                 int offset = (ts) ? 3 : 0;
 707
 708                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 709                                           (TCPOPT_NOP << 16) |
 710                                           (TCPOPT_MD5SIG << 8) |
 711                                           TCPOLEN_MD5SIG);
 712                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 713                 rep.th.doff = arg.iov[0].iov_len/4;
 714
 715                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 716                                     key, ip_hdr(skb)->saddr,
 717                                     ip_hdr(skb)->daddr, &rep.th);
 718         }
 719 #endif
 720         arg.flags = reply_flags;
 721         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 722                                       ip_hdr(skb)->saddr, /* XXX */
 723                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 724         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 725         if (oif)
 726                 arg.bound_dev_if = oif;
 727
 728         ip_send_reply(net->ipv4.tcp_sock, skb,
 729                       &arg, arg.iov[0].iov_len);
 730
 731         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 732 }
 733
 734 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 735 {
 736         struct inet_timewait_sock *tw = inet_twsk(sk);
 737         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 738
 739         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 740                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 741                         tcptw->tw_ts_recent,
 742                         tw->tw_bound_dev_if,
 743                         tcp_twsk_md5_key(tcptw),
 744                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 745                         );
 746
 747         inet_twsk_put(tw);
 748 }
 749
 750 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 751                                   struct request_sock *req)
 752 {
 753         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 754                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 755                         req->ts_recent,
 756                         0,
 757                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 758                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 759 }
 760
 761 /*
 762  *      Send a SYN-ACK after having received a SYN.
 763  *      This still operates on a request_sock only, not on a big
 764  *      socket.
 765  */
 766 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 767                               struct request_sock *req,
 768                               struct request_values *rvp)
 769 {
 770         const struct inet_request_sock *ireq = inet_rsk(req);
 771         int err = -1;
 772         struct sk_buff * skb;
 773
 774         /* First, grab a route. */
 775         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 776                 return -1;
 777
 778         skb = tcp_make_synack(sk, dst, req, rvp);
 779
 780         if (skb) {
 781                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 782
 783                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 784                                             ireq->rmt_addr,
 785                                             ireq->opt);
 786                 err = net_xmit_eval(err);
 787         }
 788
 789         dst_release(dst);
 790         return err;
 791 }
 792
 793 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 794                               struct request_values *rvp)
 795 {
 796         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 797         return tcp_v4_send_synack(sk, NULL, req, rvp);
 798 }
 799
 800 /*
 801  *      IPv4 request_sock destructor.
 802  */
 803 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 804 {
 805         kfree(inet_rsk(req)->opt);
 806 }
 807
 808 static void syn_flood_warning(const struct sk_buff *skb)
 809 {
 810         const char *msg;
 811
 812 #ifdef CONFIG_SYN_COOKIES
 813         if (sysctl_tcp_syncookies)
 814                 msg = "Sending cookies";
 815         else
 816 #endif
 817                 msg = "Dropping request";
 818
 819         pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
 820                                 ntohs(tcp_hdr(skb)->dest), msg);
 821 }
 822
 823 /*
 824  * Save and compile IPv4 options into the request_sock if needed.
 825  */
 826 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 827                                                   struct sk_buff *skb)
 828 {
 829         const struct ip_options *opt = &(IPCB(skb)->opt);
 830         struct ip_options_rcu *dopt = NULL;
 831
 832         if (opt && opt->optlen) {
 833                 int opt_size = sizeof(*dopt) + opt->optlen;
 834
 835                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 836                 if (dopt) {
 837                         if (ip_options_echo(&dopt->opt, skb)) {
 838                                 kfree(dopt);
 839                                 dopt = NULL;
 840                         }
 841                 }
 842         }
 843         return dopt;
 844 }
 845
 846 #ifdef CONFIG_TCP_MD5SIG
 847 /*
 848  * RFC2385 MD5 checksumming requires a mapping of
 849  * IP address->MD5 Key.
 850  * We need to maintain these in the sk structure.
 851  */
 852
 853 /* Find the Key structure for an address.  */
 854 static struct tcp_md5sig_key *
 855                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 856 {
 857         struct tcp_sock *tp = tcp_sk(sk);
 858         int i;
 859
 860         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 861                 return NULL;
 862         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 863                 if (tp->md5sig_info->keys4[i].addr == addr)
 864                         return &tp->md5sig_info->keys4[i].base;
 865         }
 866         return NULL;
 867 }
 868
 869 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 870                                          struct sock *addr_sk)
 871 {
 872         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 873 }
 874 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 875
 876 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 877                                                       struct request_sock *req)
 878 {
 879         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 880 }
 881
 882 /* This can be called on a newly created socket, from other files */
 883 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 884                       u8 *newkey, u8 newkeylen)
 885 {
 886         /* Add Key to the list */
 887         struct tcp_md5sig_key *key;
 888         struct tcp_sock *tp = tcp_sk(sk);
 889         struct tcp4_md5sig_key *keys;
 890
 891         key = tcp_v4_md5_do_lookup(sk, addr);
 892         if (key) {
 893                 /* Pre-existing entry - just update that one. */
 894                 kfree(key->key);
 895                 key->key = newkey;
 896                 key->keylen = newkeylen;
 897         } else {
 898                 struct tcp_md5sig_info *md5sig;
 899
 900                 if (!tp->md5sig_info) {
 901                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 902                                                   GFP_ATOMIC);
 903                         if (!tp->md5sig_info) {
 904                                 kfree(newkey);
 905                                 return -ENOMEM;
 906                         }
 907                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 908                 }
 909                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 910                         kfree(newkey);
 911                         return -ENOMEM;
 912                 }
 913                 md5sig = tp->md5sig_info;
 914
 915                 if (md5sig->alloced4 == md5sig->entries4) {
 916                         keys = kmalloc((sizeof(*keys) *
 917                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 918                         if (!keys) {
 919                                 kfree(newkey);
 920                                 tcp_free_md5sig_pool();
 921                                 return -ENOMEM;
 922                         }
 923
 924                         if (md5sig->entries4)
 925                                 memcpy(keys, md5sig->keys4,
 926                                        sizeof(*keys) * md5sig->entries4);
 927
 928                         /* Free old key list, and reference new one */
 929                         kfree(md5sig->keys4);
 930                         md5sig->keys4 = keys;
 931                         md5sig->alloced4++;
 932                 }
 933                 md5sig->entries4++;
 934                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 935                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 936                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 937         }
 938         return 0;
 939 }
 940 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 941
 942 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 943                                u8 *newkey, u8 newkeylen)
 944 {
 945         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 946                                  newkey, newkeylen);
 947 }
 948
 949 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 950 {
 951         struct tcp_sock *tp = tcp_sk(sk);
 952         int i;
 953
 954         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 955                 if (tp->md5sig_info->keys4[i].addr == addr) {
 956                         /* Free the key */
 957                         kfree(tp->md5sig_info->keys4[i].base.key);
 958                         tp->md5sig_info->entries4--;
 959
 960                         if (tp->md5sig_info->entries4 == 0) {
 961                                 kfree(tp->md5sig_info->keys4);
 962                                 tp->md5sig_info->keys4 = NULL;
 963                                 tp->md5sig_info->alloced4 = 0;
 964                         } else if (tp->md5sig_info->entries4 != i) {
 965                                 /* Need to do some manipulation */
 966                                 memmove(&tp->md5sig_info->keys4[i],
 967                                         &tp->md5sig_info->keys4[i+1],
 968                                         (tp->md5sig_info->entries4 - i) *
 969                                          sizeof(struct tcp4_md5sig_key));
 970                         }
 971                         tcp_free_md5sig_pool();
 972                         return 0;
 973                 }
 974         }
 975         return -ENOENT;
 976 }
 977 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 978
 979 static void tcp_v4_clear_md5_list(struct sock *sk)
 980 {
 981         struct tcp_sock *tp = tcp_sk(sk);
 982
 983         /* Free each key, then the set of key keys,
 984          * the crypto element, and then decrement our
 985          * hold on the last resort crypto.
 986          */
 987         if (tp->md5sig_info->entries4) {
 988                 int i;
 989                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 990                         kfree(tp->md5sig_info->keys4[i].base.key);
 991                 tp->md5sig_info->entries4 = 0;
 992                 tcp_free_md5sig_pool();
 993         }
 994         if (tp->md5sig_info->keys4) {
 995                 kfree(tp->md5sig_info->keys4);
 996                 tp->md5sig_info->keys4 = NULL;
 997                 tp->md5sig_info->alloced4  = 0;
 998         }
 999 }
1000
1001 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1002                                  int optlen)
1003 {
1004         struct tcp_md5sig cmd;
1005         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1006         u8 *newkey;
1007
1008         if (optlen < sizeof(cmd))
1009                 return -EINVAL;
1010
1011         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1012                 return -EFAULT;
1013
1014         if (sin->sin_family != AF_INET)
1015                 return -EINVAL;
1016
1017         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1018                 if (!tcp_sk(sk)->md5sig_info)
1019                         return -ENOENT;
1020                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1021         }
1022
1023         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1024                 return -EINVAL;
1025
1026         if (!tcp_sk(sk)->md5sig_info) {
1027                 struct tcp_sock *tp = tcp_sk(sk);
1028                 struct tcp_md5sig_info *p;
1029
1030                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1031                 if (!p)
1032                         return -EINVAL;
1033
1034                 tp->md5sig_info = p;
1035                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1036         }
1037
1038         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1039         if (!newkey)
1040                 return -ENOMEM;
1041         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1042                                  newkey, cmd.tcpm_keylen);
1043 }
1044
1045 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1046                                         __be32 daddr, __be32 saddr, int nbytes)
1047 {
1048         struct tcp4_pseudohdr *bp;
1049         struct scatterlist sg;
1050
1051         bp = &hp->md5_blk.ip4;
1052
1053         /*
1054          * 1. the TCP pseudo-header (in the order: source IP address,
1055          * destination IP address, zero-padded protocol number, and
1056          * segment length)
1057          */
1058         bp->saddr = saddr;
1059         bp->daddr = daddr;
1060         bp->pad = 0;
1061         bp->protocol = IPPROTO_TCP;
1062         bp->len = cpu_to_be16(nbytes);
1063
1064         sg_init_one(&sg, bp, sizeof(*bp));
1065         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1066 }
1067
1068 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1069                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1070 {
1071         struct tcp_md5sig_pool *hp;
1072         struct hash_desc *desc;
1073
1074         hp = tcp_get_md5sig_pool();
1075         if (!hp)
1076                 goto clear_hash_noput;
1077         desc = &hp->md5_desc;
1078
1079         if (crypto_hash_init(desc))
1080                 goto clear_hash;
1081         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1082                 goto clear_hash;
1083         if (tcp_md5_hash_header(hp, th))
1084                 goto clear_hash;
1085         if (tcp_md5_hash_key(hp, key))
1086                 goto clear_hash;
1087         if (crypto_hash_final(desc, md5_hash))
1088                 goto clear_hash;
1089
1090         tcp_put_md5sig_pool();
1091         return 0;
1092
1093 clear_hash:
1094         tcp_put_md5sig_pool();
1095 clear_hash_noput:
1096         memset(md5_hash, 0, 16);
1097         return 1;
1098 }
1099
1100 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1101                         struct sock *sk, struct request_sock *req,
1102                         struct sk_buff *skb)
1103 {
1104         struct tcp_md5sig_pool *hp;
1105         struct hash_desc *desc;
1106         struct tcphdr *th = tcp_hdr(skb);
1107         __be32 saddr, daddr;
1108
1109         if (sk) {
1110                 saddr = inet_sk(sk)->inet_saddr;
1111                 daddr = inet_sk(sk)->inet_daddr;
1112         } else if (req) {
1113                 saddr = inet_rsk(req)->loc_addr;
1114                 daddr = inet_rsk(req)->rmt_addr;
1115         } else {
1116                 const struct iphdr *iph = ip_hdr(skb);
1117                 saddr = iph->saddr;
1118                 daddr = iph->daddr;
1119         }
1120
1121         hp = tcp_get_md5sig_pool();
1122         if (!hp)
1123                 goto clear_hash_noput;
1124         desc = &hp->md5_desc;
1125
1126         if (crypto_hash_init(desc))
1127                 goto clear_hash;
1128
1129         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1130                 goto clear_hash;
1131         if (tcp_md5_hash_header(hp, th))
1132                 goto clear_hash;
1133         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1134                 goto clear_hash;
1135         if (tcp_md5_hash_key(hp, key))
1136                 goto clear_hash;
1137         if (crypto_hash_final(desc, md5_hash))
1138                 goto clear_hash;
1139
1140         tcp_put_md5sig_pool();
1141         return 0;
1142
1143 clear_hash:
1144         tcp_put_md5sig_pool();
1145 clear_hash_noput:
1146         memset(md5_hash, 0, 16);
1147         return 1;
1148 }
1149 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1150
1151 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1152 {
1153         /*
1154          * This gets called for each TCP segment that arrives
1155          * so we want to be efficient.
1156          * We have 3 drop cases:
1157          * o No MD5 hash and one expected.
1158          * o MD5 hash and we're not expecting one.
1159          * o MD5 hash and its wrong.
1160          */
1161         __u8 *hash_location = NULL;
1162         struct tcp_md5sig_key *hash_expected;
1163         const struct iphdr *iph = ip_hdr(skb);
1164         struct tcphdr *th = tcp_hdr(skb);
1165         int genhash;
1166         unsigned char newhash[16];
1167
1168         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1169         hash_location = tcp_parse_md5sig_option(th);
1170
1171         /* We've parsed the options - do we have a hash? */
1172         if (!hash_expected && !hash_location)
1173                 return 0;
1174
1175         if (hash_expected && !hash_location) {
1176                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1177                 return 1;
1178         }
1179
1180         if (!hash_expected && hash_location) {
1181                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1182                 return 1;
1183         }
1184
1185         /* Okay, so this is hash_expected and hash_location -
1186          * so we need to calculate the checksum.
1187          */
1188         genhash = tcp_v4_md5_hash_skb(newhash,
1189                                       hash_expected,
1190                                       NULL, NULL, skb);
1191
1192         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1193                 if (net_ratelimit()) {
1194                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1195                                &iph->saddr, ntohs(th->source),
1196                                &iph->daddr, ntohs(th->dest),
1197                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1198                 }
1199                 return 1;
1200         }
1201         return 0;
1202 }
1203
1204 #endif
1205
/*
 * Request-sock operations for IPv4 TCP connection requests.  This table is
 * handed to inet_reqsk_alloc() when tcp_v4_conn_request() allocates a
 * request, and wires up the SYN-ACK retransmit, ACK, reset, destructor and
 * SYN-ACK timeout handlers.
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};
1215
1216 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5 signature hooks for connection requests; installed as
 * tcp_rsk(req)->af_specific in tcp_v4_conn_request().
 */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
};
1221 #endif
1222
/*
 * Process an incoming connection request (SYN) on socket @sk.
 *
 * Allocates a request_sock, parses the TCP options of @skb, records the
 * addresses and saved IP options in the request, picks an initial sequence
 * number and sends a SYN-ACK.  When the SYN-ACK was sent successfully and
 * no syncookie was used, the request is hashed into the request queue with
 * TCP_TIMEOUT_INIT; otherwise the request is freed again.
 *
 * Always returns 0, whether the request was queued or dropped.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_extend_values tmp_ext;
        struct tcp_options_received tmp_opt;
        u8 *hash_location;
        struct request_sock *req;
        struct inet_request_sock *ireq;
        struct tcp_sock *tp = tcp_sk(sk);
        struct dst_entry *dst = NULL;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        /* Non-zero when the caller already chose an ISN (see the checks on !isn below). */
        __u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
                if (net_ratelimit())
                        syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        /* Parse the options of the SYN into a scratch copy. */
        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
        tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

        /*
         * Cookie option handling: taken only when the SYN carried a cookie
         * option together with a timestamp and cookies are enabled either
         * globally (sysctl_tcp_cookie_size) or on this socket.
         */
        if (tmp_opt.cookie_plus > 0 &&
            tmp_opt.saw_tstamp &&
            !tp->rx_opt.cookie_out_never &&
            (sysctl_tcp_cookie_size > 0 ||
             (tp->cookie_values != NULL &&
              tp->cookie_values->cookie_desired > 0))) {
                u8 *c;
                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
                        goto drop_and_release;

                /* Secret recipe starts with IP addresses */
                *mess++ ^= (__force u32)daddr;
                *mess++ ^= (__force u32)saddr;

                /* plus variable length Initiator Cookie */
                c = (u8 *)mess;
                while (l-- > 0)
                        *c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
                want_cookie = 0;        /* not our kind of cookie */
#endif
                tmp_ext.cookie_out_never = 0; /* false */
                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
        } else if (!tp->rx_opt.cookie_in_always) {
                /* redundant indications, but ensure initialization. */
                tmp_ext.cookie_out_never = 1; /* true */
                tmp_ext.cookie_plus = 0;
        } else {
                /* Socket insists on cookies but none usable was received. */
                goto drop_and_release;
        }
        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

        /* With syncookies and no timestamp, discard the parsed options. */
        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        if (!want_cookie || tmp_opt.tstamp_ok)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
                req->cookie_ts = tmp_opt.tstamp_ok;
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->daddr.addr.a4 == saddr) {
                        inet_peer_refcheck(peer);
                        if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
                         * It means that we continue to communicate
                         * to destinations, already remembered
                         * to the moment of synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
                                       &saddr, ntohs(tcp_hdr(skb)->source));
                        goto drop_and_release;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        /*
         * tcp_v4_send_synack() releases dst itself, so both outcomes below
         * skip drop_and_release.  A syncookie request is never queued: it
         * is freed right after the SYN-ACK has been sent.
         */
        if (tcp_v4_send_synack(sk, dst, req,
                               (struct request_values *)&tmp_ext) ||
            want_cookie)
                goto drop_and_free;

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

drop_and_release:
        dst_release(dst);
drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}
1400 EXPORT_SYMBOL(tcp_v4_conn_request);
1401
1402
1403 /*
1404  * The three way handshake has completed - we got a valid synack -
1405  * now create the new socket.
1406  */
1407 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1408                                   struct request_sock *req,
1409                                   struct dst_entry *dst)
1410 {
1411         struct inet_request_sock *ireq;
1412         struct inet_sock *newinet;
1413         struct tcp_sock *newtp;
1414         struct sock *newsk;
1415 #ifdef CONFIG_TCP_MD5SIG
1416         struct tcp_md5sig_key *key;
1417 #endif
1418         struct ip_options_rcu *inet_opt;
1419
1420         if (sk_acceptq_is_full(sk))
1421                 goto exit_overflow;
1422
1423         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1424                 goto exit;
1425
1426         newsk = tcp_create_openreq_child(sk, req, skb);
1427         if (!newsk)
1428                 goto exit_nonewsk;
1429
1430         newsk->sk_gso_type = SKB_GSO_TCPV4;
1431         sk_setup_caps(newsk, dst);
1432
1433         newtp                 = tcp_sk(newsk);
1434         newinet               = inet_sk(newsk);
1435         ireq                  = inet_rsk(req);
1436         newinet->inet_daddr   = ireq->rmt_addr;
1437         newinet->inet_rcv_saddr = ireq->loc_addr;
1438         newinet->inet_saddr           = ireq->loc_addr;
1439         inet_opt              = ireq->opt;
1440         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1441         ireq->opt             = NULL;
1442         newinet->mc_index     = inet_iif(skb);
1443         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1444         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1445         if (inet_opt)
1446                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1447         newinet->inet_id = newtp->write_seq ^ jiffies;
1448
1449         tcp_mtup_init(newsk);
1450         tcp_sync_mss(newsk, dst_mtu(dst));
1451         newtp->advmss = dst_metric_advmss(dst);
1452         if (tcp_sk(sk)->rx_opt.user_mss &&
1453             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1454                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1455
1456         tcp_initialize_rcv_mss(newsk);
1457
1458 #ifdef CONFIG_TCP_MD5SIG
1459         /* Copy over the MD5 key from the original socket */
1460         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1461         if (key != NULL) {
1462                 /*
1463                  * We're using one, so create a matching key
1464                  * on the newsk structure. If we fail to get
1465                  * memory, then we end up not copying the key
1466                  * across. Shucks.
1467                  */
1468                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1469                 if (newkey != NULL)
1470                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1471                                           newkey, key->keylen);
1472                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1473         }
1474 #endif
1475
1476         if (__inet_inherit_port(sk, newsk) < 0) {
1477                 sock_put(newsk);
1478                 goto exit;
1479         }
1480         __inet_hash_nolisten(newsk, NULL);
1481
1482         return newsk;
1483
1484 exit_overflow:
1485         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1486 exit_nonewsk:
1487         dst_release(dst);
1488 exit:
1489         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1490         return NULL;
1491 }
1492 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1493
1494 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1495 {
1496         struct tcphdr *th = tcp_hdr(skb);
1497         const struct iphdr *iph = ip_hdr(skb);
1498         struct sock *nsk;
1499         struct request_sock **prev;
1500         /* Find possible connection requests. */
1501         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1502                                                        iph->saddr, iph->daddr);
1503         if (req)
1504                 return tcp_check_req(sk, skb, req, prev);
1505
1506         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1507                         th->source, iph->daddr, th->dest, inet_iif(skb));
1508
1509         if (nsk) {
1510                 if (nsk->sk_state != TCP_TIME_WAIT) {
1511                         bh_lock_sock(nsk);
1512                         return nsk;
1513                 }
1514                 inet_twsk_put(inet_twsk(nsk));
1515                 return NULL;
1516         }
1517
1518 #ifdef CONFIG_SYN_COOKIES
1519         if (!th->syn)
1520                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1521 #endif
1522         return sk;
1523 }
1524
1525 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1526 {
1527         const struct iphdr *iph = ip_hdr(skb);
1528
1529         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1530                 if (!tcp_v4_check(skb->len, iph->saddr,
1531                                   iph->daddr, skb->csum)) {
1532                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1533                         return 0;
1534                 }
1535         }
1536
1537         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1538                                        skb->len, IPPROTO_TCP, 0);
1539
1540         if (skb->len <= 76) {
1541                 return __skb_checksum_complete(skb);
1542         }
1543         return 0;
1544 }
1545
1546
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Processes one segment @skb for socket @sk according to the socket state.
 * Always returns 0; on the reset/discard/checksum-error paths the skb is
 * freed here.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        /* Socket on whose behalf a reset is sent when processing fails. */
        struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
        /*
         * We really want to reject the packet as early as possible
         * if:
         *  o We're expecting an MD5'd packet and this is no MD5 tcp option
         *  o There is an MD5 option and we're not expecting one
         */
        if (tcp_v4_inbound_md5_hash(sk, skb))
                goto discard;
#endif

        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                sock_rps_save_rxhash(sk, skb->rxhash);
                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
                        rsk = sk;
                        goto reset;
                }
                return 0;
        }

        /* Slow path: validate header length and checksum first. */
        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                /* May yield the listener itself, another socket, or NULL. */
                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
                if (!nsk)
                        goto discard;

                if (nsk != sk) {
                        /* Segment belongs to a different socket than the listener. */
                        if (tcp_child_process(sk, nsk, skb)) {
                                rsk = nsk;
                                goto reset;
                        }
                        return 0;
                }
        } else
                sock_rps_save_rxhash(sk, skb->rxhash);

        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
                rsk = sk;
                goto reset;
        }
        return 0;

reset:
        tcp_v4_send_reset(rsk, skb);
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
        goto discard;
}
1617 EXPORT_SYMBOL(tcp_v4_do_rcv);
1618
1619 /*
1620  *      From tcp_input.c
      *
      *      tcp_v4_rcv() validates an incoming IPv4 TCP segment, fills in the
      *      per-skb control block, looks up the owning socket and hands the
      *      segment to it (directly, through the prequeue, or through the
      *      socket backlog when the socket is owned by user context).
      *      Segments with no matching socket are handled at no_tcp_socket and
      *      segments that hit a TIME_WAIT socket at do_time_wait.
      *
      *      Returns the value of tcp_v4_do_rcv() when that is called and 0
      *      otherwise.  Every discard path frees the skb.
1621  */
1622
1623 int tcp_v4_rcv(struct sk_buff *skb)
1624 {
1625         const struct iphdr *iph;
1626         struct tcphdr *th;
1627         struct sock *sk;
1628         int ret;
1629         struct net *net = dev_net(skb->dev);
1630
1631         if (skb->pkt_type != PACKET_HOST)
1632                 goto discard_it;
1633
1634         /* Count it even if it's bad */
1635         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1636
1637         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1638                 goto discard_it;
1639
1640         th = tcp_hdr(skb);
1641
             /* doff counts 32-bit words; it must cover at least the fixed header. */
1642         if (th->doff < sizeof(struct tcphdr) / 4)
1643                 goto bad_packet;
1644         if (!pskb_may_pull(skb, th->doff * 4))
1645                 goto discard_it;
1646
1647         /* An explanation is required here, I think.
1648          * Packet length and doff are validated by header prediction,
1649          * provided case of th->doff==0 is eliminated.
1650          * So, we defer the checks. */
1651         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1652                 goto bad_packet;
1653
             /* NOTE(review): th is re-read here, presumably because the pulls
              * above may have moved the header data - confirm. */
1654         th = tcp_hdr(skb);
1655         iph = ip_hdr(skb);
             /* Cache header fields in host byte order in the skb control block.
              * end_seq adds one for SYN and one for FIN on top of the payload
              * length (skb->len minus the TCP header length). */
1656         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1657         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1658                                     skb->len - th->doff * 4);
1659         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1660         TCP_SKB_CB(skb)->when    = 0;
1661         TCP_SKB_CB(skb)->flags   = iph->tos;
1662         TCP_SKB_CB(skb)->sacked  = 0;
1663
1664         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1665         if (!sk)
1666                 goto no_tcp_socket;
1667
             /* Also re-entered from do_time_wait with sk replaced by a listener. */
1668 process:
1669         if (sk->sk_state == TCP_TIME_WAIT)
1670                 goto do_time_wait;
1671
             /* Drop segments whose TTL is below the socket's configured minimum. */
1672         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1673                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1674                 goto discard_and_relse;
1675         }
1676
1677         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1678                 goto discard_and_relse;
1679         nf_reset(skb);
1680
1681         if (sk_filter(sk, skb))
1682                 goto discard_and_relse;
1683
1684         skb->dev = NULL;
1685
1686         bh_lock_sock_nested(sk);
1687         ret = 0;
             /* Socket not owned by user context: process the segment now (or
              * queue it on the prequeue).  Otherwise defer it to the backlog. */
1688         if (!sock_owned_by_user(sk)) {
1689 #ifdef CONFIG_NET_DMA
1690                 struct tcp_sock *tp = tcp_sk(sk);
1691                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1692                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1693                 if (tp->ucopy.dma_chan)
1694                         ret = tcp_v4_do_rcv(sk, skb);
1695                 else
1696 #endif
1697                 {
1698                         if (!tcp_prequeue(sk, skb))
1699                                 ret = tcp_v4_do_rcv(sk, skb);
1700                 }
1701         } else if (unlikely(sk_add_backlog(sk, skb))) {
                     /* Backlog refused the skb: unlock, account the drop, free it. */
1702                 bh_unlock_sock(sk);
1703                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1704                 goto discard_and_relse;
1705         }
1706         bh_unlock_sock(sk);
1707
             /* Drop the socket reference (presumably taken by the lookup above). */
1708         sock_put(sk);
1709
1710         return ret;
1711
1712 no_tcp_socket:
1713         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1714                 goto discard_it;
1715
             /* A reset is only sent for segments that are long enough and pass
              * the checksum; anything else is just counted as an input error.
              * bad_packet is also the target of the early header checks. */
1716         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1717 bad_packet:
1718                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1719         } else {
1720                 tcp_v4_send_reset(NULL, skb);
1721         }
1722
1723 discard_it:
1724         /* Discard frame. */
1725         kfree_skb(skb);
1726         return 0;
1727
1728 discard_and_relse:
1729         sock_put(sk);
1730         goto discard_it;
1731
1732 do_time_wait:
             /* sk is a TIME_WAIT socket here, so it is released with
              * inet_twsk_put() instead of sock_put(). */
1733         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1734                 inet_twsk_put(inet_twsk(sk));
1735                 goto discard_it;
1736         }
1737
1738         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1739                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1740                 inet_twsk_put(inet_twsk(sk));
1741                 goto discard_it;
1742         }
1743         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1744         case TCP_TW_SYN: {
1745                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1746                                                         &tcp_hashinfo,
1747                                                         iph->daddr, th->dest,
1748                                                         inet_iif(skb));
                     /* A listener exists for this SYN: retire the TIME_WAIT
                      * socket and process the segment against the listener. */
1749                 if (sk2) {
1750                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1751                         inet_twsk_put(inet_twsk(sk));
1752                         sk = sk2;
1753                         goto process;
1754                 }
1755                 /* Fall through to ACK */
1756         }
1757         case TCP_TW_ACK:
1758                 tcp_v4_timewait_ack(sk, skb);
1759                 break;
1760         case TCP_TW_RST:
1761                 goto no_tcp_socket;
1762         case TCP_TW_SUCCESS:;
1763         }
1764         goto discard_it;
1765 }
1766
1767 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1768 {
1769         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1770         struct inet_sock *inet = inet_sk(sk);
1771         struct inet_peer *peer;
1772
1773         if (!rt || rt->rt_dst != inet->inet_daddr) {
1774                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1775                 *release_it = true;
1776         } else {
1777                 if (!rt->peer)
1778                         rt_bind_peer(rt, 1);
1779                 peer = rt->peer;
1780                 *release_it = false;
1781         }
1782
1783         return peer;
1784 }
1785 EXPORT_SYMBOL(tcp_v4_get_peer);
1786
1787 void *tcp_v4_tw_get_peer(struct sock *sk)
1788 {
1789         struct inet_timewait_sock *tw = inet_twsk(sk);
1790
1791         return inet_getpeer_v4(tw->tw_daddr, 1);
1792 }
1793 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1794
1795 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1796         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1797         .twsk_unique    = tcp_twsk_unique,
1798         .twsk_destructor= tcp_twsk_destructor,
1799         .twsk_getpeer   = tcp_v4_tw_get_peer,
1800 };
1801
1802 const struct inet_connection_sock_af_ops ipv4_specific = {
1803         .queue_xmit        = ip_queue_xmit,
1804         .send_check        = tcp_v4_send_check,
1805         .rebuild_header    = inet_sk_rebuild_header,
1806         .conn_request      = tcp_v4_conn_request,
1807         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1808         .get_peer          = tcp_v4_get_peer,
1809         .net_header_len    = sizeof(struct iphdr),
1810         .setsockopt        = ip_setsockopt,
1811         .getsockopt        = ip_getsockopt,
1812         .addr2sockaddr     = inet_csk_addr2sockaddr,
1813         .sockaddr_len      = sizeof(struct sockaddr_in),
1814         .bind_conflict     = inet_csk_bind_conflict,
1815 #ifdef CONFIG_COMPAT
1816         .compat_setsockopt = compat_ip_setsockopt,
1817         .compat_getsockopt = compat_ip_getsockopt,
1818 #endif
1819 };
1820 EXPORT_SYMBOL(ipv4_specific);
1821
1822 #ifdef CONFIG_TCP_MD5SIG
1823 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1824         .md5_lookup             = tcp_v4_md5_lookup,
1825         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1826         .md5_add                = tcp_v4_md5_add_func,
1827         .md5_parse              = tcp_v4_parse_md5_keys,
1828 };
1829 #endif
1830
1831 /* NOTE: A lot of things set to zero explicitly by call to
1832  *       sk_alloc() so need not be done here.
1833  */
1834 static int tcp_v4_init_sock(struct sock *sk)
1835 {
1836         struct inet_connection_sock *icsk = inet_csk(sk);
1837         struct tcp_sock *tp = tcp_sk(sk);
1838
1839         skb_queue_head_init(&tp->out_of_order_queue);
1840         tcp_init_xmit_timers(sk);
1841         tcp_prequeue_init(tp);
1842
1843         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1844         tp->mdev = TCP_TIMEOUT_INIT;
1845
1846         /* So many TCP implementations out there (incorrectly) count the
1847          * initial SYN frame in their delayed-ACK and congestion control
1848          * algorithms that we must have the following bandaid to talk
1849          * efficiently to them.  -DaveM
1850          */
1851         tp->snd_cwnd = 2;
1852
1853         /* See draft-stevens-tcpca-spec-01 for discussion of the
1854          * initialization of these values.
1855          */
1856         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1857         tp->snd_cwnd_clamp = ~0;
1858         tp->mss_cache = TCP_MSS_DEFAULT;
1859
1860         tp->reordering = sysctl_tcp_reordering;
1861         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1862
1863         sk->sk_state = TCP_CLOSE;
1864
1865         sk->sk_write_space = sk_stream_write_space;
1866         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1867
1868         icsk->icsk_af_ops = &ipv4_specific;
1869         icsk->icsk_sync_mss = tcp_sync_mss;
1870 #ifdef CONFIG_TCP_MD5SIG
1871         tp->af_specific = &tcp_sock_ipv4_specific;
1872 #endif
1873
1874         /* TCP Cookie Transactions */
1875         if (sysctl_tcp_cookie_size > 0) {
1876                 /* Default, cookies without s_data_payload. */
1877                 tp->cookie_values =
1878                         kzalloc(sizeof(*tp->cookie_values),
1879                                 sk->sk_allocation);
1880                 if (tp->cookie_values != NULL)
1881                         kref_init(&tp->cookie_values->kref);
1882         }
1883         /* Presumed zeroed, in order of appearance:
1884          *      cookie_in_always, cookie_out_never,
1885          *      s_data_constant, s_data_in, s_data_out
1886          */
1887         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1888         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1889
1890         local_bh_disable();
1891         percpu_counter_inc(&tcp_sockets_allocated);
1892         local_bh_enable();
1893
1894         return 0;
1895 }
1896
1897 void tcp_v4_destroy_sock(struct sock *sk)
1898 {
1899         struct tcp_sock *tp = tcp_sk(sk);
1900
1901         tcp_clear_xmit_timers(sk);
1902
1903         tcp_cleanup_congestion_control(sk);
1904
1905         /* Cleanup up the write buffer. */
1906         tcp_write_queue_purge(sk);
1907
1908         /* Cleans up our, hopefully empty, out_of_order_queue. */
1909         __skb_queue_purge(&tp->out_of_order_queue);
1910
1911 #ifdef CONFIG_TCP_MD5SIG
1912         /* Clean up the MD5 key list, if any */
1913         if (tp->md5sig_info) {
1914                 tcp_v4_clear_md5_list(sk);
1915                 kfree(tp->md5sig_info);
1916                 tp->md5sig_info = NULL;
1917         }
1918 #endif
1919
1920 #ifdef CONFIG_NET_DMA
1921         /* Cleans up our sk_async_wait_queue */
1922         __skb_queue_purge(&sk->sk_async_wait_queue);
1923 #endif
1924
1925         /* Clean prequeue, it must be empty really */
1926         __skb_queue_purge(&tp->ucopy.prequeue);
1927
1928         /* Clean up a referenced TCP bind bucket. */
1929         if (inet_csk(sk)->icsk_bind_hash)
1930                 inet_put_port(sk);
1931
1932         /*
1933          * If sendmsg cached page exists, toss it.
1934          */
1935         if (sk->sk_sndmsg_page) {
1936                 __free_page(sk->sk_sndmsg_page);
1937                 sk->sk_sndmsg_page = NULL;
1938         }
1939
1940         /* TCP Cookie Transactions */
1941         if (tp->cookie_values != NULL) {
1942                 kref_put(&tp->cookie_values->kref,
1943                          tcp_cookie_values_release);
1944                 tp->cookie_values = NULL;
1945         }
1946
1947         percpu_counter_dec(&tcp_sockets_allocated);
1948 }
1949 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1950
1951 #ifdef CONFIG_PROC_FS
1952 /* Proc filesystem TCP sock list dumping. */
1953
1954 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1955 {
1956         return hlist_nulls_empty(head) ? NULL :
1957                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1958 }
1959
1960 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1961 {
1962         return !is_a_nulls(tw->tw_node.next) ?
1963                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1964 }
1965
1966 /*
1967  * Get next listener socket following cur.  If cur is NULL, get first socket
1968  * starting from bucket given in st->bucket; when st->bucket is zero the
1969  * very first socket in the hash table is returned.
      *
      * While st->state is TCP_SEQ_STATE_OPENREQ, cur is a request_sock and the
      * walk continues through the syn_table of st->syn_wait_sk before going
      * back to the listener chain.
      *
      * A non-NULL result is returned with the lock of listening bucket
      * st->bucket still held (and, in the OPENREQ state, with syn_wait_lock of
      * st->syn_wait_sk read-locked as well); tcp_seq_stop() releases them.
      * NULL is returned, with no lock held, once all buckets are exhausted.
1970  */
1971 static void *listening_get_next(struct seq_file *seq, void *cur)
1972 {
1973         struct inet_connection_sock *icsk;
1974         struct hlist_nulls_node *node;
1975         struct sock *sk = cur;
1976         struct inet_listen_hashbucket *ilb;
1977         struct tcp_iter_state *st = seq->private;
1978         struct net *net = seq_file_net(seq);
1979
             /* No cursor: lock bucket st->bucket and start at its head. */
1980         if (!sk) {
1981                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1982                 spin_lock_bh(&ilb->lock);
1983                 sk = sk_nulls_head(&ilb->head);
1984                 st->offset = 0;
1985                 goto get_sk;
1986         }
1987         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1988         ++st->num;
1989         ++st->offset;
1990
1991         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1992                 struct request_sock *req = cur;
1993
                     /* Continue in the syn_table of the listener being expanded. */
1994                 icsk = inet_csk(st->syn_wait_sk);
1995                 req = req->dl_next;
1996                 while (1) {
1997                         while (req) {
1998                                 if (req->rsk_ops->family == st->family) {
1999                                         cur = req;
2000                                         goto out;
2001                                 }
2002                                 req = req->dl_next;
2003                         }
2004                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2005                                 break;
                             /* Entered from start_req as well: load the chain of slot st->sbucket. */
2006 get_req:
2007                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2008                 }
                     /* syn_table exhausted: resume the listener walk after syn_wait_sk. */
2009                 sk        = sk_nulls_next(st->syn_wait_sk);
2010                 st->state = TCP_SEQ_STATE_LISTENING;
2011                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012         } else {
                     /* cur is a listener: visit its pending requests, if any, before moving on. */
2013                 icsk = inet_csk(sk);
2014                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2015                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2016                         goto start_req;
2017                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2018                 sk = sk_nulls_next(sk);
2019         }
2020 get_sk:
2021         sk_nulls_for_each_from(sk, node) {
2022                 if (!net_eq(sock_net(sk), net))
2023                         continue;
2024                 if (sk->sk_family == st->family) {
2025                         cur = sk;
2026                         goto out;
2027                 }
                     /* Family mismatch: the listener itself is not reported, but
                      * its pending requests are still examined. */
2028                 icsk = inet_csk(sk);
2029                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2030                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
                             /* Switch to walking this listener's requests;
                              * syn_wait_lock stays read-locked meanwhile. */
2031 start_req:
2032                         st->uid         = sock_i_uid(sk);
2033                         st->syn_wait_sk = sk;
2034                         st->state       = TCP_SEQ_STATE_OPENREQ;
2035                         st->sbucket     = 0;
2036                         goto get_req;
2037                 }
2038                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2039         }
             /* Bucket exhausted: drop its lock and move on to the next one. */
2040         spin_unlock_bh(&ilb->lock);
2041         st->offset = 0;
2042         if (++st->bucket < INET_LHTABLE_SIZE) {
2043                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2044                 spin_lock_bh(&ilb->lock);
2045                 sk = sk_nulls_head(&ilb->head);
2046                 goto get_sk;
2047         }
2048         cur = NULL;
2049 out:
2050         return cur;
2051 }
2052
2053 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2054 {
2055         struct tcp_iter_state *st = seq->private;
2056         void *rc;
2057
2058         st->bucket = 0;
2059         st->offset = 0;
2060         rc = listening_get_next(seq, NULL);
2061
2062         while (rc && *pos) {
2063                 rc = listening_get_next(seq, rc);
2064                 --*pos;
2065         }
2066         return rc;
2067 }
2068
2069 static inline int empty_bucket(struct tcp_iter_state *st)
2070 {
2071         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2072                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2073 }
2074
2075 /*
2076  * Get first established socket starting from bucket given in st->bucket.
2077  * If st->bucket is zero, the very first socket in the hash is returned.
2078  */
2079 static void *established_get_first(struct seq_file *seq)
2080 {
2081         struct tcp_iter_state *st = seq->private;
2082         struct net *net = seq_file_net(seq);
2083         void *rc = NULL;
2084
2085         st->offset = 0;
2086         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2087                 struct sock *sk;
2088                 struct hlist_nulls_node *node;
2089                 struct inet_timewait_sock *tw;
2090                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2091
2092                 /* Lockless fast path for the common case of empty buckets */
2093                 if (empty_bucket(st))
2094                         continue;
2095
2096                 spin_lock_bh(lock);
2097                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2098                         if (sk->sk_family != st->family ||
2099                             !net_eq(sock_net(sk), net)) {
2100                                 continue;
2101                         }
2102                         rc = sk;
2103                         goto out;
2104                 }
2105                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2106                 inet_twsk_for_each(tw, node,
2107                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2108                         if (tw->tw_family != st->family ||
2109                             !net_eq(twsk_net(tw), net)) {
2110                                 continue;
2111                         }
2112                         rc = tw;
2113                         goto out;
2114                 }
2115                 spin_unlock_bh(lock);
2116                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2117         }
2118 out:
2119         return rc;
2120 }
2121
2122 static void *established_get_next(struct seq_file *seq, void *cur)
2123 {
             /*
              * Advance the established/TIME_WAIT walk past cur and return the
              * next entry whose family is st->family and whose namespace is the
              * seq_file's, or NULL once the last bucket has been passed.  Within
              * a bucket the socket chain is walked before the TIME_WAIT chain.
              *
              * A non-NULL result is returned with the lock of bucket st->bucket
              * still held; the NULL return happens after the last lock was
              * dropped.
              */
2124         struct sock *sk = cur;
2125         struct inet_timewait_sock *tw;
2126         struct hlist_nulls_node *node;
2127         struct tcp_iter_state *st = seq->private;
2128         struct net *net = seq_file_net(seq);
2129
2130         ++st->num;
2131         ++st->offset;
2132
2133         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2134                 tw = cur;
2135                 tw = tw_next(tw);
                     /* Also entered from below with tw set to the head of the
                      * bucket's TIME_WAIT chain. */
2136 get_tw:
2137                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2138                         tw = tw_next(tw);
2139                 }
2140                 if (tw) {
2141                         cur = tw;
2142                         goto out;
2143                 }
                     /* Both chains of this bucket are done: release its lock. */
2144                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2145                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2146
2147                 /* Look for next non empty bucket */
2148                 st->offset = 0;
2149                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2150                                 empty_bucket(st))
2151                         ;
2152                 if (st->bucket > tcp_hashinfo.ehash_mask)
2153                         return NULL;
2154
2155                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2156                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2157         } else
2158                 sk = sk_nulls_next(sk);
2159
2160         sk_nulls_for_each_from(sk, node) {
2161                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2162                         goto found;
2163         }
2164
             /* Socket chain exhausted: continue with this bucket's TIME_WAIT chain. */
2165         st->state = TCP_SEQ_STATE_TIME_WAIT;
2166         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2167         goto get_tw;
2168 found:
2169         cur = sk;
2170 out:
2171         return cur;
2172 }
2173
2174 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2175 {
2176         struct tcp_iter_state *st = seq->private;
2177         void *rc;
2178
2179         st->bucket = 0;
2180         rc = established_get_first(seq);
2181
2182         while (rc && pos) {
2183                 rc = established_get_next(seq, rc);
2184                 --pos;
2185         }
2186         return rc;
2187 }
2188
2189 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2190 {
2191         void *rc;
2192         struct tcp_iter_state *st = seq->private;
2193
2194         st->state = TCP_SEQ_STATE_LISTENING;
2195         rc        = listening_get_idx(seq, &pos);
2196
2197         if (!rc) {
2198                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2199                 rc        = established_get_idx(seq, pos);
2200         }
2201
2202         return rc;
2203 }
2204
2205 static void *tcp_seek_last_pos(struct seq_file *seq)
2206 {
2207         struct tcp_iter_state *st = seq->private;
2208         int offset = st->offset;
2209         int orig_num = st->num;
2210         void *rc = NULL;
2211
2212         switch (st->state) {
2213         case TCP_SEQ_STATE_OPENREQ:
2214         case TCP_SEQ_STATE_LISTENING:
2215                 if (st->bucket >= INET_LHTABLE_SIZE)
2216                         break;
2217                 st->state = TCP_SEQ_STATE_LISTENING;
2218                 rc = listening_get_next(seq, NULL);
2219                 while (offset-- && rc)
2220                         rc = listening_get_next(seq, rc);
2221                 if (rc)
2222                         break;
2223                 st->bucket = 0;
2224                 /* Fallthrough */
2225         case TCP_SEQ_STATE_ESTABLISHED:
2226         case TCP_SEQ_STATE_TIME_WAIT:
2227                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2228                 if (st->bucket > tcp_hashinfo.ehash_mask)
2229                         break;
2230                 rc = established_get_first(seq);
2231                 while (offset-- && rc)
2232                         rc = established_get_next(seq, rc);
2233         }
2234
2235         st->num = orig_num;
2236
2237         return rc;
2238 }
2239
2240 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2241 {
2242         struct tcp_iter_state *st = seq->private;
2243         void *rc;
2244
2245         if (*pos && *pos == st->last_pos) {
2246                 rc = tcp_seek_last_pos(seq);
2247                 if (rc)
2248                         goto out;
2249         }
2250
2251         st->state = TCP_SEQ_STATE_LISTENING;
2252         st->num = 0;
2253         st->bucket = 0;
2254         st->offset = 0;
2255         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2256
2257 out:
2258         st->last_pos = *pos;
2259         return rc;
2260 }
2261
2262 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2263 {
2264         struct tcp_iter_state *st = seq->private;
2265         void *rc = NULL;
2266
2267         if (v == SEQ_START_TOKEN) {
2268                 rc = tcp_get_idx(seq, 0);
2269                 goto out;
2270         }
2271
2272         switch (st->state) {
2273         case TCP_SEQ_STATE_OPENREQ:
2274         case TCP_SEQ_STATE_LISTENING:
2275                 rc = listening_get_next(seq, v);
2276                 if (!rc) {
2277                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2278                         st->bucket = 0;
2279                         st->offset = 0;
2280                         rc        = established_get_first(seq);
2281                 }
2282                 break;
2283         case TCP_SEQ_STATE_ESTABLISHED:
2284         case TCP_SEQ_STATE_TIME_WAIT:
2285                 rc = established_get_next(seq, v);
2286                 break;
2287         }
2288 out:
2289         ++*pos;
2290         st->last_pos = *pos;
2291         return rc;
2292 }
2293
2294 static void tcp_seq_stop(struct seq_file *seq, void *v)
2295 {
2296         struct tcp_iter_state *st = seq->private;
2297
2298         switch (st->state) {
2299         case TCP_SEQ_STATE_OPENREQ:
2300                 if (v) {
2301                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2302                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2303                 }
2304         case TCP_SEQ_STATE_LISTENING:
2305                 if (v != SEQ_START_TOKEN)
2306                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2307                 break;
2308         case TCP_SEQ_STATE_TIME_WAIT:
2309         case TCP_SEQ_STATE_ESTABLISHED:
2310                 if (v)
2311                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2312                 break;
2313         }
2314 }
2315
2316 static int tcp_seq_open(struct inode *inode, struct file *file)
2317 {
2318         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2319         struct tcp_iter_state *s;
2320         int err;
2321
2322         err = seq_open_net(inode, file, &afinfo->seq_ops,
2323                           sizeof(struct tcp_iter_state));
2324         if (err < 0)
2325                 return err;
2326
2327         s = ((struct seq_file *)file->private_data)->private;
2328         s->family               = afinfo->family;
2329         s->last_pos             = 0;
2330         return 0;
2331 }
2332
2333 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2334 {
2335         int rc = 0;
2336         struct proc_dir_entry *p;
2337
2338         afinfo->seq_fops.open           = tcp_seq_open;
2339         afinfo->seq_fops.read           = seq_read;
2340         afinfo->seq_fops.llseek         = seq_lseek;
2341         afinfo->seq_fops.release        = seq_release_net;
2342
2343         afinfo->seq_ops.start           = tcp_seq_start;
2344         afinfo->seq_ops.next            = tcp_seq_next;
2345         afinfo->seq_ops.stop            = tcp_seq_stop;
2346
2347         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2348                              &afinfo->seq_fops, afinfo);
2349         if (!p)
2350                 rc = -ENOMEM;
2351         return rc;
2352 }
2353 EXPORT_SYMBOL(tcp_proc_register);
2354
2355 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2356 {
2357         proc_net_remove(net, afinfo->name);
2358 }
2359 EXPORT_SYMBOL(tcp_proc_unregister);
2360
2361 static void get_openreq4(struct sock *sk, struct request_sock *req,
2362                          struct seq_file *f, int i, int uid, int *len)
2363 {
2364         const struct inet_request_sock *ireq = inet_rsk(req);
2365         int ttd = req->expires - jiffies;
2366
2367         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2368                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2369                 i,
2370                 ireq->loc_addr,
2371                 ntohs(inet_sk(sk)->inet_sport),
2372                 ireq->rmt_addr,
2373                 ntohs(ireq->rmt_port),
2374                 TCP_SYN_RECV,
2375                 0, 0, /* could print option size, but that is af dependent. */
2376                 1,    /* timers active (only the expire timer) */
2377                 jiffies_to_clock_t(ttd),
2378                 req->retrans,
2379                 uid,
2380                 0,  /* non standard timer */
2381                 0, /* open_requests have no inode */
2382                 atomic_read(&sk->sk_refcnt),
2383                 req,
2384                 len);
2385 }
2386
2387 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2388 {
2389         int timer_active;
2390         unsigned long timer_expires;
2391         struct tcp_sock *tp = tcp_sk(sk);
2392         const struct inet_connection_sock *icsk = inet_csk(sk);
2393         struct inet_sock *inet = inet_sk(sk);
2394         __be32 dest = inet->inet_daddr;
2395         __be32 src = inet->inet_rcv_saddr;
2396         __u16 destp = ntohs(inet->inet_dport);
2397         __u16 srcp = ntohs(inet->inet_sport);
2398         int rx_queue;
2399
2400         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2401                 timer_active    = 1;
2402                 timer_expires   = icsk->icsk_timeout;
2403         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2404                 timer_active    = 4;
2405                 timer_expires   = icsk->icsk_timeout;
2406         } else if (timer_pending(&sk->sk_timer)) {
2407                 timer_active    = 2;
2408                 timer_expires   = sk->sk_timer.expires;
2409         } else {
2410                 timer_active    = 0;
2411                 timer_expires = jiffies;
2412         }
2413
2414         if (sk->sk_state == TCP_LISTEN)
2415                 rx_queue = sk->sk_ack_backlog;
2416         else
2417                 /*
2418                  * because we dont lock socket, we might find a transient negative value
2419                  */
2420                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2421
2422         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2423                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2424                 i, src, srcp, dest, destp, sk->sk_state,
2425                 tp->write_seq - tp->snd_una,
2426                 rx_queue,
2427                 timer_active,
2428                 jiffies_to_clock_t(timer_expires - jiffies),
2429                 icsk->icsk_retransmits,
2430                 sock_i_uid(sk),
2431                 icsk->icsk_probes_out,
2432                 sock_i_ino(sk),
2433                 atomic_read(&sk->sk_refcnt), sk,
2434                 jiffies_to_clock_t(icsk->icsk_rto),
2435                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2436                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2437                 tp->snd_cwnd,
2438                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2439                 len);
2440 }
2441
2442 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2443                                struct seq_file *f, int i, int *len)
2444 {
2445         __be32 dest, src;
2446         __u16 destp, srcp;
2447         int ttd = tw->tw_ttd - jiffies;
2448
2449         if (ttd < 0)
2450                 ttd = 0;
2451
2452         dest  = tw->tw_daddr;
2453         src   = tw->tw_rcv_saddr;
2454         destp = ntohs(tw->tw_dport);
2455         srcp  = ntohs(tw->tw_sport);
2456
2457         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2458                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2459                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2460                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2461                 atomic_read(&tw->tw_refcnt), tw, len);
2462 }
2463
2464 #define TMPSZ 150
2465
2466 static int tcp4_seq_show(struct seq_file *seq, void *v)
2467 {
2468         struct tcp_iter_state *st;
2469         int len;
2470
2471         if (v == SEQ_START_TOKEN) {
2472                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2473                            "  sl  local_address rem_address   st tx_queue "
2474                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2475                            "inode");
2476                 goto out;
2477         }
2478         st = seq->private;
2479
2480         switch (st->state) {
2481         case TCP_SEQ_STATE_LISTENING:
2482         case TCP_SEQ_STATE_ESTABLISHED:
2483                 get_tcp4_sock(v, seq, st->num, &len);
2484                 break;
2485         case TCP_SEQ_STATE_OPENREQ:
2486                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2487                 break;
2488         case TCP_SEQ_STATE_TIME_WAIT:
2489                 get_timewait4_sock(v, seq, st->num, &len);
2490                 break;
2491         }
2492         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2493 out:
2494         return 0;
2495 }
2496
2497 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2498         .name           = "tcp",
2499         .family         = AF_INET,
2500         .seq_fops       = {
2501                 .owner          = THIS_MODULE,
2502         },
2503         .seq_ops        = {
2504                 .show           = tcp4_seq_show,
2505         },
2506 };
2507
2508 static int __net_init tcp4_proc_init_net(struct net *net)
2509 {
2510         return tcp_proc_register(net, &tcp4_seq_afinfo);
2511 }
2512
2513 static void __net_exit tcp4_proc_exit_net(struct net *net)
2514 {
2515         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2516 }
2517
2518 static struct pernet_operations tcp4_net_ops = {
2519         .init = tcp4_proc_init_net,
2520         .exit = tcp4_proc_exit_net,
2521 };
2522
2523 int __init tcp4_proc_init(void)
2524 {
2525         return register_pernet_subsys(&tcp4_net_ops);
2526 }
2527
2528 void tcp4_proc_exit(void)
2529 {
2530         unregister_pernet_subsys(&tcp4_net_ops);
2531 }
2532 #endif /* CONFIG_PROC_FS */
2533
2534 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2535 {
2536         const struct iphdr *iph = skb_gro_network_header(skb);
2537
2538         switch (skb->ip_summed) {
2539         case CHECKSUM_COMPLETE:
2540                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2541                                   skb->csum)) {
2542                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2543                         break;
2544                 }
2545
2546                 /* fall through */
2547         case CHECKSUM_NONE:
2548                 NAPI_GRO_CB(skb)->flush = 1;
2549                 return NULL;
2550         }
2551
2552         return tcp_gro_receive(head, skb);
2553 }
2554
2555 int tcp4_gro_complete(struct sk_buff *skb)
2556 {
2557         const struct iphdr *iph = ip_hdr(skb);
2558         struct tcphdr *th = tcp_hdr(skb);
2559
2560         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2561                                   iph->saddr, iph->daddr, 0);
2562         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2563
2564         return tcp_gro_complete(skb);
2565 }
2566
2567 struct proto tcp_prot = {
2568         .name                   = "TCP",
2569         .owner                  = THIS_MODULE,
2570         .close                  = tcp_close,
2571         .connect                = tcp_v4_connect,
2572         .disconnect             = tcp_disconnect,
2573         .accept                 = inet_csk_accept,
2574         .ioctl                  = tcp_ioctl,
2575         .init                   = tcp_v4_init_sock,
2576         .destroy                = tcp_v4_destroy_sock,
2577         .shutdown               = tcp_shutdown,
2578         .setsockopt             = tcp_setsockopt,
2579         .getsockopt             = tcp_getsockopt,
2580         .recvmsg                = tcp_recvmsg,
2581         .sendmsg                = tcp_sendmsg,
2582         .sendpage               = tcp_sendpage,
2583         .backlog_rcv            = tcp_v4_do_rcv,
2584         .hash                   = inet_hash,
2585         .unhash                 = inet_unhash,
2586         .get_port               = inet_csk_get_port,
2587         .enter_memory_pressure  = tcp_enter_memory_pressure,
2588         .sockets_allocated      = &tcp_sockets_allocated,
2589         .orphan_count           = &tcp_orphan_count,
2590         .memory_allocated       = &tcp_memory_allocated,
2591         .memory_pressure        = &tcp_memory_pressure,
2592         .sysctl_mem             = sysctl_tcp_mem,
2593         .sysctl_wmem            = sysctl_tcp_wmem,
2594         .sysctl_rmem            = sysctl_tcp_rmem,
2595         .max_header             = MAX_TCP_HEADER,
2596         .obj_size               = sizeof(struct tcp_sock),
2597         .slab_flags             = SLAB_DESTROY_BY_RCU,
2598         .twsk_prot              = &tcp_timewait_sock_ops,
2599         .rsk_prot               = &tcp_request_sock_ops,
2600         .h.hashinfo             = &tcp_hashinfo,
2601         .no_autobind            = true,
2602 #ifdef CONFIG_COMPAT
2603         .compat_setsockopt      = compat_tcp_setsockopt,
2604         .compat_getsockopt      = compat_tcp_getsockopt,
2605 #endif
2606 };
2607 EXPORT_SYMBOL(tcp_prot);
2608
2609
2610 static int __net_init tcp_sk_init(struct net *net)
2611 {
2612         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2613                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2614 }
2615
2616 static void __net_exit tcp_sk_exit(struct net *net)
2617 {
2618         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2619 }
2620
2621 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2622 {
2623         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2624 }
2625
2626 static struct pernet_operations __net_initdata tcp_sk_ops = {
2627        .init       = tcp_sk_init,
2628        .exit       = tcp_sk_exit,
2629        .exit_batch = tcp_sk_exit_batch,
2630 };
2631
2632 void __init tcp_v4_init(void)
2633 {
2634         inet_hashinfo_init(&tcp_hashinfo);
2635         if (register_pernet_subsys(&tcp_sk_ops))
2636                 panic("Failed to create the TCP control socket.\n");
2637 }