net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63 #include <linux/slab.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75
  76 #include <linux/inet.h>
  77 #include <linux/ipv6.h>
  78 #include <linux/stddef.h>
  79 #include <linux/proc_fs.h>
  80 #include <linux/seq_file.h>
  81
  82 #include <linux/crypto.h>
  83 #include <linux/scatterlist.h>
  84
  85 int sysctl_tcp_tw_reuse __read_mostly;
  86 int sysctl_tcp_low_latency __read_mostly;
  87 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  88
  89
  90 #ifdef CONFIG_TCP_MD5SIG
  91 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  92                                                    __be32 addr);
  93 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  94                                __be32 daddr, __be32 saddr, struct tcphdr *th);
  95 #else
  96 static inline
  97 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  98 {
  99         return NULL;
 100 }
 101 #endif
 102
 103 struct inet_hashinfo tcp_hashinfo;
 104 EXPORT_SYMBOL(tcp_hashinfo);
 105
 106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 107 {
 108         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 109                                           ip_hdr(skb)->saddr,
 110                                           tcp_hdr(skb)->dest,
 111                                           tcp_hdr(skb)->source);
 112 }
 113
 114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 115 {
 116         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 117         struct tcp_sock *tp = tcp_sk(sk);
 118
 119         /* With PAWS, it is safe from the viewpoint
 120            of data integrity. Even without PAWS it is safe provided sequence
 121            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 122
 123            Actually, the idea is close to VJ's one, only timestamp cache is
 124            held not per host, but per port pair and TW bucket is used as state
 125            holder.
 126
 127            If TW bucket has been already destroyed we fall back to VJ's scheme
 128            and use initial timestamp retrieved from peer table.
 129          */
 130         if (tcptw->tw_ts_recent_stamp &&
 131             (twp == NULL || (sysctl_tcp_tw_reuse &&
 132                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 133                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 134                 if (tp->write_seq == 0)
 135                         tp->write_seq = 1;
 136                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 137                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 138                 sock_hold(sktw);
 139                 return 1;
 140         }
 141
 142         return 0;
 143 }
 144 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 145
 146 /* This will initiate an outgoing connection. */
 147 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 148 {
 149         struct inet_sock *inet = inet_sk(sk);
 150         struct tcp_sock *tp = tcp_sk(sk);
 151         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 152         struct rtable *rt;
 153         __be32 daddr, nexthop;
 154         int tmp;
 155         int err;
 156
 157         if (addr_len < sizeof(struct sockaddr_in))
 158                 return -EINVAL;
 159
 160         if (usin->sin_family != AF_INET)
 161                 return -EAFNOSUPPORT;
 162
 163         nexthop = daddr = usin->sin_addr.s_addr;
 164         if (inet->opt && inet->opt->srr) {
 165                 if (!daddr)
 166                         return -EINVAL;
 167                 nexthop = inet->opt->faddr;
 168         }
 169
 170         tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
 171                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 172                                IPPROTO_TCP,
 173                                inet->inet_sport, usin->sin_port, sk, 1);
 174         if (tmp < 0) {
 175                 if (tmp == -ENETUNREACH)
 176                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 177                 return tmp;
 178         }
 179
 180         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 181                 ip_rt_put(rt);
 182                 return -ENETUNREACH;
 183         }
 184
 185         if (!inet->opt || !inet->opt->srr)
 186                 daddr = rt->rt_dst;
 187
 188         if (!inet->inet_saddr)
 189                 inet->inet_saddr = rt->rt_src;
 190         inet->inet_rcv_saddr = inet->inet_saddr;
 191
 192         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 193                 /* Reset inherited state */
 194                 tp->rx_opt.ts_recent       = 0;
 195                 tp->rx_opt.ts_recent_stamp = 0;
 196                 tp->write_seq              = 0;
 197         }
 198
 199         if (tcp_death_row.sysctl_tw_recycle &&
 200             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 201                 struct inet_peer *peer = rt_get_peer(rt);
 202                 /*
 203                  * VJ's idea. We save last timestamp seen from
 204                  * the destination in peer table, when entering state
 205                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 206                  * when trying new connection.
 207                  */
 208                 if (peer) {
 209                         inet_peer_refcheck(peer);
 210                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 211                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 212                                 tp->rx_opt.ts_recent = peer->tcp_ts;
 213                         }
 214                 }
 215         }
 216
 217         inet->inet_dport = usin->sin_port;
 218         inet->inet_daddr = daddr;
 219
 220         inet_csk(sk)->icsk_ext_hdr_len = 0;
 221         if (inet->opt)
 222                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 223
 224         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 225
 226         /* Socket identity is still unknown (sport may be zero).
 227          * However we set state to SYN-SENT and not releasing socket
 228          * lock select source port, enter ourselves into the hash tables and
 229          * complete initialization after this.
 230          */
 231         tcp_set_state(sk, TCP_SYN_SENT);
 232         err = inet_hash_connect(&tcp_death_row, sk);
 233         if (err)
 234                 goto failure;
 235
 236         err = ip_route_newports(&rt, IPPROTO_TCP,
 237                                 inet->inet_sport, inet->inet_dport, sk);
 238         if (err)
 239                 goto failure;
 240
 241         /* OK, now commit destination to socket.  */
 242         sk->sk_gso_type = SKB_GSO_TCPV4;
 243         sk_setup_caps(sk, &rt->dst);
 244
 245         if (!tp->write_seq)
 246                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 247                                                            inet->inet_daddr,
 248                                                            inet->inet_sport,
 249                                                            usin->sin_port);
 250
 251         inet->inet_id = tp->write_seq ^ jiffies;
 252
 253         err = tcp_connect(sk);
 254         rt = NULL;
 255         if (err)
 256                 goto failure;
 257
 258         return 0;
 259
 260 failure:
 261         /*
 262          * This unhashes the socket and releases the local port,
 263          * if necessary.
 264          */
 265         tcp_set_state(sk, TCP_CLOSE);
 266         ip_rt_put(rt);
 267         sk->sk_route_caps = 0;
 268         inet->inet_dport = 0;
 269         return err;
 270 }
 271 EXPORT_SYMBOL(tcp_v4_connect);
 272
 273 /*
 274  * This routine does path mtu discovery as defined in RFC1191.
 275  */
 276 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 277 {
 278         struct dst_entry *dst;
 279         struct inet_sock *inet = inet_sk(sk);
 280
 281         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 282          * send out by Linux are always <576bytes so they should go through
 283          * unfragmented).
 284          */
 285         if (sk->sk_state == TCP_LISTEN)
 286                 return;
 287
 288         /* We don't check in the destentry if pmtu discovery is forbidden
 289          * on this route. We just assume that no packet_to_big packets
 290          * are send back when pmtu discovery is not active.
 291          * There is a small race when the user changes this flag in the
 292          * route, but I think that's acceptable.
 293          */
 294         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 295                 return;
 296
 297         dst->ops->update_pmtu(dst, mtu);
 298
 299         /* Something is about to be wrong... Remember soft error
 300          * for the case, if this connection will not able to recover.
 301          */
 302         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 303                 sk->sk_err_soft = EMSGSIZE;
 304
 305         mtu = dst_mtu(dst);
 306
 307         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 308             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 309                 tcp_sync_mss(sk, mtu);
 310
 311                 /* Resend the TCP packet because it's
 312                  * clear that the old packet has been
 313                  * dropped. This is the new "fast" path mtu
 314                  * discovery.
 315                  */
 316                 tcp_simple_retransmit(sk);
 317         } /* else let the usual retransmit timer handle it */
 318 }
 319
 320 /*
 321  * This routine is called by the ICMP module when it gets some
 322  * sort of error condition.  If err < 0 then the socket should
 323  * be closed and the error returned to the user.  If err > 0
 324  * it's just the icmp type << 8 | icmp code.  After adjustment
 325  * header points to the first 8 bytes of the tcp header.  We need
 326  * to find the appropriate port.
 327  *
 328  * The locking strategy used here is very "optimistic". When
 329  * someone else accesses the socket the ICMP is just dropped
 330  * and for some paths there is no check at all.
 331  * A more general error queue to queue errors for later handling
 332  * is probably better.
 333  *
 334  */
 335
 336 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 337 {
 338         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
 339         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 340         struct inet_connection_sock *icsk;
 341         struct tcp_sock *tp;
 342         struct inet_sock *inet;
 343         const int type = icmp_hdr(icmp_skb)->type;
 344         const int code = icmp_hdr(icmp_skb)->code;
 345         struct sock *sk;
 346         struct sk_buff *skb;
 347         __u32 seq;
 348         __u32 remaining;
 349         int err;
 350         struct net *net = dev_net(icmp_skb->dev);
 351
 352         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 353                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 354                 return;
 355         }
 356
 357         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 358                         iph->saddr, th->source, inet_iif(icmp_skb));
 359         if (!sk) {
 360                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 361                 return;
 362         }
 363         if (sk->sk_state == TCP_TIME_WAIT) {
 364                 inet_twsk_put(inet_twsk(sk));
 365                 return;
 366         }
 367
 368         bh_lock_sock(sk);
 369         /* If too many ICMPs get dropped on busy
 370          * servers this needs to be solved differently.
 371          */
 372         if (sock_owned_by_user(sk))
 373                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 374
 375         if (sk->sk_state == TCP_CLOSE)
 376                 goto out;
 377
 378         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 379                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 380                 goto out;
 381         }
 382
 383         icsk = inet_csk(sk);
 384         tp = tcp_sk(sk);
 385         seq = ntohl(th->seq);
 386         if (sk->sk_state != TCP_LISTEN &&
 387             !between(seq, tp->snd_una, tp->snd_nxt)) {
 388                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 389                 goto out;
 390         }
 391
 392         switch (type) {
 393         case ICMP_SOURCE_QUENCH:
 394                 /* Just silently ignore these. */
 395                 goto out;
 396         case ICMP_PARAMETERPROB:
 397                 err = EPROTO;
 398                 break;
 399         case ICMP_DEST_UNREACH:
 400                 if (code > NR_ICMP_UNREACH)
 401                         goto out;
 402
 403                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 404                         if (!sock_owned_by_user(sk))
 405                                 do_pmtu_discovery(sk, iph, info);
 406                         goto out;
 407                 }
 408
 409                 err = icmp_err_convert[code].errno;
 410                 /* check if icmp_skb allows revert of backoff
 411                  * (see draft-zimmermann-tcp-lcd) */
 412                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 413                         break;
 414                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 415                     !icsk->icsk_backoff)
 416                         break;
 417
 418                 icsk->icsk_backoff--;
 419                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 420                                          icsk->icsk_backoff;
 421                 tcp_bound_rto(sk);
 422
 423                 skb = tcp_write_queue_head(sk);
 424                 BUG_ON(!skb);
 425
 426                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 427                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 428
 429                 if (remaining) {
 430                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 431                                                   remaining, TCP_RTO_MAX);
 432                 } else if (sock_owned_by_user(sk)) {
 433                         /* RTO revert clocked out retransmission,
 434                          * but socket is locked. Will defer. */
 435                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 436                                                   HZ/20, TCP_RTO_MAX);
 437                 } else {
 438                         /* RTO revert clocked out retransmission.
 439                          * Will retransmit now */
 440                         tcp_retransmit_timer(sk);
 441                 }
 442
 443                 break;
 444         case ICMP_TIME_EXCEEDED:
 445                 err = EHOSTUNREACH;
 446                 break;
 447         default:
 448                 goto out;
 449         }
 450
 451         switch (sk->sk_state) {
 452                 struct request_sock *req, **prev;
 453         case TCP_LISTEN:
 454                 if (sock_owned_by_user(sk))
 455                         goto out;
 456
 457                 req = inet_csk_search_req(sk, &prev, th->dest,
 458                                           iph->daddr, iph->saddr);
 459                 if (!req)
 460                         goto out;
 461
 462                 /* ICMPs are not backlogged, hence we cannot get
 463                    an established socket here.
 464                  */
 465                 WARN_ON(req->sk);
 466
 467                 if (seq != tcp_rsk(req)->snt_isn) {
 468                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 469                         goto out;
 470                 }
 471
 472                 /*
 473                  * Still in SYN_RECV, just remove it silently.
 474                  * There is no good way to pass the error to the newly
 475                  * created socket, and POSIX does not want network
 476                  * errors returned from accept().
 477                  */
 478                 inet_csk_reqsk_queue_drop(sk, req, prev);
 479                 goto out;
 480
 481         case TCP_SYN_SENT:
 482         case TCP_SYN_RECV:  /* Cannot happen.
 483                                It can f.e. if SYNs crossed.
 484                              */
 485                 if (!sock_owned_by_user(sk)) {
 486                         sk->sk_err = err;
 487
 488                         sk->sk_error_report(sk);
 489
 490                         tcp_done(sk);
 491                 } else {
 492                         sk->sk_err_soft = err;
 493                 }
 494                 goto out;
 495         }
 496
 497         /* If we've already connected we will keep trying
 498          * until we time out, or the user gives up.
 499          *
 500          * rfc1122 4.2.3.9 allows to consider as hard errors
 501          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 502          * but it is obsoleted by pmtu discovery).
 503          *
 504          * Note, that in modern internet, where routing is unreliable
 505          * and in each dark corner broken firewalls sit, sending random
 506          * errors ordered by their masters even this two messages finally lose
 507          * their original sense (even Linux sends invalid PORT_UNREACHs)
 508          *
 509          * Now we are in compliance with RFCs.
 510          *                                                      --ANK (980905)
 511          */
 512
 513         inet = inet_sk(sk);
 514         if (!sock_owned_by_user(sk) && inet->recverr) {
 515                 sk->sk_err = err;
 516                 sk->sk_error_report(sk);
 517         } else  { /* Only an error on timeout */
 518                 sk->sk_err_soft = err;
 519         }
 520
 521 out:
 522         bh_unlock_sock(sk);
 523         sock_put(sk);
 524 }
 525
 526 static void __tcp_v4_send_check(struct sk_buff *skb,
 527                                 __be32 saddr, __be32 daddr)
 528 {
 529         struct tcphdr *th = tcp_hdr(skb);
 530
 531         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 532                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 533                 skb->csum_start = skb_transport_header(skb) - skb->head;
 534                 skb->csum_offset = offsetof(struct tcphdr, check);
 535         } else {
 536                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 537                                          csum_partial(th,
 538                                                       th->doff << 2,
 539                                                       skb->csum));
 540         }
 541 }
 542
 543 /* This routine computes an IPv4 TCP checksum. */
 544 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 545 {
 546         struct inet_sock *inet = inet_sk(sk);
 547
 548         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 549 }
 550 EXPORT_SYMBOL(tcp_v4_send_check);
 551
 552 int tcp_v4_gso_send_check(struct sk_buff *skb)
 553 {
 554         const struct iphdr *iph;
 555         struct tcphdr *th;
 556
 557         if (!pskb_may_pull(skb, sizeof(*th)))
 558                 return -EINVAL;
 559
 560         iph = ip_hdr(skb);
 561         th = tcp_hdr(skb);
 562
 563         th->check = 0;
 564         skb->ip_summed = CHECKSUM_PARTIAL;
 565         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 566         return 0;
 567 }
 568
 569 /*
 570  *      This routine will send an RST to the other tcp.
 571  *
 572  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 573  *                    for reset.
 574  *      Answer: if a packet caused RST, it is not for a socket
 575  *              existing in our system, if it is matched to a socket,
 576  *              it is just duplicate segment or bug in other side's TCP.
 577  *              So that we build reply only basing on parameters
 578  *              arrived with segment.
 579  *      Exception: precedence violation. We do not implement it in any case.
 580  */
 581
 582 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 583 {
 584         struct tcphdr *th = tcp_hdr(skb);
 585         struct {
 586                 struct tcphdr th;
 587 #ifdef CONFIG_TCP_MD5SIG
 588                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 589 #endif
 590         } rep;
 591         struct ip_reply_arg arg;
 592 #ifdef CONFIG_TCP_MD5SIG
 593         struct tcp_md5sig_key *key;
 594 #endif
 595         struct net *net;
 596
 597         /* Never send a reset in response to a reset. */
 598         if (th->rst)
 599                 return;
 600
 601         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 602                 return;
 603
 604         /* Swap the send and the receive. */
 605         memset(&rep, 0, sizeof(rep));
 606         rep.th.dest   = th->source;
 607         rep.th.source = th->dest;
 608         rep.th.doff   = sizeof(struct tcphdr) / 4;
 609         rep.th.rst    = 1;
 610
 611         if (th->ack) {
 612                 rep.th.seq = th->ack_seq;
 613         } else {
 614                 rep.th.ack = 1;
 615                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 616                                        skb->len - (th->doff << 2));
 617         }
 618
 619         memset(&arg, 0, sizeof(arg));
 620         arg.iov[0].iov_base = (unsigned char *)&rep;
 621         arg.iov[0].iov_len  = sizeof(rep.th);
 622
 623 #ifdef CONFIG_TCP_MD5SIG
 624         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 625         if (key) {
 626                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 627                                    (TCPOPT_NOP << 16) |
 628                                    (TCPOPT_MD5SIG << 8) |
 629                                    TCPOLEN_MD5SIG);
 630                 /* Update length and the length the header thinks exists */
 631                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 632                 rep.th.doff = arg.iov[0].iov_len / 4;
 633
 634                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 635                                      key, ip_hdr(skb)->saddr,
 636                                      ip_hdr(skb)->daddr, &rep.th);
 637         }
 638 #endif
 639         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 640                                       ip_hdr(skb)->saddr, /* XXX */
 641                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 642         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 643         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 644
 645         net = dev_net(skb_dst(skb)->dev);
 646         ip_send_reply(net->ipv4.tcp_sock, skb,
 647                       &arg, arg.iov[0].iov_len);
 648
 649         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 650         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 651 }
 652
 653 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 654    outside socket context is ugly, certainly. What can I do?
 655  */
 656
 657 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 658                             u32 win, u32 ts, int oif,
 659                             struct tcp_md5sig_key *key,
 660                             int reply_flags)
 661 {
 662         struct tcphdr *th = tcp_hdr(skb);
 663         struct {
 664                 struct tcphdr th;
 665                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 666 #ifdef CONFIG_TCP_MD5SIG
 667                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 668 #endif
 669                         ];
 670         } rep;
 671         struct ip_reply_arg arg;
 672         struct net *net = dev_net(skb_dst(skb)->dev);
 673
 674         memset(&rep.th, 0, sizeof(struct tcphdr));
 675         memset(&arg, 0, sizeof(arg));
 676
 677         arg.iov[0].iov_base = (unsigned char *)&rep;
 678         arg.iov[0].iov_len  = sizeof(rep.th);
 679         if (ts) {
 680                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 681                                    (TCPOPT_TIMESTAMP << 8) |
 682                                    TCPOLEN_TIMESTAMP);
 683                 rep.opt[1] = htonl(tcp_time_stamp);
 684                 rep.opt[2] = htonl(ts);
 685                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 686         }
 687
 688         /* Swap the send and the receive. */
 689         rep.th.dest    = th->source;
 690         rep.th.source  = th->dest;
 691         rep.th.doff    = arg.iov[0].iov_len / 4;
 692         rep.th.seq     = htonl(seq);
 693         rep.th.ack_seq = htonl(ack);
 694         rep.th.ack     = 1;
 695         rep.th.window  = htons(win);
 696
 697 #ifdef CONFIG_TCP_MD5SIG
 698         if (key) {
 699                 int offset = (ts) ? 3 : 0;
 700
 701                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 702                                           (TCPOPT_NOP << 16) |
 703                                           (TCPOPT_MD5SIG << 8) |
 704                                           TCPOLEN_MD5SIG);
 705                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 706                 rep.th.doff = arg.iov[0].iov_len/4;
 707
 708                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 709                                     key, ip_hdr(skb)->saddr,
 710                                     ip_hdr(skb)->daddr, &rep.th);
 711         }
 712 #endif
 713         arg.flags = reply_flags;
 714         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 715                                       ip_hdr(skb)->saddr, /* XXX */
 716                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 717         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 718         if (oif)
 719                 arg.bound_dev_if = oif;
 720
 721         ip_send_reply(net->ipv4.tcp_sock, skb,
 722                       &arg, arg.iov[0].iov_len);
 723
 724         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 725 }
 726
 727 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 728 {
 729         struct inet_timewait_sock *tw = inet_twsk(sk);
 730         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 731
 732         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 733                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 734                         tcptw->tw_ts_recent,
 735                         tw->tw_bound_dev_if,
 736                         tcp_twsk_md5_key(tcptw),
 737                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 738                         );
 739
 740         inet_twsk_put(tw);
 741 }
 742
 743 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 744                                   struct request_sock *req)
 745 {
 746         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 747                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 748                         req->ts_recent,
 749                         0,
 750                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 751                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 752 }
 753
 754 /*
 755  *      Send a SYN-ACK after having received a SYN.
 756  *      This still operates on a request_sock only, not on a big
 757  *      socket.
 758  */
 759 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 760                               struct request_sock *req,
 761                               struct request_values *rvp)
 762 {
 763         const struct inet_request_sock *ireq = inet_rsk(req);
 764         int err = -1;
 765         struct sk_buff * skb;
 766
 767         /* First, grab a route. */
 768         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 769                 return -1;
 770
 771         skb = tcp_make_synack(sk, dst, req, rvp);
 772
 773         if (skb) {
 774                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 775
 776                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 777                                             ireq->rmt_addr,
 778                                             ireq->opt);
 779                 err = net_xmit_eval(err);
 780         }
 781
 782         dst_release(dst);
 783         return err;
 784 }
 785
 786 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 787                               struct request_values *rvp)
 788 {
 789         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 790         return tcp_v4_send_synack(sk, NULL, req, rvp);
 791 }
 792
 793 /*
 794  *      IPv4 request_sock destructor.
 795  */
 796 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 797 {
 798         kfree(inet_rsk(req)->opt);
 799 }
 800
 801 static void syn_flood_warning(const struct sk_buff *skb)
 802 {
 803         const char *msg;
 804
 805 #ifdef CONFIG_SYN_COOKIES
 806         if (sysctl_tcp_syncookies)
 807                 msg = "Sending cookies";
 808         else
 809 #endif
 810                 msg = "Dropping request";
 811
 812         pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
 813                                 ntohs(tcp_hdr(skb)->dest), msg);
 814 }
 815
 816 /*
 817  * Save and compile IPv4 options into the request_sock if needed.
 818  */
 819 static struct ip_options *tcp_v4_save_options(struct sock *sk,
 820                                               struct sk_buff *skb)
 821 {
 822         struct ip_options *opt = &(IPCB(skb)->opt);
 823         struct ip_options *dopt = NULL;
 824
 825         if (opt && opt->optlen) {
 826                 int opt_size = optlength(opt);
 827                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 828                 if (dopt) {
 829                         if (ip_options_echo(dopt, skb)) {
 830                                 kfree(dopt);
 831                                 dopt = NULL;
 832                         }
 833                 }
 834         }
 835         return dopt;
 836 }
 837
 838 #ifdef CONFIG_TCP_MD5SIG
 839 /*
 840  * RFC2385 MD5 checksumming requires a mapping of
 841  * IP address->MD5 Key.
 842  * We need to maintain these in the sk structure.
 843  */
 844
 845 /* Find the Key structure for an address.  */
 846 static struct tcp_md5sig_key *
 847                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 848 {
 849         struct tcp_sock *tp = tcp_sk(sk);
 850         int i;
 851
 852         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 853                 return NULL;
 854         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 855                 if (tp->md5sig_info->keys4[i].addr == addr)
 856                         return &tp->md5sig_info->keys4[i].base;
 857         }
 858         return NULL;
 859 }
 860
 861 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 862                                          struct sock *addr_sk)
 863 {
 864         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 865 }
 866 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 867
 868 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 869                                                       struct request_sock *req)
 870 {
 871         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 872 }
 873
 874 /* This can be called on a newly created socket, from other files */
 875 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 876                       u8 *newkey, u8 newkeylen)
 877 {
 878         /* Add Key to the list */
 879         struct tcp_md5sig_key *key;
 880         struct tcp_sock *tp = tcp_sk(sk);
 881         struct tcp4_md5sig_key *keys;
 882
 883         key = tcp_v4_md5_do_lookup(sk, addr);
 884         if (key) {
 885                 /* Pre-existing entry - just update that one. */
 886                 kfree(key->key);
 887                 key->key = newkey;
 888                 key->keylen = newkeylen;
 889         } else {
 890                 struct tcp_md5sig_info *md5sig;
 891
 892                 if (!tp->md5sig_info) {
 893                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 894                                                   GFP_ATOMIC);
 895                         if (!tp->md5sig_info) {
 896                                 kfree(newkey);
 897                                 return -ENOMEM;
 898                         }
 899                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 900                 }
 901                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 902                         kfree(newkey);
 903                         return -ENOMEM;
 904                 }
 905                 md5sig = tp->md5sig_info;
 906
 907                 if (md5sig->alloced4 == md5sig->entries4) {
 908                         keys = kmalloc((sizeof(*keys) *
 909                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 910                         if (!keys) {
 911                                 kfree(newkey);
 912                                 tcp_free_md5sig_pool();
 913                                 return -ENOMEM;
 914                         }
 915
 916                         if (md5sig->entries4)
 917                                 memcpy(keys, md5sig->keys4,
 918                                        sizeof(*keys) * md5sig->entries4);
 919
 920                         /* Free old key list, and reference new one */
 921                         kfree(md5sig->keys4);
 922                         md5sig->keys4 = keys;
 923                         md5sig->alloced4++;
 924                 }
 925                 md5sig->entries4++;
 926                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 927                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 928                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 929         }
 930         return 0;
 931 }
 932 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 933
 934 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 935                                u8 *newkey, u8 newkeylen)
 936 {
 937         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 938                                  newkey, newkeylen);
 939 }
 940
 941 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 942 {
 943         struct tcp_sock *tp = tcp_sk(sk);
 944         int i;
 945
 946         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 947                 if (tp->md5sig_info->keys4[i].addr == addr) {
 948                         /* Free the key */
 949                         kfree(tp->md5sig_info->keys4[i].base.key);
 950                         tp->md5sig_info->entries4--;
 951
 952                         if (tp->md5sig_info->entries4 == 0) {
 953                                 kfree(tp->md5sig_info->keys4);
 954                                 tp->md5sig_info->keys4 = NULL;
 955                                 tp->md5sig_info->alloced4 = 0;
 956                         } else if (tp->md5sig_info->entries4 != i) {
 957                                 /* Need to do some manipulation */
 958                                 memmove(&tp->md5sig_info->keys4[i],
 959                                         &tp->md5sig_info->keys4[i+1],
 960                                         (tp->md5sig_info->entries4 - i) *
 961                                          sizeof(struct tcp4_md5sig_key));
 962                         }
 963                         tcp_free_md5sig_pool();
 964                         return 0;
 965                 }
 966         }
 967         return -ENOENT;
 968 }
 969 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 970
 971 static void tcp_v4_clear_md5_list(struct sock *sk)
 972 {
 973         struct tcp_sock *tp = tcp_sk(sk);
 974
 975         /* Free each key, then the set of key keys,
 976          * the crypto element, and then decrement our
 977          * hold on the last resort crypto.
 978          */
 979         if (tp->md5sig_info->entries4) {
 980                 int i;
 981                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 982                         kfree(tp->md5sig_info->keys4[i].base.key);
 983                 tp->md5sig_info->entries4 = 0;
 984                 tcp_free_md5sig_pool();
 985         }
 986         if (tp->md5sig_info->keys4) {
 987                 kfree(tp->md5sig_info->keys4);
 988                 tp->md5sig_info->keys4 = NULL;
 989                 tp->md5sig_info->alloced4  = 0;
 990         }
 991 }
 992
 993 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 994                                  int optlen)
 995 {
 996         struct tcp_md5sig cmd;
 997         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 998         u8 *newkey;
 999
1000         if (optlen < sizeof(cmd))
1001                 return -EINVAL;
1002
1003         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1004                 return -EFAULT;
1005
1006         if (sin->sin_family != AF_INET)
1007                 return -EINVAL;
1008
1009         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1010                 if (!tcp_sk(sk)->md5sig_info)
1011                         return -ENOENT;
1012                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1013         }
1014
1015         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1016                 return -EINVAL;
1017
1018         if (!tcp_sk(sk)->md5sig_info) {
1019                 struct tcp_sock *tp = tcp_sk(sk);
1020                 struct tcp_md5sig_info *p;
1021
1022                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1023                 if (!p)
1024                         return -EINVAL;
1025
1026                 tp->md5sig_info = p;
1027                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1028         }
1029
1030         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1031         if (!newkey)
1032                 return -ENOMEM;
1033         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1034                                  newkey, cmd.tcpm_keylen);
1035 }
1036
1037 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1038                                         __be32 daddr, __be32 saddr, int nbytes)
1039 {
1040         struct tcp4_pseudohdr *bp;
1041         struct scatterlist sg;
1042
1043         bp = &hp->md5_blk.ip4;
1044
1045         /*
1046          * 1. the TCP pseudo-header (in the order: source IP address,
1047          * destination IP address, zero-padded protocol number, and
1048          * segment length)
1049          */
1050         bp->saddr = saddr;
1051         bp->daddr = daddr;
1052         bp->pad = 0;
1053         bp->protocol = IPPROTO_TCP;
1054         bp->len = cpu_to_be16(nbytes);
1055
1056         sg_init_one(&sg, bp, sizeof(*bp));
1057         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1058 }
1059
1060 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1061                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1062 {
1063         struct tcp_md5sig_pool *hp;
1064         struct hash_desc *desc;
1065
1066         hp = tcp_get_md5sig_pool();
1067         if (!hp)
1068                 goto clear_hash_noput;
1069         desc = &hp->md5_desc;
1070
1071         if (crypto_hash_init(desc))
1072                 goto clear_hash;
1073         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1074                 goto clear_hash;
1075         if (tcp_md5_hash_header(hp, th))
1076                 goto clear_hash;
1077         if (tcp_md5_hash_key(hp, key))
1078                 goto clear_hash;
1079         if (crypto_hash_final(desc, md5_hash))
1080                 goto clear_hash;
1081
1082         tcp_put_md5sig_pool();
1083         return 0;
1084
1085 clear_hash:
1086         tcp_put_md5sig_pool();
1087 clear_hash_noput:
1088         memset(md5_hash, 0, 16);
1089         return 1;
1090 }
1091
1092 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1093                         struct sock *sk, struct request_sock *req,
1094                         struct sk_buff *skb)
1095 {
1096         struct tcp_md5sig_pool *hp;
1097         struct hash_desc *desc;
1098         struct tcphdr *th = tcp_hdr(skb);
1099         __be32 saddr, daddr;
1100
1101         if (sk) {
1102                 saddr = inet_sk(sk)->inet_saddr;
1103                 daddr = inet_sk(sk)->inet_daddr;
1104         } else if (req) {
1105                 saddr = inet_rsk(req)->loc_addr;
1106                 daddr = inet_rsk(req)->rmt_addr;
1107         } else {
1108                 const struct iphdr *iph = ip_hdr(skb);
1109                 saddr = iph->saddr;
1110                 daddr = iph->daddr;
1111         }
1112
1113         hp = tcp_get_md5sig_pool();
1114         if (!hp)
1115                 goto clear_hash_noput;
1116         desc = &hp->md5_desc;
1117
1118         if (crypto_hash_init(desc))
1119                 goto clear_hash;
1120
1121         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1122                 goto clear_hash;
1123         if (tcp_md5_hash_header(hp, th))
1124                 goto clear_hash;
1125         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1126                 goto clear_hash;
1127         if (tcp_md5_hash_key(hp, key))
1128                 goto clear_hash;
1129         if (crypto_hash_final(desc, md5_hash))
1130                 goto clear_hash;
1131
1132         tcp_put_md5sig_pool();
1133         return 0;
1134
1135 clear_hash:
1136         tcp_put_md5sig_pool();
1137 clear_hash_noput:
1138         memset(md5_hash, 0, 16);
1139         return 1;
1140 }
1141 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1142
1143 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1144 {
1145         /*
1146          * This gets called for each TCP segment that arrives
1147          * so we want to be efficient.
1148          * We have 3 drop cases:
1149          * o No MD5 hash and one expected.
1150          * o MD5 hash and we're not expecting one.
1151          * o MD5 hash and its wrong.
1152          */
1153         __u8 *hash_location = NULL;
1154         struct tcp_md5sig_key *hash_expected;
1155         const struct iphdr *iph = ip_hdr(skb);
1156         struct tcphdr *th = tcp_hdr(skb);
1157         int genhash;
1158         unsigned char newhash[16];
1159
1160         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1161         hash_location = tcp_parse_md5sig_option(th);
1162
1163         /* We've parsed the options - do we have a hash? */
1164         if (!hash_expected && !hash_location)
1165                 return 0;
1166
1167         if (hash_expected && !hash_location) {
1168                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1169                 return 1;
1170         }
1171
1172         if (!hash_expected && hash_location) {
1173                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1174                 return 1;
1175         }
1176
1177         /* Okay, so this is hash_expected and hash_location -
1178          * so we need to calculate the checksum.
1179          */
1180         genhash = tcp_v4_md5_hash_skb(newhash,
1181                                       hash_expected,
1182                                       NULL, NULL, skb);
1183
1184         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1185                 if (net_ratelimit()) {
1186                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1187                                &iph->saddr, ntohs(th->source),
1188                                &iph->daddr, ntohs(th->dest),
1189                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1190                 }
1191                 return 1;
1192         }
1193         return 0;
1194 }
1195
1196 #endif
1197
1198 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1199         .family         =       PF_INET,
1200         .obj_size       =       sizeof(struct tcp_request_sock),
1201         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1202         .send_ack       =       tcp_v4_reqsk_send_ack,
1203         .destructor     =       tcp_v4_reqsk_destructor,
1204         .send_reset     =       tcp_v4_send_reset,
1205         .syn_ack_timeout =      tcp_syn_ack_timeout,
1206 };
1207
1208 #ifdef CONFIG_TCP_MD5SIG
1209 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1210         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1211         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1212 };
1213 #endif
1214
1215 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1216         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1217         .twsk_unique    = tcp_twsk_unique,
1218         .twsk_destructor= tcp_twsk_destructor,
1219 };
1220
1221 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1222 {
1223         struct tcp_extend_values tmp_ext;
1224         struct tcp_options_received tmp_opt;
1225         u8 *hash_location;
1226         struct request_sock *req;
1227         struct inet_request_sock *ireq;
1228         struct tcp_sock *tp = tcp_sk(sk);
1229         struct dst_entry *dst = NULL;
1230         __be32 saddr = ip_hdr(skb)->saddr;
1231         __be32 daddr = ip_hdr(skb)->daddr;
1232         __u32 isn = TCP_SKB_CB(skb)->when;
1233 #ifdef CONFIG_SYN_COOKIES
1234         int want_cookie = 0;
1235 #else
1236 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1237 #endif
1238
1239         /* Never answer to SYNs send to broadcast or multicast */
1240         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1241                 goto drop;
1242
1243         /* TW buckets are converted to open requests without
1244          * limitations, they conserve resources and peer is
1245          * evidently real one.
1246          */
1247         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1248                 if (net_ratelimit())
1249                         syn_flood_warning(skb);
1250 #ifdef CONFIG_SYN_COOKIES
1251                 if (sysctl_tcp_syncookies) {
1252                         want_cookie = 1;
1253                 } else
1254 #endif
1255                 goto drop;
1256         }
1257
1258         /* Accept backlog is full. If we have already queued enough
1259          * of warm entries in syn queue, drop request. It is better than
1260          * clogging syn queue with openreqs with exponentially increasing
1261          * timeout.
1262          */
1263         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1264                 goto drop;
1265
1266         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1267         if (!req)
1268                 goto drop;
1269
1270 #ifdef CONFIG_TCP_MD5SIG
1271         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1272 #endif
1273
1274         tcp_clear_options(&tmp_opt);
1275         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1276         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1277         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1278
1279         if (tmp_opt.cookie_plus > 0 &&
1280             tmp_opt.saw_tstamp &&
1281             !tp->rx_opt.cookie_out_never &&
1282             (sysctl_tcp_cookie_size > 0 ||
1283              (tp->cookie_values != NULL &&
1284               tp->cookie_values->cookie_desired > 0))) {
1285                 u8 *c;
1286                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1287                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1288
1289                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1290                         goto drop_and_release;
1291
1292                 /* Secret recipe starts with IP addresses */
1293                 *mess++ ^= (__force u32)daddr;
1294                 *mess++ ^= (__force u32)saddr;
1295
1296                 /* plus variable length Initiator Cookie */
1297                 c = (u8 *)mess;
1298                 while (l-- > 0)
1299                         *c++ ^= *hash_location++;
1300
1301 #ifdef CONFIG_SYN_COOKIES
1302                 want_cookie = 0;        /* not our kind of cookie */
1303 #endif
1304                 tmp_ext.cookie_out_never = 0; /* false */
1305                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1306         } else if (!tp->rx_opt.cookie_in_always) {
1307                 /* redundant indications, but ensure initialization. */
1308                 tmp_ext.cookie_out_never = 1; /* true */
1309                 tmp_ext.cookie_plus = 0;
1310         } else {
1311                 goto drop_and_release;
1312         }
1313         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1314
1315         if (want_cookie && !tmp_opt.saw_tstamp)
1316                 tcp_clear_options(&tmp_opt);
1317
1318         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1319         tcp_openreq_init(req, &tmp_opt, skb);
1320
1321         ireq = inet_rsk(req);
1322         ireq->loc_addr = daddr;
1323         ireq->rmt_addr = saddr;
1324         ireq->no_srccheck = inet_sk(sk)->transparent;
1325         ireq->opt = tcp_v4_save_options(sk, skb);
1326
1327         if (security_inet_conn_request(sk, skb, req))
1328                 goto drop_and_free;
1329
1330         if (!want_cookie || tmp_opt.tstamp_ok)
1331                 TCP_ECN_create_request(req, tcp_hdr(skb));
1332
1333         if (want_cookie) {
1334                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1335                 req->cookie_ts = tmp_opt.tstamp_ok;
1336         } else if (!isn) {
1337                 struct inet_peer *peer = NULL;
1338
1339                 /* VJ's idea. We save last timestamp seen
1340                  * from the destination in peer table, when entering
1341                  * state TIME-WAIT, and check against it before
1342                  * accepting new connection request.
1343                  *
1344                  * If "isn" is not zero, this request hit alive
1345                  * timewait bucket, so that all the necessary checks
1346                  * are made in the function processing timewait state.
1347                  */
1348                 if (tmp_opt.saw_tstamp &&
1349                     tcp_death_row.sysctl_tw_recycle &&
1350                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1351                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1352                     peer->v4daddr == saddr) {
1353                         inet_peer_refcheck(peer);
1354                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1355                             (s32)(peer->tcp_ts - req->ts_recent) >
1356                                                         TCP_PAWS_WINDOW) {
1357                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1358                                 goto drop_and_release;
1359                         }
1360                 }
1361                 /* Kill the following clause, if you dislike this way. */
1362                 else if (!sysctl_tcp_syncookies &&
1363                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1364                           (sysctl_max_syn_backlog >> 2)) &&
1365                          (!peer || !peer->tcp_ts_stamp) &&
1366                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1367                         /* Without syncookies last quarter of
1368                          * backlog is filled with destinations,
1369                          * proven to be alive.
1370                          * It means that we continue to communicate
1371                          * to destinations, already remembered
1372                          * to the moment of synflood.
1373                          */
1374                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1375                                        &saddr, ntohs(tcp_hdr(skb)->source));
1376                         goto drop_and_release;
1377                 }
1378
1379                 isn = tcp_v4_init_sequence(skb);
1380         }
1381         tcp_rsk(req)->snt_isn = isn;
1382
1383         if (tcp_v4_send_synack(sk, dst, req,
1384                                (struct request_values *)&tmp_ext) ||
1385             want_cookie)
1386                 goto drop_and_free;
1387
1388         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1389         return 0;
1390
1391 drop_and_release:
1392         dst_release(dst);
1393 drop_and_free:
1394         reqsk_free(req);
1395 drop:
1396         return 0;
1397 }
1398 EXPORT_SYMBOL(tcp_v4_conn_request);
1399
1400
1401 /*
1402  * The three way handshake has completed - we got a valid synack -
1403  * now create the new socket.
1404  */
1405 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1406                                   struct request_sock *req,
1407                                   struct dst_entry *dst)
1408 {
1409         struct inet_request_sock *ireq;
1410         struct inet_sock *newinet;
1411         struct tcp_sock *newtp;
1412         struct sock *newsk;
1413 #ifdef CONFIG_TCP_MD5SIG
1414         struct tcp_md5sig_key *key;
1415 #endif
1416
1417         if (sk_acceptq_is_full(sk))
1418                 goto exit_overflow;
1419
1420         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1421                 goto exit;
1422
1423         newsk = tcp_create_openreq_child(sk, req, skb);
1424         if (!newsk)
1425                 goto exit;
1426
1427         newsk->sk_gso_type = SKB_GSO_TCPV4;
1428         sk_setup_caps(newsk, dst);
1429
1430         newtp                 = tcp_sk(newsk);
1431         newinet               = inet_sk(newsk);
1432         ireq                  = inet_rsk(req);
1433         newinet->inet_daddr   = ireq->rmt_addr;
1434         newinet->inet_rcv_saddr = ireq->loc_addr;
1435         newinet->inet_saddr           = ireq->loc_addr;
1436         newinet->opt          = ireq->opt;
1437         ireq->opt             = NULL;
1438         newinet->mc_index     = inet_iif(skb);
1439         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1440         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1441         if (newinet->opt)
1442                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1443         newinet->inet_id = newtp->write_seq ^ jiffies;
1444
1445         tcp_mtup_init(newsk);
1446         tcp_sync_mss(newsk, dst_mtu(dst));
1447         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1448         if (tcp_sk(sk)->rx_opt.user_mss &&
1449             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1450                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1451
1452         tcp_initialize_rcv_mss(newsk);
1453
1454 #ifdef CONFIG_TCP_MD5SIG
1455         /* Copy over the MD5 key from the original socket */
1456         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1457         if (key != NULL) {
1458                 /*
1459                  * We're using one, so create a matching key
1460                  * on the newsk structure. If we fail to get
1461                  * memory, then we end up not copying the key
1462                  * across. Shucks.
1463                  */
1464                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1465                 if (newkey != NULL)
1466                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1467                                           newkey, key->keylen);
1468                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1469         }
1470 #endif
1471
1472         __inet_hash_nolisten(newsk, NULL);
1473         __inet_inherit_port(sk, newsk);
1474
1475         return newsk;
1476
1477 exit_overflow:
1478         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1479 exit:
1480         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1481         dst_release(dst);
1482         return NULL;
1483 }
1484 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1485
1486 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1487 {
1488         struct tcphdr *th = tcp_hdr(skb);
1489         const struct iphdr *iph = ip_hdr(skb);
1490         struct sock *nsk;
1491         struct request_sock **prev;
1492         /* Find possible connection requests. */
1493         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1494                                                        iph->saddr, iph->daddr);
1495         if (req)
1496                 return tcp_check_req(sk, skb, req, prev);
1497
1498         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1499                         th->source, iph->daddr, th->dest, inet_iif(skb));
1500
1501         if (nsk) {
1502                 if (nsk->sk_state != TCP_TIME_WAIT) {
1503                         bh_lock_sock(nsk);
1504                         return nsk;
1505                 }
1506                 inet_twsk_put(inet_twsk(nsk));
1507                 return NULL;
1508         }
1509
1510 #ifdef CONFIG_SYN_COOKIES
1511         if (!th->syn)
1512                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1513 #endif
1514         return sk;
1515 }
1516
1517 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1518 {
1519         const struct iphdr *iph = ip_hdr(skb);
1520
1521         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1522                 if (!tcp_v4_check(skb->len, iph->saddr,
1523                                   iph->daddr, skb->csum)) {
1524                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1525                         return 0;
1526                 }
1527         }
1528
1529         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1530                                        skb->len, IPPROTO_TCP, 0);
1531
1532         if (skb->len <= 76) {
1533                 return __skb_checksum_complete(skb);
1534         }
1535         return 0;
1536 }
1537
1538
1539 /* The socket must have it's spinlock held when we get
1540  * here.
1541  *
1542  * We have a potential double-lock case here, so even when
1543  * doing backlog processing we use the BH locking scheme.
1544  * This is because we cannot sleep with the original spinlock
1545  * held.
1546  */
1547 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1548 {
1549         struct sock *rsk;
1550 #ifdef CONFIG_TCP_MD5SIG
1551         /*
1552          * We really want to reject the packet as early as possible
1553          * if:
1554          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1555          *  o There is an MD5 option and we're not expecting one
1556          */
1557         if (tcp_v4_inbound_md5_hash(sk, skb))
1558                 goto discard;
1559 #endif
1560
1561         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1562                 sock_rps_save_rxhash(sk, skb->rxhash);
1563                 TCP_CHECK_TIMER(sk);
1564                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1565                         rsk = sk;
1566                         goto reset;
1567                 }
1568                 TCP_CHECK_TIMER(sk);
1569                 return 0;
1570         }
1571
1572         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1573                 goto csum_err;
1574
1575         if (sk->sk_state == TCP_LISTEN) {
1576                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1577                 if (!nsk)
1578                         goto discard;
1579
1580                 if (nsk != sk) {
1581                         if (tcp_child_process(sk, nsk, skb)) {
1582                                 rsk = nsk;
1583                                 goto reset;
1584                         }
1585                         return 0;
1586                 }
1587         } else
1588                 sock_rps_save_rxhash(sk, skb->rxhash);
1589
1590
1591         TCP_CHECK_TIMER(sk);
1592         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1593                 rsk = sk;
1594                 goto reset;
1595         }
1596         TCP_CHECK_TIMER(sk);
1597         return 0;
1598
1599 reset:
1600         tcp_v4_send_reset(rsk, skb);
1601 discard:
1602         kfree_skb(skb);
1603         /* Be careful here. If this function gets more complicated and
1604          * gcc suffers from register pressure on the x86, sk (in %ebx)
1605          * might be destroyed here. This current version compiles correctly,
1606          * but you have been warned.
1607          */
1608         return 0;
1609
1610 csum_err:
1611         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1612         goto discard;
1613 }
1614 EXPORT_SYMBOL(tcp_v4_do_rcv);
1615
1616 /*
1617  *      From tcp_input.c
1618  */
1619
1620 int tcp_v4_rcv(struct sk_buff *skb)
1621 {
1622         const struct iphdr *iph;
1623         struct tcphdr *th;
1624         struct sock *sk;
1625         int ret;
1626         struct net *net = dev_net(skb->dev);
1627
1628         if (skb->pkt_type != PACKET_HOST)
1629                 goto discard_it;
1630
1631         /* Count it even if it's bad */
1632         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1633
1634         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1635                 goto discard_it;
1636
1637         th = tcp_hdr(skb);
1638
1639         if (th->doff < sizeof(struct tcphdr) / 4)
1640                 goto bad_packet;
1641         if (!pskb_may_pull(skb, th->doff * 4))
1642                 goto discard_it;
1643
1644         /* An explanation is required here, I think.
1645          * Packet length and doff are validated by header prediction,
1646          * provided case of th->doff==0 is eliminated.
1647          * So, we defer the checks. */
1648         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1649                 goto bad_packet;
1650
1651         th = tcp_hdr(skb);
1652         iph = ip_hdr(skb);
1653         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1654         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1655                                     skb->len - th->doff * 4);
1656         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1657         TCP_SKB_CB(skb)->when    = 0;
1658         TCP_SKB_CB(skb)->flags   = iph->tos;
1659         TCP_SKB_CB(skb)->sacked  = 0;
1660
1661         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1662         if (!sk)
1663                 goto no_tcp_socket;
1664
1665 process:
1666         if (sk->sk_state == TCP_TIME_WAIT)
1667                 goto do_time_wait;
1668
1669         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1670                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1671                 goto discard_and_relse;
1672         }
1673
1674         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1675                 goto discard_and_relse;
1676         nf_reset(skb);
1677
1678         if (sk_filter(sk, skb))
1679                 goto discard_and_relse;
1680
1681         skb->dev = NULL;
1682
1683         bh_lock_sock_nested(sk);
1684         ret = 0;
1685         if (!sock_owned_by_user(sk)) {
1686 #ifdef CONFIG_NET_DMA
1687                 struct tcp_sock *tp = tcp_sk(sk);
1688                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1689                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1690                 if (tp->ucopy.dma_chan)
1691                         ret = tcp_v4_do_rcv(sk, skb);
1692                 else
1693 #endif
1694                 {
1695                         if (!tcp_prequeue(sk, skb))
1696                                 ret = tcp_v4_do_rcv(sk, skb);
1697                 }
1698         } else if (unlikely(sk_add_backlog(sk, skb))) {
1699                 bh_unlock_sock(sk);
1700                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1701                 goto discard_and_relse;
1702         }
1703         bh_unlock_sock(sk);
1704
1705         sock_put(sk);
1706
1707         return ret;
1708
1709 no_tcp_socket:
1710         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1711                 goto discard_it;
1712
1713         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1714 bad_packet:
1715                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1716         } else {
1717                 tcp_v4_send_reset(NULL, skb);
1718         }
1719
1720 discard_it:
1721         /* Discard frame. */
1722         kfree_skb(skb);
1723         return 0;
1724
1725 discard_and_relse:
1726         sock_put(sk);
1727         goto discard_it;
1728
1729 do_time_wait:
1730         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1731                 inet_twsk_put(inet_twsk(sk));
1732                 goto discard_it;
1733         }
1734
1735         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1736                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1737                 inet_twsk_put(inet_twsk(sk));
1738                 goto discard_it;
1739         }
1740         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1741         case TCP_TW_SYN: {
1742                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1743                                                         &tcp_hashinfo,
1744                                                         iph->daddr, th->dest,
1745                                                         inet_iif(skb));
1746                 if (sk2) {
1747                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1748                         inet_twsk_put(inet_twsk(sk));
1749                         sk = sk2;
1750                         goto process;
1751                 }
1752                 /* Fall through to ACK */
1753         }
1754         case TCP_TW_ACK:
1755                 tcp_v4_timewait_ack(sk, skb);
1756                 break;
1757         case TCP_TW_RST:
1758                 goto no_tcp_socket;
1759         case TCP_TW_SUCCESS:;
1760         }
1761         goto discard_it;
1762 }
1763
1764 /* VJ's idea. Save last timestamp seen from this destination
1765  * and hold it at least for normal timewait interval to use for duplicate
1766  * segment detection in subsequent connections, before they enter synchronized
1767  * state.
1768  */
1769
1770 int tcp_v4_remember_stamp(struct sock *sk)
1771 {
1772         struct inet_sock *inet = inet_sk(sk);
1773         struct tcp_sock *tp = tcp_sk(sk);
1774         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1775         struct inet_peer *peer = NULL;
1776         int release_it = 0;
1777
1778         if (!rt || rt->rt_dst != inet->inet_daddr) {
1779                 peer = inet_getpeer(inet->inet_daddr, 1);
1780                 release_it = 1;
1781         } else {
1782                 if (!rt->peer)
1783                         rt_bind_peer(rt, 1);
1784                 peer = rt->peer;
1785         }
1786
1787         if (peer) {
1788                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1789                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1790                      peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1791                         peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1792                         peer->tcp_ts = tp->rx_opt.ts_recent;
1793                 }
1794                 if (release_it)
1795                         inet_putpeer(peer);
1796                 return 1;
1797         }
1798
1799         return 0;
1800 }
1801 EXPORT_SYMBOL(tcp_v4_remember_stamp);
1802
1803 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1804 {
1805         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1806
1807         if (peer) {
1808                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1809
1810                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1811                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1812                      peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1813                         peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1814                         peer->tcp_ts       = tcptw->tw_ts_recent;
1815                 }
1816                 inet_putpeer(peer);
1817                 return 1;
1818         }
1819
1820         return 0;
1821 }
1822
1823 const struct inet_connection_sock_af_ops ipv4_specific = {
1824         .queue_xmit        = ip_queue_xmit,
1825         .send_check        = tcp_v4_send_check,
1826         .rebuild_header    = inet_sk_rebuild_header,
1827         .conn_request      = tcp_v4_conn_request,
1828         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1829         .remember_stamp    = tcp_v4_remember_stamp,
1830         .net_header_len    = sizeof(struct iphdr),
1831         .setsockopt        = ip_setsockopt,
1832         .getsockopt        = ip_getsockopt,
1833         .addr2sockaddr     = inet_csk_addr2sockaddr,
1834         .sockaddr_len      = sizeof(struct sockaddr_in),
1835         .bind_conflict     = inet_csk_bind_conflict,
1836 #ifdef CONFIG_COMPAT
1837         .compat_setsockopt = compat_ip_setsockopt,
1838         .compat_getsockopt = compat_ip_getsockopt,
1839 #endif
1840 };
1841 EXPORT_SYMBOL(ipv4_specific);
1842
1843 #ifdef CONFIG_TCP_MD5SIG
1844 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1845         .md5_lookup             = tcp_v4_md5_lookup,
1846         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1847         .md5_add                = tcp_v4_md5_add_func,
1848         .md5_parse              = tcp_v4_parse_md5_keys,
1849 };
1850 #endif
1851
1852 /* NOTE: A lot of things set to zero explicitly by call to
1853  *       sk_alloc() so need not be done here.
1854  */
1855 static int tcp_v4_init_sock(struct sock *sk)
1856 {
1857         struct inet_connection_sock *icsk = inet_csk(sk);
1858         struct tcp_sock *tp = tcp_sk(sk);
1859
1860         skb_queue_head_init(&tp->out_of_order_queue);
1861         tcp_init_xmit_timers(sk);
1862         tcp_prequeue_init(tp);
1863
1864         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1865         tp->mdev = TCP_TIMEOUT_INIT;
1866
1867         /* So many TCP implementations out there (incorrectly) count the
1868          * initial SYN frame in their delayed-ACK and congestion control
1869          * algorithms that we must have the following bandaid to talk
1870          * efficiently to them.  -DaveM
1871          */
1872         tp->snd_cwnd = 2;
1873
1874         /* See draft-stevens-tcpca-spec-01 for discussion of the
1875          * initialization of these values.
1876          */
1877         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1878         tp->snd_cwnd_clamp = ~0;
1879         tp->mss_cache = TCP_MSS_DEFAULT;
1880
1881         tp->reordering = sysctl_tcp_reordering;
1882         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1883
1884         sk->sk_state = TCP_CLOSE;
1885
1886         sk->sk_write_space = sk_stream_write_space;
1887         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1888
1889         icsk->icsk_af_ops = &ipv4_specific;
1890         icsk->icsk_sync_mss = tcp_sync_mss;
1891 #ifdef CONFIG_TCP_MD5SIG
1892         tp->af_specific = &tcp_sock_ipv4_specific;
1893 #endif
1894
1895         /* TCP Cookie Transactions */
1896         if (sysctl_tcp_cookie_size > 0) {
1897                 /* Default, cookies without s_data_payload. */
1898                 tp->cookie_values =
1899                         kzalloc(sizeof(*tp->cookie_values),
1900                                 sk->sk_allocation);
1901                 if (tp->cookie_values != NULL)
1902                         kref_init(&tp->cookie_values->kref);
1903         }
1904         /* Presumed zeroed, in order of appearance:
1905          *      cookie_in_always, cookie_out_never,
1906          *      s_data_constant, s_data_in, s_data_out
1907          */
1908         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1909         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1910
1911         local_bh_disable();
1912         percpu_counter_inc(&tcp_sockets_allocated);
1913         local_bh_enable();
1914
1915         return 0;
1916 }
1917
1918 void tcp_v4_destroy_sock(struct sock *sk)
1919 {
1920         struct tcp_sock *tp = tcp_sk(sk);
1921
1922         tcp_clear_xmit_timers(sk);
1923
1924         tcp_cleanup_congestion_control(sk);
1925
1926         /* Cleanup up the write buffer. */
1927         tcp_write_queue_purge(sk);
1928
1929         /* Cleans up our, hopefully empty, out_of_order_queue. */
1930         __skb_queue_purge(&tp->out_of_order_queue);
1931
1932 #ifdef CONFIG_TCP_MD5SIG
1933         /* Clean up the MD5 key list, if any */
1934         if (tp->md5sig_info) {
1935                 tcp_v4_clear_md5_list(sk);
1936                 kfree(tp->md5sig_info);
1937                 tp->md5sig_info = NULL;
1938         }
1939 #endif
1940
1941 #ifdef CONFIG_NET_DMA
1942         /* Cleans up our sk_async_wait_queue */
1943         __skb_queue_purge(&sk->sk_async_wait_queue);
1944 #endif
1945
1946         /* Clean prequeue, it must be empty really */
1947         __skb_queue_purge(&tp->ucopy.prequeue);
1948
1949         /* Clean up a referenced TCP bind bucket. */
1950         if (inet_csk(sk)->icsk_bind_hash)
1951                 inet_put_port(sk);
1952
1953         /*
1954          * If sendmsg cached page exists, toss it.
1955          */
1956         if (sk->sk_sndmsg_page) {
1957                 __free_page(sk->sk_sndmsg_page);
1958                 sk->sk_sndmsg_page = NULL;
1959         }
1960
1961         /* TCP Cookie Transactions */
1962         if (tp->cookie_values != NULL) {
1963                 kref_put(&tp->cookie_values->kref,
1964                          tcp_cookie_values_release);
1965                 tp->cookie_values = NULL;
1966         }
1967
1968         percpu_counter_dec(&tcp_sockets_allocated);
1969 }
1970 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1971
1972 #ifdef CONFIG_PROC_FS
1973 /* Proc filesystem TCP sock list dumping. */
1974
1975 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1976 {
1977         return hlist_nulls_empty(head) ? NULL :
1978                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1979 }
1980
1981 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1982 {
1983         return !is_a_nulls(tw->tw_node.next) ?
1984                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1985 }
1986
1987 /*
1988  * Get next listener socket follow cur.  If cur is NULL, get first socket
1989  * starting from bucket given in st->bucket; when st->bucket is zero the
1990  * very first socket in the hash table is returned.
1991  */
1992 static void *listening_get_next(struct seq_file *seq, void *cur)
1993 {
1994         struct inet_connection_sock *icsk;
1995         struct hlist_nulls_node *node;
1996         struct sock *sk = cur;
1997         struct inet_listen_hashbucket *ilb;
1998         struct tcp_iter_state *st = seq->private;
1999         struct net *net = seq_file_net(seq);
2000
2001         if (!sk) {
2002                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2003                 spin_lock_bh(&ilb->lock);
2004                 sk = sk_nulls_head(&ilb->head);
2005                 st->offset = 0;
2006                 goto get_sk;
2007         }
2008         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2009         ++st->num;
2010         ++st->offset;
2011
2012         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2013                 struct request_sock *req = cur;
2014
2015                 icsk = inet_csk(st->syn_wait_sk);
2016                 req = req->dl_next;
2017                 while (1) {
2018                         while (req) {
2019                                 if (req->rsk_ops->family == st->family) {
2020                                         cur = req;
2021                                         goto out;
2022                                 }
2023                                 req = req->dl_next;
2024                         }
2025                         st->offset = 0;
2026                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2027                                 break;
2028 get_req:
2029                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2030                 }
2031                 sk        = sk_next(st->syn_wait_sk);
2032                 st->state = TCP_SEQ_STATE_LISTENING;
2033                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2034         } else {
2035                 icsk = inet_csk(sk);
2036                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2037                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2038                         goto start_req;
2039                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2040                 sk = sk_next(sk);
2041         }
2042 get_sk:
2043         sk_nulls_for_each_from(sk, node) {
2044                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2045                         cur = sk;
2046                         goto out;
2047                 }
2048                 icsk = inet_csk(sk);
2049                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2050                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2051 start_req:
2052                         st->uid         = sock_i_uid(sk);
2053                         st->syn_wait_sk = sk;
2054                         st->state       = TCP_SEQ_STATE_OPENREQ;
2055                         st->sbucket     = 0;
2056                         goto get_req;
2057                 }
2058                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2059         }
2060         spin_unlock_bh(&ilb->lock);
2061         st->offset = 0;
2062         if (++st->bucket < INET_LHTABLE_SIZE) {
2063                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2064                 spin_lock_bh(&ilb->lock);
2065                 sk = sk_nulls_head(&ilb->head);
2066                 goto get_sk;
2067         }
2068         cur = NULL;
2069 out:
2070         return cur;
2071 }
2072
2073 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2074 {
2075         struct tcp_iter_state *st = seq->private;
2076         void *rc;
2077
2078         st->bucket = 0;
2079         st->offset = 0;
2080         rc = listening_get_next(seq, NULL);
2081
2082         while (rc && *pos) {
2083                 rc = listening_get_next(seq, rc);
2084                 --*pos;
2085         }
2086         return rc;
2087 }
2088
2089 static inline int empty_bucket(struct tcp_iter_state *st)
2090 {
2091         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2092                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2093 }
2094
2095 /*
2096  * Get first established socket starting from bucket given in st->bucket.
2097  * If st->bucket is zero, the very first socket in the hash is returned.
2098  */
2099 static void *established_get_first(struct seq_file *seq)
2100 {
2101         struct tcp_iter_state *st = seq->private;
2102         struct net *net = seq_file_net(seq);
2103         void *rc = NULL;
2104
2105         st->offset = 0;
2106         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2107                 struct sock *sk;
2108                 struct hlist_nulls_node *node;
2109                 struct inet_timewait_sock *tw;
2110                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2111
2112                 /* Lockless fast path for the common case of empty buckets */
2113                 if (empty_bucket(st))
2114                         continue;
2115
2116                 spin_lock_bh(lock);
2117                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2118                         if (sk->sk_family != st->family ||
2119                             !net_eq(sock_net(sk), net)) {
2120                                 continue;
2121                         }
2122                         rc = sk;
2123                         goto out;
2124                 }
2125                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2126                 inet_twsk_for_each(tw, node,
2127                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2128                         if (tw->tw_family != st->family ||
2129                             !net_eq(twsk_net(tw), net)) {
2130                                 continue;
2131                         }
2132                         rc = tw;
2133                         goto out;
2134                 }
2135                 spin_unlock_bh(lock);
2136                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2137         }
2138 out:
2139         return rc;
2140 }
2141
2142 static void *established_get_next(struct seq_file *seq, void *cur)
2143 {
2144         struct sock *sk = cur;
2145         struct inet_timewait_sock *tw;
2146         struct hlist_nulls_node *node;
2147         struct tcp_iter_state *st = seq->private;
2148         struct net *net = seq_file_net(seq);
2149
2150         ++st->num;
2151         ++st->offset;
2152
2153         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2154                 tw = cur;
2155                 tw = tw_next(tw);
2156 get_tw:
2157                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2158                         tw = tw_next(tw);
2159                 }
2160                 if (tw) {
2161                         cur = tw;
2162                         goto out;
2163                 }
2164                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2165                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2166
2167                 /* Look for next non empty bucket */
2168                 st->offset = 0;
2169                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2170                                 empty_bucket(st))
2171                         ;
2172                 if (st->bucket > tcp_hashinfo.ehash_mask)
2173                         return NULL;
2174
2175                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2176                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2177         } else
2178                 sk = sk_nulls_next(sk);
2179
2180         sk_nulls_for_each_from(sk, node) {
2181                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2182                         goto found;
2183         }
2184
2185         st->state = TCP_SEQ_STATE_TIME_WAIT;
2186         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2187         goto get_tw;
2188 found:
2189         cur = sk;
2190 out:
2191         return cur;
2192 }
2193
2194 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2195 {
2196         struct tcp_iter_state *st = seq->private;
2197         void *rc;
2198
2199         st->bucket = 0;
2200         rc = established_get_first(seq);
2201
2202         while (rc && pos) {
2203                 rc = established_get_next(seq, rc);
2204                 --pos;
2205         }
2206         return rc;
2207 }
2208
2209 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2210 {
2211         void *rc;
2212         struct tcp_iter_state *st = seq->private;
2213
2214         st->state = TCP_SEQ_STATE_LISTENING;
2215         rc        = listening_get_idx(seq, &pos);
2216
2217         if (!rc) {
2218                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2219                 rc        = established_get_idx(seq, pos);
2220         }
2221
2222         return rc;
2223 }
2224
2225 static void *tcp_seek_last_pos(struct seq_file *seq)
2226 {
2227         struct tcp_iter_state *st = seq->private;
2228         int offset = st->offset;
2229         int orig_num = st->num;
2230         void *rc = NULL;
2231
2232         switch (st->state) {
2233         case TCP_SEQ_STATE_OPENREQ:
2234         case TCP_SEQ_STATE_LISTENING:
2235                 if (st->bucket >= INET_LHTABLE_SIZE)
2236                         break;
2237                 st->state = TCP_SEQ_STATE_LISTENING;
2238                 rc = listening_get_next(seq, NULL);
2239                 while (offset-- && rc)
2240                         rc = listening_get_next(seq, rc);
2241                 if (rc)
2242                         break;
2243                 st->bucket = 0;
2244                 /* Fallthrough */
2245         case TCP_SEQ_STATE_ESTABLISHED:
2246         case TCP_SEQ_STATE_TIME_WAIT:
2247                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2248                 if (st->bucket > tcp_hashinfo.ehash_mask)
2249                         break;
2250                 rc = established_get_first(seq);
2251                 while (offset-- && rc)
2252                         rc = established_get_next(seq, rc);
2253         }
2254
2255         st->num = orig_num;
2256
2257         return rc;
2258 }
2259
2260 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2261 {
2262         struct tcp_iter_state *st = seq->private;
2263         void *rc;
2264
2265         if (*pos && *pos == st->last_pos) {
2266                 rc = tcp_seek_last_pos(seq);
2267                 if (rc)
2268                         goto out;
2269         }
2270
2271         st->state = TCP_SEQ_STATE_LISTENING;
2272         st->num = 0;
2273         st->bucket = 0;
2274         st->offset = 0;
2275         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2276
2277 out:
2278         st->last_pos = *pos;
2279         return rc;
2280 }
2281
2282 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2283 {
2284         struct tcp_iter_state *st = seq->private;
2285         void *rc = NULL;
2286
2287         if (v == SEQ_START_TOKEN) {
2288                 rc = tcp_get_idx(seq, 0);
2289                 goto out;
2290         }
2291
2292         switch (st->state) {
2293         case TCP_SEQ_STATE_OPENREQ:
2294         case TCP_SEQ_STATE_LISTENING:
2295                 rc = listening_get_next(seq, v);
2296                 if (!rc) {
2297                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2298                         st->bucket = 0;
2299                         st->offset = 0;
2300                         rc        = established_get_first(seq);
2301                 }
2302                 break;
2303         case TCP_SEQ_STATE_ESTABLISHED:
2304         case TCP_SEQ_STATE_TIME_WAIT:
2305                 rc = established_get_next(seq, v);
2306                 break;
2307         }
2308 out:
2309         ++*pos;
2310         st->last_pos = *pos;
2311         return rc;
2312 }
2313
2314 static void tcp_seq_stop(struct seq_file *seq, void *v)
2315 {
2316         struct tcp_iter_state *st = seq->private;
2317
2318         switch (st->state) {
2319         case TCP_SEQ_STATE_OPENREQ:
2320                 if (v) {
2321                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2322                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2323                 }
2324         case TCP_SEQ_STATE_LISTENING:
2325                 if (v != SEQ_START_TOKEN)
2326                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2327                 break;
2328         case TCP_SEQ_STATE_TIME_WAIT:
2329         case TCP_SEQ_STATE_ESTABLISHED:
2330                 if (v)
2331                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2332                 break;
2333         }
2334 }
2335
2336 static int tcp_seq_open(struct inode *inode, struct file *file)
2337 {
2338         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2339         struct tcp_iter_state *s;
2340         int err;
2341
2342         err = seq_open_net(inode, file, &afinfo->seq_ops,
2343                           sizeof(struct tcp_iter_state));
2344         if (err < 0)
2345                 return err;
2346
2347         s = ((struct seq_file *)file->private_data)->private;
2348         s->family               = afinfo->family;
2349         s->last_pos             = 0;
2350         return 0;
2351 }
2352
2353 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2354 {
2355         int rc = 0;
2356         struct proc_dir_entry *p;
2357
2358         afinfo->seq_fops.open           = tcp_seq_open;
2359         afinfo->seq_fops.read           = seq_read;
2360         afinfo->seq_fops.llseek         = seq_lseek;
2361         afinfo->seq_fops.release        = seq_release_net;
2362
2363         afinfo->seq_ops.start           = tcp_seq_start;
2364         afinfo->seq_ops.next            = tcp_seq_next;
2365         afinfo->seq_ops.stop            = tcp_seq_stop;
2366
2367         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2368                              &afinfo->seq_fops, afinfo);
2369         if (!p)
2370                 rc = -ENOMEM;
2371         return rc;
2372 }
2373 EXPORT_SYMBOL(tcp_proc_register);
2374
2375 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2376 {
2377         proc_net_remove(net, afinfo->name);
2378 }
2379 EXPORT_SYMBOL(tcp_proc_unregister);
2380
2381 static void get_openreq4(struct sock *sk, struct request_sock *req,
2382                          struct seq_file *f, int i, int uid, int *len)
2383 {
2384         const struct inet_request_sock *ireq = inet_rsk(req);
2385         int ttd = req->expires - jiffies;
2386
2387         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2388                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2389                 i,
2390                 ireq->loc_addr,
2391                 ntohs(inet_sk(sk)->inet_sport),
2392                 ireq->rmt_addr,
2393                 ntohs(ireq->rmt_port),
2394                 TCP_SYN_RECV,
2395                 0, 0, /* could print option size, but that is af dependent. */
2396                 1,    /* timers active (only the expire timer) */
2397                 jiffies_to_clock_t(ttd),
2398                 req->retrans,
2399                 uid,
2400                 0,  /* non standard timer */
2401                 0, /* open_requests have no inode */
2402                 atomic_read(&sk->sk_refcnt),
2403                 req,
2404                 len);
2405 }
2406
2407 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2408 {
2409         int timer_active;
2410         unsigned long timer_expires;
2411         struct tcp_sock *tp = tcp_sk(sk);
2412         const struct inet_connection_sock *icsk = inet_csk(sk);
2413         struct inet_sock *inet = inet_sk(sk);
2414         __be32 dest = inet->inet_daddr;
2415         __be32 src = inet->inet_rcv_saddr;
2416         __u16 destp = ntohs(inet->inet_dport);
2417         __u16 srcp = ntohs(inet->inet_sport);
2418         int rx_queue;
2419
2420         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2421                 timer_active    = 1;
2422                 timer_expires   = icsk->icsk_timeout;
2423         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2424                 timer_active    = 4;
2425                 timer_expires   = icsk->icsk_timeout;
2426         } else if (timer_pending(&sk->sk_timer)) {
2427                 timer_active    = 2;
2428                 timer_expires   = sk->sk_timer.expires;
2429         } else {
2430                 timer_active    = 0;
2431                 timer_expires = jiffies;
2432         }
2433
2434         if (sk->sk_state == TCP_LISTEN)
2435                 rx_queue = sk->sk_ack_backlog;
2436         else
2437                 /*
2438                  * because we dont lock socket, we might find a transient negative value
2439                  */
2440                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2441
2442         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2443                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2444                 i, src, srcp, dest, destp, sk->sk_state,
2445                 tp->write_seq - tp->snd_una,
2446                 rx_queue,
2447                 timer_active,
2448                 jiffies_to_clock_t(timer_expires - jiffies),
2449                 icsk->icsk_retransmits,
2450                 sock_i_uid(sk),
2451                 icsk->icsk_probes_out,
2452                 sock_i_ino(sk),
2453                 atomic_read(&sk->sk_refcnt), sk,
2454                 jiffies_to_clock_t(icsk->icsk_rto),
2455                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2456                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2457                 tp->snd_cwnd,
2458                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2459                 len);
2460 }
2461
2462 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2463                                struct seq_file *f, int i, int *len)
2464 {
2465         __be32 dest, src;
2466         __u16 destp, srcp;
2467         int ttd = tw->tw_ttd - jiffies;
2468
2469         if (ttd < 0)
2470                 ttd = 0;
2471
2472         dest  = tw->tw_daddr;
2473         src   = tw->tw_rcv_saddr;
2474         destp = ntohs(tw->tw_dport);
2475         srcp  = ntohs(tw->tw_sport);
2476
2477         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2478                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2479                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2480                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2481                 atomic_read(&tw->tw_refcnt), tw, len);
2482 }
2483
2484 #define TMPSZ 150
2485
2486 static int tcp4_seq_show(struct seq_file *seq, void *v)
2487 {
2488         struct tcp_iter_state *st;
2489         int len;
2490
2491         if (v == SEQ_START_TOKEN) {
2492                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2493                            "  sl  local_address rem_address   st tx_queue "
2494                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2495                            "inode");
2496                 goto out;
2497         }
2498         st = seq->private;
2499
2500         switch (st->state) {
2501         case TCP_SEQ_STATE_LISTENING:
2502         case TCP_SEQ_STATE_ESTABLISHED:
2503                 get_tcp4_sock(v, seq, st->num, &len);
2504                 break;
2505         case TCP_SEQ_STATE_OPENREQ:
2506                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2507                 break;
2508         case TCP_SEQ_STATE_TIME_WAIT:
2509                 get_timewait4_sock(v, seq, st->num, &len);
2510                 break;
2511         }
2512         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2513 out:
2514         return 0;
2515 }
2516
2517 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2518         .name           = "tcp",
2519         .family         = AF_INET,
2520         .seq_fops       = {
2521                 .owner          = THIS_MODULE,
2522         },
2523         .seq_ops        = {
2524                 .show           = tcp4_seq_show,
2525         },
2526 };
2527
2528 static int __net_init tcp4_proc_init_net(struct net *net)
2529 {
2530         return tcp_proc_register(net, &tcp4_seq_afinfo);
2531 }
2532
2533 static void __net_exit tcp4_proc_exit_net(struct net *net)
2534 {
2535         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2536 }
2537
2538 static struct pernet_operations tcp4_net_ops = {
2539         .init = tcp4_proc_init_net,
2540         .exit = tcp4_proc_exit_net,
2541 };
2542
2543 int __init tcp4_proc_init(void)
2544 {
2545         return register_pernet_subsys(&tcp4_net_ops);
2546 }
2547
2548 void tcp4_proc_exit(void)
2549 {
2550         unregister_pernet_subsys(&tcp4_net_ops);
2551 }
2552 #endif /* CONFIG_PROC_FS */
2553
2554 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2555 {
2556         struct iphdr *iph = skb_gro_network_header(skb);
2557
2558         switch (skb->ip_summed) {
2559         case CHECKSUM_COMPLETE:
2560                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2561                                   skb->csum)) {
2562                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2563                         break;
2564                 }
2565
2566                 /* fall through */
2567         case CHECKSUM_NONE:
2568                 NAPI_GRO_CB(skb)->flush = 1;
2569                 return NULL;
2570         }
2571
2572         return tcp_gro_receive(head, skb);
2573 }
2574 EXPORT_SYMBOL(tcp4_gro_receive);
2575
2576 int tcp4_gro_complete(struct sk_buff *skb)
2577 {
2578         struct iphdr *iph = ip_hdr(skb);
2579         struct tcphdr *th = tcp_hdr(skb);
2580
2581         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2582                                   iph->saddr, iph->daddr, 0);
2583         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2584
2585         return tcp_gro_complete(skb);
2586 }
2587 EXPORT_SYMBOL(tcp4_gro_complete);
2588
2589 struct proto tcp_prot = {
2590         .name                   = "TCP",
2591         .owner                  = THIS_MODULE,
2592         .close                  = tcp_close,
2593         .connect                = tcp_v4_connect,
2594         .disconnect             = tcp_disconnect,
2595         .accept                 = inet_csk_accept,
2596         .ioctl                  = tcp_ioctl,
2597         .init                   = tcp_v4_init_sock,
2598         .destroy                = tcp_v4_destroy_sock,
2599         .shutdown               = tcp_shutdown,
2600         .setsockopt             = tcp_setsockopt,
2601         .getsockopt             = tcp_getsockopt,
2602         .recvmsg                = tcp_recvmsg,
2603         .sendmsg                = tcp_sendmsg,
2604         .sendpage               = tcp_sendpage,
2605         .backlog_rcv            = tcp_v4_do_rcv,
2606         .hash                   = inet_hash,
2607         .unhash                 = inet_unhash,
2608         .get_port               = inet_csk_get_port,
2609         .enter_memory_pressure  = tcp_enter_memory_pressure,
2610         .sockets_allocated      = &tcp_sockets_allocated,
2611         .orphan_count           = &tcp_orphan_count,
2612         .memory_allocated       = &tcp_memory_allocated,
2613         .memory_pressure        = &tcp_memory_pressure,
2614         .sysctl_mem             = sysctl_tcp_mem,
2615         .sysctl_wmem            = sysctl_tcp_wmem,
2616         .sysctl_rmem            = sysctl_tcp_rmem,
2617         .max_header             = MAX_TCP_HEADER,
2618         .obj_size               = sizeof(struct tcp_sock),
2619         .slab_flags             = SLAB_DESTROY_BY_RCU,
2620         .twsk_prot              = &tcp_timewait_sock_ops,
2621         .rsk_prot               = &tcp_request_sock_ops,
2622         .h.hashinfo             = &tcp_hashinfo,
2623         .no_autobind            = true,
2624 #ifdef CONFIG_COMPAT
2625         .compat_setsockopt      = compat_tcp_setsockopt,
2626         .compat_getsockopt      = compat_tcp_getsockopt,
2627 #endif
2628 };
2629 EXPORT_SYMBOL(tcp_prot);
2630
2631
2632 static int __net_init tcp_sk_init(struct net *net)
2633 {
2634         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2635                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2636 }
2637
2638 static void __net_exit tcp_sk_exit(struct net *net)
2639 {
2640         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2641 }
2642
2643 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2644 {
2645         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2646 }
2647
2648 static struct pernet_operations __net_initdata tcp_sk_ops = {
2649        .init       = tcp_sk_init,
2650        .exit       = tcp_sk_exit,
2651        .exit_batch = tcp_sk_exit_batch,
2652 };
2653
2654 void __init tcp_v4_init(void)
2655 {
2656         inet_hashinfo_init(&tcp_hashinfo);
2657         if (register_pernet_subsys(&tcp_sk_ops))
2658                 panic("Failed to create the TCP control socket.\n");
2659 }