net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63 #include <linux/slab.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75 #include <net/secure_seq.h>
  76
  77 #include <linux/inet.h>
  78 #include <linux/ipv6.h>
  79 #include <linux/stddef.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/seq_file.h>
  82
  83 #include <linux/crypto.h>
  84 #include <linux/scatterlist.h>
  85
      /*
       * Read by tcp_twsk_unique(): when that function is called with a non-NULL
       * twp, a TIME-WAIT bucket is only reused if this is non-zero and the
       * bucket's last timestamp is more than one second old.
       */
   86 int sysctl_tcp_tw_reuse __read_mostly;
      /* Not read in the visible part of this file; exported for other users. */
   87 int sysctl_tcp_low_latency __read_mostly;
   88 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  89
  90
   91 #ifdef CONFIG_TCP_MD5SIG
      /*
       * Forward declarations of the MD5 signature helpers used by the RST and
       * ACK senders further down (tcp_v4_send_reset(), tcp_v4_send_ack()).
       */
   92 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
   93                                                    __be32 addr);
   94 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
   95                                __be32 daddr, __be32 saddr, struct tcphdr *th);
   96 #else
      /*
       * Stub for kernels built without CONFIG_TCP_MD5SIG: the lookup always
       * reports "no key", which lets callers such as tcp_v4_reqsk_send_ack()
       * call it without an #ifdef of their own.
       */
   97 static inline
   98 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
   99 {
  100         return NULL;
  101 }
  102 #endif
 103
      /* Lookup state for TCP sockets; tcp_v4_err() hands it to inet_lookup(). */
  104 struct inet_hashinfo tcp_hashinfo;
  105 EXPORT_SYMBOL(tcp_hashinfo);
 106
 107 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 108 {
 109         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 110                                           ip_hdr(skb)->saddr,
 111                                           tcp_hdr(skb)->dest,
 112                                           tcp_hdr(skb)->source);
 113 }
 114
 115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 116 {
 117         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 118         struct tcp_sock *tp = tcp_sk(sk);
 119
 120         /* With PAWS, it is safe from the viewpoint
 121            of data integrity. Even without PAWS it is safe provided sequence
 122            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 123
 124            Actually, the idea is close to VJ's one, only timestamp cache is
 125            held not per host, but per port pair and TW bucket is used as state
 126            holder.
 127
 128            If TW bucket has been already destroyed we fall back to VJ's scheme
 129            and use initial timestamp retrieved from peer table.
 130          */
 131         if (tcptw->tw_ts_recent_stamp &&
 132             (twp == NULL || (sysctl_tcp_tw_reuse &&
 133                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 134                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 135                 if (tp->write_seq == 0)
 136                         tp->write_seq = 1;
 137                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 138                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 139                 sock_hold(sktw);
 140                 return 1;
 141         }
 142
 143         return 0;
 144 }
 145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 146
  147 /* This will initiate an outgoing connection.
       *
       * @sk:       socket to connect
       * @uaddr:    destination, interpreted as a struct sockaddr_in
       * @addr_len: length of @uaddr as supplied by the caller
       *
       * Returns 0 on success.  Fails with -EINVAL if @addr_len is shorter than
       * a sockaddr_in or if a source route option is set while the destination
       * address is zero, with -EAFNOSUPPORT if the family is not AF_INET, with
       * -ENETUNREACH if the route is multicast or broadcast, and otherwise with
       * whatever the route lookup, inet_hash_connect(), ip_route_newports() or
       * tcp_connect() returned.
       */
  148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
  149 {
  150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
  151         struct inet_sock *inet = inet_sk(sk);
  152         struct tcp_sock *tp = tcp_sk(sk);
  153         __be16 orig_sport, orig_dport;
  154         __be32 daddr, nexthop;
  155         struct flowi4 *fl4;
  156         struct rtable *rt;
  157         int err;
  158         struct ip_options_rcu *inet_opt;
  159
  160         if (addr_len < sizeof(struct sockaddr_in))
  161                 return -EINVAL;
  162
  163         if (usin->sin_family != AF_INET)
  164                 return -EAFNOSUPPORT;
  165
  166         nexthop = daddr = usin->sin_addr.s_addr;
  167         inet_opt = rcu_dereference_protected(inet->inet_opt,
  168                                              sock_owned_by_user(sk));
              /* With a source route option (srr) set on the socket, the route
               * lookup targets the option's faddr instead of the address the
               * user supplied.
               */
  169         if (inet_opt && inet_opt->opt.srr) {
  170                 if (!daddr)
  171                         return -EINVAL;
  172                 nexthop = inet_opt->opt.faddr;
  173         }
  174
  175         orig_sport = inet->inet_sport;
  176         orig_dport = usin->sin_port;
  177         fl4 = &inet->cork.fl.u.ip4;
  178         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
  179                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
  180                               IPPROTO_TCP,
  181                               orig_sport, orig_dport, sk, true);
  182         if (IS_ERR(rt)) {
  183                 err = PTR_ERR(rt);
  184                 if (err == -ENETUNREACH)
  185                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
  186                 return err;
  187         }
  188
              /* TCP connections to multicast/broadcast routes are refused. */
  189         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
  190                 ip_rt_put(rt);
  191                 return -ENETUNREACH;
  192         }
  193
              /* No source route: take the destination from the flow that was
               * used for the route lookup.
               */
  194         if (!inet_opt || !inet_opt->opt.srr)
  195                 daddr = fl4->daddr;
  196
              /* Adopt the flow's source address if the socket has none yet. */
  197         if (!inet->inet_saddr)
  198                 inet->inet_saddr = fl4->saddr;
  199         inet->inet_rcv_saddr = inet->inet_saddr;
  200
              /* Timestamp/sequence state left on the socket belongs to a
               * different destination than the one being connected to now.
               */
  201         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
  202                 /* Reset inherited state */
  203                 tp->rx_opt.ts_recent       = 0;
  204                 tp->rx_opt.ts_recent_stamp = 0;
  205                 tp->write_seq              = 0;
  206         }
  207
  208         if (tcp_death_row.sysctl_tw_recycle &&
  209             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
  210                 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
  211                 /*
  212                  * VJ's idea. We save last timestamp seen from
  213                  * the destination in peer table, when entering state
  214                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
  215                  * when trying new connection.
  216                  */
  217                 if (peer) {
  218                         inet_peer_refcheck(peer);
                              /* Only use the cached timestamp while it is no
                               * older than TCP_PAWS_MSL seconds.
                               */
  219                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
  220                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
  221                                 tp->rx_opt.ts_recent = peer->tcp_ts;
  222                         }
  223                 }
  224         }
  225
  226         inet->inet_dport = usin->sin_port;
  227         inet->inet_daddr = daddr;
  228
              /* Account for the length of the socket's IP options, if any. */
  229         inet_csk(sk)->icsk_ext_hdr_len = 0;
  230         if (inet_opt)
  231                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
  232
  233         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
  234
  235         /* Socket identity is still unknown (sport may be zero).
  236          * However we set state to SYN-SENT and not releasing socket
  237          * lock select source port, enter ourselves into the hash tables and
  238          * complete initialization after this.
  239          */
  240         tcp_set_state(sk, TCP_SYN_SENT);
  241         err = inet_hash_connect(&tcp_death_row, sk);
  242         if (err)
  243                 goto failure;
  244
              /* Redo the route with the ports as they are now; inet_sport may
               * differ from orig_sport after inet_hash_connect().
               */
  245         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
  246                                inet->inet_sport, inet->inet_dport, sk);
  247         if (IS_ERR(rt)) {
  248                 err = PTR_ERR(rt);
                      /* Nothing left for the failure path to release. */
  249                 rt = NULL;
  250                 goto failure;
  251         }
  252         /* OK, now commit destination to socket.  */
  253         sk->sk_gso_type = SKB_GSO_TCPV4;
  254         sk_setup_caps(sk, &rt->dst);
  255
              /* write_seq is non-zero if it was seeded earlier (for instance
               * tcp_twsk_unique() in this file sets it); only generate a
               * fresh initial sequence number otherwise.
               */
  256         if (!tp->write_seq)
  257                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
  258                                                            inet->inet_daddr,
  259                                                            inet->inet_sport,
  260                                                            usin->sin_port);
  261
  262         inet->inet_id = tp->write_seq ^ jiffies;
  263
  264         err = tcp_connect(sk);
              /* The route was handed to sk_setup_caps() above (presumably it
               * is now owned by the socket - TODO confirm), so make sure the
               * failure path's ip_rt_put() does not drop it.
               */
  265         rt = NULL;
  266         if (err)
  267                 goto failure;
  268
  269         return 0;
  270
  271 failure:
  272         /*
  273          * This unhashes the socket and releases the local port,
  274          * if necessary.
  275          */
  276         tcp_set_state(sk, TCP_CLOSE);
  277         ip_rt_put(rt);
  278         sk->sk_route_caps = 0;
  279         inet->inet_dport = 0;
  280         return err;
  281 }
  282 EXPORT_SYMBOL(tcp_v4_connect);
 283
 284 /*
 285  * This routine does path mtu discovery as defined in RFC1191.
 286  */
 287 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 288 {
 289         struct dst_entry *dst;
 290         struct inet_sock *inet = inet_sk(sk);
 291
 292         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 293          * send out by Linux are always <576bytes so they should go through
 294          * unfragmented).
 295          */
 296         if (sk->sk_state == TCP_LISTEN)
 297                 return;
 298
 299         /* We don't check in the destentry if pmtu discovery is forbidden
 300          * on this route. We just assume that no packet_to_big packets
 301          * are send back when pmtu discovery is not active.
 302          * There is a small race when the user changes this flag in the
 303          * route, but I think that's acceptable.
 304          */
 305         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 306                 return;
 307
 308         dst->ops->update_pmtu(dst, mtu);
 309
 310         /* Something is about to be wrong... Remember soft error
 311          * for the case, if this connection will not able to recover.
 312          */
 313         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 314                 sk->sk_err_soft = EMSGSIZE;
 315
 316         mtu = dst_mtu(dst);
 317
 318         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 319             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 320                 tcp_sync_mss(sk, mtu);
 321
 322                 /* Resend the TCP packet because it's
 323                  * clear that the old packet has been
 324                  * dropped. This is the new "fast" path mtu
 325                  * discovery.
 326                  */
 327                 tcp_simple_retransmit(sk);
 328         } /* else let the usual retransmit timer handle it */
 329 }
 330
  331 /*
  332  * This routine is called by the ICMP module when it gets some
  333  * sort of error condition.  If err < 0 then the socket should
  334  * be closed and the error returned to the user.  If err > 0
  335  * it's just the icmp type << 8 | icmp code.  After adjustment
  336  * header points to the first 8 bytes of the tcp header.  We need
  337  * to find the appropriate port.
  338  *
  339  * The locking strategy used here is very "optimistic". When
  340  * someone else accesses the socket the ICMP is just dropped
  341  * and for some paths there is no check at all.
  342  * A more general error queue to queue errors for later handling
  343  * is probably better.
  344  *
       * NOTE(review): the "err < 0" / "err > 0" wording above does not match
       * the signature below.  The ICMP type and code are read from the ICMP
       * header of @icmp_skb, and @info is only used as the MTU handed to
       * do_pmtu_discovery() for ICMP_FRAG_NEEDED.
       *
       * @icmp_skb: the ICMP error; its data starts with the IP header of the
       *            quoted segment, followed by the start of its TCP header.
       * @info:     value passed on to do_pmtu_discovery() as the new MTU.
  345  */
  346
  347 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  348 {
  349         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
  350         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
  351         struct inet_connection_sock *icsk;
  352         struct tcp_sock *tp;
  353         struct inet_sock *inet;
  354         const int type = icmp_hdr(icmp_skb)->type;
  355         const int code = icmp_hdr(icmp_skb)->code;
  356         struct sock *sk;
  357         struct sk_buff *skb;
  358         __u32 seq;
  359         __u32 remaining;
  360         int err;
  361         struct net *net = dev_net(icmp_skb->dev);
  362
              /* The quoted IP header must be followed by at least 8 bytes of
               * TCP header: only the ports and the sequence number are read.
               */
  363         if (icmp_skb->len < (iph->ihl << 2) + 8) {
  364                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
  365                 return;
  366         }
  367
              /* Find the socket that sent the quoted segment. */
  368         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
  369                         iph->saddr, th->source, inet_iif(icmp_skb));
  370         if (!sk) {
  371                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
  372                 return;
  373         }
              /* Nothing is reported to a TIME-WAIT socket; drop the reference. */
  374         if (sk->sk_state == TCP_TIME_WAIT) {
  375                 inet_twsk_put(inet_twsk(sk));
  376                 return;
  377         }
  378
  379         bh_lock_sock(sk);
  380         /* If too many ICMPs get dropped on busy
  381          * servers this needs to be solved differently.
  382          */
              /* Only counted here; each path below re-checks ownership before
               * it changes socket state.
               */
  383         if (sock_owned_by_user(sk))
  384                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
  385
  386         if (sk->sk_state == TCP_CLOSE)
  387                 goto out;
  388
  389         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
  390                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
  391                 goto out;
  392         }
  393
  394         icsk = inet_csk(sk);
  395         tp = tcp_sk(sk);
  396         seq = ntohl(th->seq);
              /* Except for listeners, the quoted sequence number has to lie
               * between snd_una and snd_nxt.
               */
  397         if (sk->sk_state != TCP_LISTEN &&
  398             !between(seq, tp->snd_una, tp->snd_nxt)) {
  399                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
  400                 goto out;
  401         }
  402
              /* Translate the ICMP type/code into an errno value in err; every
               * case either sets err and breaks or jumps to out.
               */
  403         switch (type) {
  404         case ICMP_SOURCE_QUENCH:
  405                 /* Just silently ignore these. */
  406                 goto out;
  407         case ICMP_PARAMETERPROB:
  408                 err = EPROTO;
  409                 break;
  410         case ICMP_DEST_UNREACH:
                      /* code indexes icmp_err_convert[] below. */
  411                 if (code > NR_ICMP_UNREACH)
  412                         goto out;
  413
  414                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
  415                         if (!sock_owned_by_user(sk))
  416                                 do_pmtu_discovery(sk, iph, info);
  417                         goto out;
  418                 }
  419
  420                 err = icmp_err_convert[code].errno;
  421                 /* check if icmp_skb allows revert of backoff
  422                  * (see draft-zimmermann-tcp-lcd) */
  423                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
  424                         break;
  425                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
  426                     !icsk->icsk_backoff)
  427                         break;
  428
  429                 if (sock_owned_by_user(sk))
  430                         break;
  431
                      /* Undo one backoff step and recompute the RTO, using
                       * TCP_TIMEOUT_INIT as the base when tp->srtt is zero.
                       */
  432                 icsk->icsk_backoff--;
  433                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
  434                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
  435                 tcp_bound_rto(sk);
  436
  437                 skb = tcp_write_queue_head(sk);
  438                 BUG_ON(!skb);
  439
                      /* Part of the new RTO not yet elapsed since the "when"
                       * stamp of the head of the write queue (presumably its
                       * last transmit time - TODO confirm); min() clamps the
                       * result at zero.
                       */
  440                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
  441                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
  442
  443                 if (remaining) {
  444                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
  445                                                   remaining, TCP_RTO_MAX);
  446                 } else {
  447                         /* RTO revert clocked out retransmission.
  448                          * Will retransmit now */
  449                         tcp_retransmit_timer(sk);
  450                 }
  451
  452                 break;
  453         case ICMP_TIME_EXCEEDED:
  454                 err = EHOSTUNREACH;
  455                 break;
  456         default:
  457                 goto out;
  458         }
  459
              /* Deliver err according to the socket state.  States without a
               * case label fall out of the switch to the code after it.
               */
  460         switch (sk->sk_state) {
                      /* Declarations only: nothing placed before the first
                       * case label is executed.
                       */
  461                 struct request_sock *req, **prev;
  462         case TCP_LISTEN:
  463                 if (sock_owned_by_user(sk))
  464                         goto out;
  465
                      /* Look for the pending request the quoted segment
                       * belongs to.
                       */
  466                 req = inet_csk_search_req(sk, &prev, th->dest,
  467                                           iph->daddr, iph->saddr);
  468                 if (!req)
  469                         goto out;
  470
  471                 /* ICMPs are not backlogged, hence we cannot get
  472                    an established socket here.
  473                  */
  474                 WARN_ON(req->sk);
  475
  476                 if (seq != tcp_rsk(req)->snt_isn) {
  477                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
  478                         goto out;
  479                 }
  480
  481                 /*
  482                  * Still in SYN_RECV, just remove it silently.
  483                  * There is no good way to pass the error to the newly
  484                  * created socket, and POSIX does not want network
  485                  * errors returned from accept().
  486                  */
  487                 inet_csk_reqsk_queue_drop(sk, req, prev);
  488                 goto out;
  489
  490         case TCP_SYN_SENT:
  491         case TCP_SYN_RECV:  /* Cannot happen.
  492                                It can f.e. if SYNs crossed.
  493                              */
                      /* Hard error while connecting: report it and finish the
                       * socket, unless it is owned by the user, in which case
                       * only a soft error is recorded.
                       */
  494                 if (!sock_owned_by_user(sk)) {
  495                         sk->sk_err = err;
  496
  497                         sk->sk_error_report(sk);
  498
  499                         tcp_done(sk);
  500                 } else {
  501                         sk->sk_err_soft = err;
  502                 }
  503                 goto out;
  504         }
  505
  506         /* If we've already connected we will keep trying
  507          * until we time out, or the user gives up.
  508          *
  509          * rfc1122 4.2.3.9 allows to consider as hard errors
  510          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
  511          * but it is obsoleted by pmtu discovery).
  512          *
  513          * Note, that in modern internet, where routing is unreliable
  514          * and in each dark corner broken firewalls sit, sending random
  515          * errors ordered by their masters even this two messages finally lose
  516          * their original sense (even Linux sends invalid PORT_UNREACHs)
  517          *
  518          * Now we are in compliance with RFCs.
  519          *                                                      --ANK (980905)
  520          */
  521
              /* The error is reported right away only if the socket asked for
               * it (recverr) and is not owned by the user.
               */
  522         inet = inet_sk(sk);
  523         if (!sock_owned_by_user(sk) && inet->recverr) {
  524                 sk->sk_err = err;
  525                 sk->sk_error_report(sk);
  526         } else  { /* Only an error on timeout */
  527                 sk->sk_err_soft = err;
  528         }
  529
  530 out:
              /* Release the lock and the reference taken by inet_lookup(). */
  531         bh_unlock_sock(sk);
  532         sock_put(sk);
  533 }
 534
 535 static void __tcp_v4_send_check(struct sk_buff *skb,
 536                                 __be32 saddr, __be32 daddr)
 537 {
 538         struct tcphdr *th = tcp_hdr(skb);
 539
 540         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 541                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 542                 skb->csum_start = skb_transport_header(skb) - skb->head;
 543                 skb->csum_offset = offsetof(struct tcphdr, check);
 544         } else {
 545                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 546                                          csum_partial(th,
 547                                                       th->doff << 2,
 548                                                       skb->csum));
 549         }
 550 }
 551
 552 /* This routine computes an IPv4 TCP checksum. */
 553 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 554 {
 555         struct inet_sock *inet = inet_sk(sk);
 556
 557         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 558 }
 559 EXPORT_SYMBOL(tcp_v4_send_check);
 560
 561 int tcp_v4_gso_send_check(struct sk_buff *skb)
 562 {
 563         const struct iphdr *iph;
 564         struct tcphdr *th;
 565
 566         if (!pskb_may_pull(skb, sizeof(*th)))
 567                 return -EINVAL;
 568
 569         iph = ip_hdr(skb);
 570         th = tcp_hdr(skb);
 571
 572         th->check = 0;
 573         skb->ip_summed = CHECKSUM_PARTIAL;
 574         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 575         return 0;
 576 }
 577
 578 /*
 579  *      This routine will send an RST to the other tcp.
 580  *
 581  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 582  *                    for reset.
 583  *      Answer: if a packet caused RST, it is not for a socket
 584  *              existing in our system, if it is matched to a socket,
 585  *              it is just duplicate segment or bug in other side's TCP.
 586  *              So that we build reply only basing on parameters
 587  *              arrived with segment.
 588  *      Exception: precedence violation. We do not implement it in any case.
 589  */
 590
 591 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 592 {
 593         struct tcphdr *th = tcp_hdr(skb);
 594         struct {
 595                 struct tcphdr th;
 596 #ifdef CONFIG_TCP_MD5SIG
 597                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 598 #endif
 599         } rep;
 600         struct ip_reply_arg arg;
 601 #ifdef CONFIG_TCP_MD5SIG
 602         struct tcp_md5sig_key *key;
 603 #endif
 604         struct net *net;
 605
 606         /* Never send a reset in response to a reset. */
 607         if (th->rst)
 608                 return;
 609
 610         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 611                 return;
 612
 613         /* Swap the send and the receive. */
 614         memset(&rep, 0, sizeof(rep));
 615         rep.th.dest   = th->source;
 616         rep.th.source = th->dest;
 617         rep.th.doff   = sizeof(struct tcphdr) / 4;
 618         rep.th.rst    = 1;
 619
 620         if (th->ack) {
 621                 rep.th.seq = th->ack_seq;
 622         } else {
 623                 rep.th.ack = 1;
 624                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 625                                        skb->len - (th->doff << 2));
 626         }
 627
 628         memset(&arg, 0, sizeof(arg));
 629         arg.iov[0].iov_base = (unsigned char *)&rep;
 630         arg.iov[0].iov_len  = sizeof(rep.th);
 631
 632 #ifdef CONFIG_TCP_MD5SIG
 633         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 634         if (key) {
 635                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 636                                    (TCPOPT_NOP << 16) |
 637                                    (TCPOPT_MD5SIG << 8) |
 638                                    TCPOLEN_MD5SIG);
 639                 /* Update length and the length the header thinks exists */
 640                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 641                 rep.th.doff = arg.iov[0].iov_len / 4;
 642
 643                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 644                                      key, ip_hdr(skb)->saddr,
 645                                      ip_hdr(skb)->daddr, &rep.th);
 646         }
 647 #endif
 648         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 649                                       ip_hdr(skb)->saddr, /* XXX */
 650                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 651         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 652         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 653
 654         net = dev_net(skb_dst(skb)->dev);
 655         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 656                       &arg, arg.iov[0].iov_len);
 657
 658         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 659         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 660 }
 661
 662 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 663    outside socket context is ugly, certainly. What can I do?
 664  */
 665
 666 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 667                             u32 win, u32 ts, int oif,
 668                             struct tcp_md5sig_key *key,
 669                             int reply_flags)
 670 {
 671         struct tcphdr *th = tcp_hdr(skb);
 672         struct {
 673                 struct tcphdr th;
 674                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 675 #ifdef CONFIG_TCP_MD5SIG
 676                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 677 #endif
 678                         ];
 679         } rep;
 680         struct ip_reply_arg arg;
 681         struct net *net = dev_net(skb_dst(skb)->dev);
 682
 683         memset(&rep.th, 0, sizeof(struct tcphdr));
 684         memset(&arg, 0, sizeof(arg));
 685
 686         arg.iov[0].iov_base = (unsigned char *)&rep;
 687         arg.iov[0].iov_len  = sizeof(rep.th);
 688         if (ts) {
 689                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 690                                    (TCPOPT_TIMESTAMP << 8) |
 691                                    TCPOLEN_TIMESTAMP);
 692                 rep.opt[1] = htonl(tcp_time_stamp);
 693                 rep.opt[2] = htonl(ts);
 694                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 695         }
 696
 697         /* Swap the send and the receive. */
 698         rep.th.dest    = th->source;
 699         rep.th.source  = th->dest;
 700         rep.th.doff    = arg.iov[0].iov_len / 4;
 701         rep.th.seq     = htonl(seq);
 702         rep.th.ack_seq = htonl(ack);
 703         rep.th.ack     = 1;
 704         rep.th.window  = htons(win);
 705
 706 #ifdef CONFIG_TCP_MD5SIG
 707         if (key) {
 708                 int offset = (ts) ? 3 : 0;
 709
 710                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 711                                           (TCPOPT_NOP << 16) |
 712                                           (TCPOPT_MD5SIG << 8) |
 713                                           TCPOLEN_MD5SIG);
 714                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 715                 rep.th.doff = arg.iov[0].iov_len/4;
 716
 717                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 718                                     key, ip_hdr(skb)->saddr,
 719                                     ip_hdr(skb)->daddr, &rep.th);
 720         }
 721 #endif
 722         arg.flags = reply_flags;
 723         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 724                                       ip_hdr(skb)->saddr, /* XXX */
 725                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 726         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 727         if (oif)
 728                 arg.bound_dev_if = oif;
 729
 730         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 731                       &arg, arg.iov[0].iov_len);
 732
 733         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 734 }
 735
 736 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 737 {
 738         struct inet_timewait_sock *tw = inet_twsk(sk);
 739         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 740
 741         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 742                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 743                         tcptw->tw_ts_recent,
 744                         tw->tw_bound_dev_if,
 745                         tcp_twsk_md5_key(tcptw),
 746                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 747                         );
 748
 749         inet_twsk_put(tw);
 750 }
 751
 752 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 753                                   struct request_sock *req)
 754 {
 755         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 756                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 757                         req->ts_recent,
 758                         0,
 759                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 760                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 761 }
 762
 763 /*
 764  *      Send a SYN-ACK after having received a SYN.
 765  *      This still operates on a request_sock only, not on a big
 766  *      socket.
 767  */
 768 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 769                               struct request_sock *req,
 770                               struct request_values *rvp)
 771 {
 772         const struct inet_request_sock *ireq = inet_rsk(req);
 773         struct flowi4 fl4;
 774         int err = -1;
 775         struct sk_buff * skb;
 776
 777         /* First, grab a route. */
 778         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 779                 return -1;
 780
 781         skb = tcp_make_synack(sk, dst, req, rvp);
 782
 783         if (skb) {
 784                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 785
 786                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 787                                             ireq->rmt_addr,
 788                                             ireq->opt);
 789                 err = net_xmit_eval(err);
 790         }
 791
 792         dst_release(dst);
 793         return err;
 794 }
 795
 796 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 797                               struct request_values *rvp)
 798 {
 799         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 800         return tcp_v4_send_synack(sk, NULL, req, rvp);
 801 }
 802
 803 /*
 804  *      IPv4 request_sock destructor.
 805  */
 806 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 807 {
 808         kfree(inet_rsk(req)->opt);
 809 }
 810
 811 /*
 812  * Return 1 if a syncookie should be sent
 813  */
 814 int tcp_syn_flood_action(struct sock *sk,
 815                          const struct sk_buff *skb,
 816                          const char *proto)
 817 {
 818         const char *msg = "Dropping request";
 819         int want_cookie = 0;
 820         struct listen_sock *lopt;
 821
 822
 823
 824 #ifdef CONFIG_SYN_COOKIES
 825         if (sysctl_tcp_syncookies) {
 826                 msg = "Sending cookies";
 827                 want_cookie = 1;
 828                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 829         } else
 830 #endif
 831                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 832
 833         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 834         if (!lopt->synflood_warned) {
 835                 lopt->synflood_warned = 1;
 836                 pr_info("%s: Possible SYN flooding on port %d. %s. "
 837                         " Check SNMP counters.\n",
 838                         proto, ntohs(tcp_hdr(skb)->dest), msg);
 839         }
 840         return want_cookie;
 841 }
 842 EXPORT_SYMBOL(tcp_syn_flood_action);
 843
 844 /*
 845  * Save and compile IPv4 options into the request_sock if needed.
 846  */
 847 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 848                                                   struct sk_buff *skb)
 849 {
 850         const struct ip_options *opt = &(IPCB(skb)->opt);
 851         struct ip_options_rcu *dopt = NULL;
 852
 853         if (opt && opt->optlen) {
 854                 int opt_size = sizeof(*dopt) + opt->optlen;
 855
 856                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 857                 if (dopt) {
 858                         if (ip_options_echo(&dopt->opt, skb)) {
 859                                 kfree(dopt);
 860                                 dopt = NULL;
 861                         }
 862                 }
 863         }
 864         return dopt;
 865 }
 866
 867 #ifdef CONFIG_TCP_MD5SIG
 868 /*
 869  * RFC2385 MD5 checksumming requires a mapping of
 870  * IP address->MD5 Key.
 871  * We need to maintain these in the sk structure.
 872  */
 873
 874 /* Find the Key structure for an address.  */
 875 static struct tcp_md5sig_key *
 876                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 877 {
 878         struct tcp_sock *tp = tcp_sk(sk);
 879         int i;
 880
 881         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 882                 return NULL;
 883         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 884                 if (tp->md5sig_info->keys4[i].addr == addr)
 885                         return &tp->md5sig_info->keys4[i].base;
 886         }
 887         return NULL;
 888 }
 889
 890 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 891                                          struct sock *addr_sk)
 892 {
 893         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 894 }
 895 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 896
 897 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 898                                                       struct request_sock *req)
 899 {
 900         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 901 }
 902
 903 /* This can be called on a newly created socket, from other files */
 904 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 905                       u8 *newkey, u8 newkeylen)
 906 {
 907         /* Add Key to the list */
 908         struct tcp_md5sig_key *key;
 909         struct tcp_sock *tp = tcp_sk(sk);
 910         struct tcp4_md5sig_key *keys;
 911
 912         key = tcp_v4_md5_do_lookup(sk, addr);
 913         if (key) {
 914                 /* Pre-existing entry - just update that one. */
 915                 kfree(key->key);
 916                 key->key = newkey;
 917                 key->keylen = newkeylen;
 918         } else {
 919                 struct tcp_md5sig_info *md5sig;
 920
 921                 if (!tp->md5sig_info) {
 922                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 923                                                   GFP_ATOMIC);
 924                         if (!tp->md5sig_info) {
 925                                 kfree(newkey);
 926                                 return -ENOMEM;
 927                         }
 928                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 929                 }
 930                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 931                         kfree(newkey);
 932                         return -ENOMEM;
 933                 }
 934                 md5sig = tp->md5sig_info;
 935
 936                 if (md5sig->alloced4 == md5sig->entries4) {
 937                         keys = kmalloc((sizeof(*keys) *
 938                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 939                         if (!keys) {
 940                                 kfree(newkey);
 941                                 tcp_free_md5sig_pool();
 942                                 return -ENOMEM;
 943                         }
 944
 945                         if (md5sig->entries4)
 946                                 memcpy(keys, md5sig->keys4,
 947                                        sizeof(*keys) * md5sig->entries4);
 948
 949                         /* Free old key list, and reference new one */
 950                         kfree(md5sig->keys4);
 951                         md5sig->keys4 = keys;
 952                         md5sig->alloced4++;
 953                 }
 954                 md5sig->entries4++;
 955                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 956                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 957                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 958         }
 959         return 0;
 960 }
 961 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 962
 963 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 964                                u8 *newkey, u8 newkeylen)
 965 {
 966         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 967                                  newkey, newkeylen);
 968 }
 969
 970 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 971 {
 972         struct tcp_sock *tp = tcp_sk(sk);
 973         int i;
 974
 975         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 976                 if (tp->md5sig_info->keys4[i].addr == addr) {
 977                         /* Free the key */
 978                         kfree(tp->md5sig_info->keys4[i].base.key);
 979                         tp->md5sig_info->entries4--;
 980
 981                         if (tp->md5sig_info->entries4 == 0) {
 982                                 kfree(tp->md5sig_info->keys4);
 983                                 tp->md5sig_info->keys4 = NULL;
 984                                 tp->md5sig_info->alloced4 = 0;
 985                         } else if (tp->md5sig_info->entries4 != i) {
 986                                 /* Need to do some manipulation */
 987                                 memmove(&tp->md5sig_info->keys4[i],
 988                                         &tp->md5sig_info->keys4[i+1],
 989                                         (tp->md5sig_info->entries4 - i) *
 990                                          sizeof(struct tcp4_md5sig_key));
 991                         }
 992                         tcp_free_md5sig_pool();
 993                         return 0;
 994                 }
 995         }
 996         return -ENOENT;
 997 }
 998 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 999
1000 static void tcp_v4_clear_md5_list(struct sock *sk)
1001 {
1002         struct tcp_sock *tp = tcp_sk(sk);
1003
1004         /* Free each key, then the set of key keys,
1005          * the crypto element, and then decrement our
1006          * hold on the last resort crypto.
1007          */
1008         if (tp->md5sig_info->entries4) {
1009                 int i;
1010                 for (i = 0; i < tp->md5sig_info->entries4; i++)
1011                         kfree(tp->md5sig_info->keys4[i].base.key);
1012                 tp->md5sig_info->entries4 = 0;
1013                 tcp_free_md5sig_pool();
1014         }
1015         if (tp->md5sig_info->keys4) {
1016                 kfree(tp->md5sig_info->keys4);
1017                 tp->md5sig_info->keys4 = NULL;
1018                 tp->md5sig_info->alloced4  = 0;
1019         }
1020 }
1021
1022 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1023                                  int optlen)
1024 {
1025         struct tcp_md5sig cmd;
1026         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1027         u8 *newkey;
1028
1029         if (optlen < sizeof(cmd))
1030                 return -EINVAL;
1031
1032         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1033                 return -EFAULT;
1034
1035         if (sin->sin_family != AF_INET)
1036                 return -EINVAL;
1037
1038         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1039                 if (!tcp_sk(sk)->md5sig_info)
1040                         return -ENOENT;
1041                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1042         }
1043
1044         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1045                 return -EINVAL;
1046
1047         if (!tcp_sk(sk)->md5sig_info) {
1048                 struct tcp_sock *tp = tcp_sk(sk);
1049                 struct tcp_md5sig_info *p;
1050
1051                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1052                 if (!p)
1053                         return -EINVAL;
1054
1055                 tp->md5sig_info = p;
1056                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1057         }
1058
1059         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1060         if (!newkey)
1061                 return -ENOMEM;
1062         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1063                                  newkey, cmd.tcpm_keylen);
1064 }
1065
1066 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1067                                         __be32 daddr, __be32 saddr, int nbytes)
1068 {
1069         struct tcp4_pseudohdr *bp;
1070         struct scatterlist sg;
1071
1072         bp = &hp->md5_blk.ip4;
1073
1074         /*
1075          * 1. the TCP pseudo-header (in the order: source IP address,
1076          * destination IP address, zero-padded protocol number, and
1077          * segment length)
1078          */
1079         bp->saddr = saddr;
1080         bp->daddr = daddr;
1081         bp->pad = 0;
1082         bp->protocol = IPPROTO_TCP;
1083         bp->len = cpu_to_be16(nbytes);
1084
1085         sg_init_one(&sg, bp, sizeof(*bp));
1086         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1087 }
1088
1089 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1090                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1091 {
1092         struct tcp_md5sig_pool *hp;
1093         struct hash_desc *desc;
1094
1095         hp = tcp_get_md5sig_pool();
1096         if (!hp)
1097                 goto clear_hash_noput;
1098         desc = &hp->md5_desc;
1099
1100         if (crypto_hash_init(desc))
1101                 goto clear_hash;
1102         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1103                 goto clear_hash;
1104         if (tcp_md5_hash_header(hp, th))
1105                 goto clear_hash;
1106         if (tcp_md5_hash_key(hp, key))
1107                 goto clear_hash;
1108         if (crypto_hash_final(desc, md5_hash))
1109                 goto clear_hash;
1110
1111         tcp_put_md5sig_pool();
1112         return 0;
1113
1114 clear_hash:
1115         tcp_put_md5sig_pool();
1116 clear_hash_noput:
1117         memset(md5_hash, 0, 16);
1118         return 1;
1119 }
1120
1121 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1122                         struct sock *sk, struct request_sock *req,
1123                         struct sk_buff *skb)
1124 {
1125         struct tcp_md5sig_pool *hp;
1126         struct hash_desc *desc;
1127         struct tcphdr *th = tcp_hdr(skb);
1128         __be32 saddr, daddr;
1129
1130         if (sk) {
1131                 saddr = inet_sk(sk)->inet_saddr;
1132                 daddr = inet_sk(sk)->inet_daddr;
1133         } else if (req) {
1134                 saddr = inet_rsk(req)->loc_addr;
1135                 daddr = inet_rsk(req)->rmt_addr;
1136         } else {
1137                 const struct iphdr *iph = ip_hdr(skb);
1138                 saddr = iph->saddr;
1139                 daddr = iph->daddr;
1140         }
1141
1142         hp = tcp_get_md5sig_pool();
1143         if (!hp)
1144                 goto clear_hash_noput;
1145         desc = &hp->md5_desc;
1146
1147         if (crypto_hash_init(desc))
1148                 goto clear_hash;
1149
1150         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1151                 goto clear_hash;
1152         if (tcp_md5_hash_header(hp, th))
1153                 goto clear_hash;
1154         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1155                 goto clear_hash;
1156         if (tcp_md5_hash_key(hp, key))
1157                 goto clear_hash;
1158         if (crypto_hash_final(desc, md5_hash))
1159                 goto clear_hash;
1160
1161         tcp_put_md5sig_pool();
1162         return 0;
1163
1164 clear_hash:
1165         tcp_put_md5sig_pool();
1166 clear_hash_noput:
1167         memset(md5_hash, 0, 16);
1168         return 1;
1169 }
1170 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1171
1172 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1173 {
1174         /*
1175          * This gets called for each TCP segment that arrives
1176          * so we want to be efficient.
1177          * We have 3 drop cases:
1178          * o No MD5 hash and one expected.
1179          * o MD5 hash and we're not expecting one.
1180          * o MD5 hash and its wrong.
1181          */
1182         __u8 *hash_location = NULL;
1183         struct tcp_md5sig_key *hash_expected;
1184         const struct iphdr *iph = ip_hdr(skb);
1185         struct tcphdr *th = tcp_hdr(skb);
1186         int genhash;
1187         unsigned char newhash[16];
1188
1189         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1190         hash_location = tcp_parse_md5sig_option(th);
1191
1192         /* We've parsed the options - do we have a hash? */
1193         if (!hash_expected && !hash_location)
1194                 return 0;
1195
1196         if (hash_expected && !hash_location) {
1197                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1198                 return 1;
1199         }
1200
1201         if (!hash_expected && hash_location) {
1202                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1203                 return 1;
1204         }
1205
1206         /* Okay, so this is hash_expected and hash_location -
1207          * so we need to calculate the checksum.
1208          */
1209         genhash = tcp_v4_md5_hash_skb(newhash,
1210                                       hash_expected,
1211                                       NULL, NULL, skb);
1212
1213         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1214                 if (net_ratelimit()) {
1215                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1216                                &iph->saddr, ntohs(th->source),
1217                                &iph->daddr, ntohs(th->dest),
1218                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1219                 }
1220                 return 1;
1221         }
1222         return 0;
1223 }
1224
1225 #endif
1226
1227 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1228         .family         =       PF_INET,
1229         .obj_size       =       sizeof(struct tcp_request_sock),
1230         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1231         .send_ack       =       tcp_v4_reqsk_send_ack,
1232         .destructor     =       tcp_v4_reqsk_destructor,
1233         .send_reset     =       tcp_v4_send_reset,
1234         .syn_ack_timeout =      tcp_syn_ack_timeout,
1235 };
1236
1237 #ifdef CONFIG_TCP_MD5SIG
1238 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1239         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1240         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1241 };
1242 #endif
1243
1244 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1245 {
1246         struct tcp_extend_values tmp_ext;
1247         struct tcp_options_received tmp_opt;
1248         u8 *hash_location;
1249         struct request_sock *req;
1250         struct inet_request_sock *ireq;
1251         struct tcp_sock *tp = tcp_sk(sk);
1252         struct dst_entry *dst = NULL;
1253         __be32 saddr = ip_hdr(skb)->saddr;
1254         __be32 daddr = ip_hdr(skb)->daddr;
1255         __u32 isn = TCP_SKB_CB(skb)->when;
1256         int want_cookie = 0;
1257
1258         /* Never answer to SYNs send to broadcast or multicast */
1259         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1260                 goto drop;
1261
1262         /* TW buckets are converted to open requests without
1263          * limitations, they conserve resources and peer is
1264          * evidently real one.
1265          */
1266         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1267                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1268                 if (!want_cookie)
1269                         goto drop;
1270         }
1271
1272         /* Accept backlog is full. If we have already queued enough
1273          * of warm entries in syn queue, drop request. It is better than
1274          * clogging syn queue with openreqs with exponentially increasing
1275          * timeout.
1276          */
1277         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1278                 goto drop;
1279
1280         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1281         if (!req)
1282                 goto drop;
1283
1284 #ifdef CONFIG_TCP_MD5SIG
1285         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1286 #endif
1287
1288         tcp_clear_options(&tmp_opt);
1289         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1290         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1291         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1292
1293         if (tmp_opt.cookie_plus > 0 &&
1294             tmp_opt.saw_tstamp &&
1295             !tp->rx_opt.cookie_out_never &&
1296             (sysctl_tcp_cookie_size > 0 ||
1297              (tp->cookie_values != NULL &&
1298               tp->cookie_values->cookie_desired > 0))) {
1299                 u8 *c;
1300                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1301                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1302
1303                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1304                         goto drop_and_release;
1305
1306                 /* Secret recipe starts with IP addresses */
1307                 *mess++ ^= (__force u32)daddr;
1308                 *mess++ ^= (__force u32)saddr;
1309
1310                 /* plus variable length Initiator Cookie */
1311                 c = (u8 *)mess;
1312                 while (l-- > 0)
1313                         *c++ ^= *hash_location++;
1314
1315                 want_cookie = 0;        /* not our kind of cookie */
1316                 tmp_ext.cookie_out_never = 0; /* false */
1317                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1318         } else if (!tp->rx_opt.cookie_in_always) {
1319                 /* redundant indications, but ensure initialization. */
1320                 tmp_ext.cookie_out_never = 1; /* true */
1321                 tmp_ext.cookie_plus = 0;
1322         } else {
1323                 goto drop_and_release;
1324         }
1325         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1326
1327         if (want_cookie && !tmp_opt.saw_tstamp)
1328                 tcp_clear_options(&tmp_opt);
1329
1330         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1331         tcp_openreq_init(req, &tmp_opt, skb);
1332
1333         ireq = inet_rsk(req);
1334         ireq->loc_addr = daddr;
1335         ireq->rmt_addr = saddr;
1336         ireq->no_srccheck = inet_sk(sk)->transparent;
1337         ireq->opt = tcp_v4_save_options(sk, skb);
1338
1339         if (security_inet_conn_request(sk, skb, req))
1340                 goto drop_and_free;
1341
1342         if (!want_cookie || tmp_opt.tstamp_ok)
1343                 TCP_ECN_create_request(req, tcp_hdr(skb));
1344
1345         if (want_cookie) {
1346                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1347                 req->cookie_ts = tmp_opt.tstamp_ok;
1348         } else if (!isn) {
1349                 struct inet_peer *peer = NULL;
1350                 struct flowi4 fl4;
1351
1352                 /* VJ's idea. We save last timestamp seen
1353                  * from the destination in peer table, when entering
1354                  * state TIME-WAIT, and check against it before
1355                  * accepting new connection request.
1356                  *
1357                  * If "isn" is not zero, this request hit alive
1358                  * timewait bucket, so that all the necessary checks
1359                  * are made in the function processing timewait state.
1360                  */
1361                 if (tmp_opt.saw_tstamp &&
1362                     tcp_death_row.sysctl_tw_recycle &&
1363                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1364                     fl4.daddr == saddr &&
1365                     (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1366                         inet_peer_refcheck(peer);
1367                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1368                             (s32)(peer->tcp_ts - req->ts_recent) >
1369                                                         TCP_PAWS_WINDOW) {
1370                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1371                                 goto drop_and_release;
1372                         }
1373                 }
1374                 /* Kill the following clause, if you dislike this way. */
1375                 else if (!sysctl_tcp_syncookies &&
1376                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1377                           (sysctl_max_syn_backlog >> 2)) &&
1378                          (!peer || !peer->tcp_ts_stamp) &&
1379                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1380                         /* Without syncookies last quarter of
1381                          * backlog is filled with destinations,
1382                          * proven to be alive.
1383                          * It means that we continue to communicate
1384                          * to destinations, already remembered
1385                          * to the moment of synflood.
1386                          */
1387                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1388                                        &saddr, ntohs(tcp_hdr(skb)->source));
1389                         goto drop_and_release;
1390                 }
1391
1392                 isn = tcp_v4_init_sequence(skb);
1393         }
1394         tcp_rsk(req)->snt_isn = isn;
1395         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1396
1397         if (tcp_v4_send_synack(sk, dst, req,
1398                                (struct request_values *)&tmp_ext) ||
1399             want_cookie)
1400                 goto drop_and_free;
1401
1402         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1403         return 0;
1404
1405 drop_and_release:
1406         dst_release(dst);
1407 drop_and_free:
1408         reqsk_free(req);
1409 drop:
1410         return 0;
1411 }
1412 EXPORT_SYMBOL(tcp_v4_conn_request);
1413
1414
1415 /*
1416  * The three way handshake has completed - we got a valid synack -
1417  * now create the new socket.
1418  */
1419 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1420                                   struct request_sock *req,
1421                                   struct dst_entry *dst)
1422 {
1423         struct inet_request_sock *ireq;
1424         struct inet_sock *newinet;
1425         struct tcp_sock *newtp;
1426         struct sock *newsk;
1427 #ifdef CONFIG_TCP_MD5SIG
1428         struct tcp_md5sig_key *key;
1429 #endif
1430         struct ip_options_rcu *inet_opt;
1431
1432         if (sk_acceptq_is_full(sk))
1433                 goto exit_overflow;
1434
1435         newsk = tcp_create_openreq_child(sk, req, skb);
1436         if (!newsk)
1437                 goto exit_nonewsk;
1438
1439         newsk->sk_gso_type = SKB_GSO_TCPV4;
1440
1441         newtp                 = tcp_sk(newsk);
1442         newinet               = inet_sk(newsk);
1443         ireq                  = inet_rsk(req);
1444         newinet->inet_daddr   = ireq->rmt_addr;
1445         newinet->inet_rcv_saddr = ireq->loc_addr;
1446         newinet->inet_saddr           = ireq->loc_addr;
1447         inet_opt              = ireq->opt;
1448         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1449         ireq->opt             = NULL;
1450         newinet->mc_index     = inet_iif(skb);
1451         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1452         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1453         if (inet_opt)
1454                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1455         newinet->inet_id = newtp->write_seq ^ jiffies;
1456
1457         if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1458                 goto put_and_exit;
1459
1460         sk_setup_caps(newsk, dst);
1461
1462         tcp_mtup_init(newsk);
1463         tcp_sync_mss(newsk, dst_mtu(dst));
1464         newtp->advmss = dst_metric_advmss(dst);
1465         if (tcp_sk(sk)->rx_opt.user_mss &&
1466             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1467                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1468
1469         tcp_initialize_rcv_mss(newsk);
1470         if (tcp_rsk(req)->snt_synack)
1471                 tcp_valid_rtt_meas(newsk,
1472                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1473         newtp->total_retrans = req->retrans;
1474
1475 #ifdef CONFIG_TCP_MD5SIG
1476         /* Copy over the MD5 key from the original socket */
1477         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1478         if (key != NULL) {
1479                 /*
1480                  * We're using one, so create a matching key
1481                  * on the newsk structure. If we fail to get
1482                  * memory, then we end up not copying the key
1483                  * across. Shucks.
1484                  */
1485                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1486                 if (newkey != NULL)
1487                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1488                                           newkey, key->keylen);
1489                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1490         }
1491 #endif
1492
1493         if (__inet_inherit_port(sk, newsk) < 0)
1494                 goto put_and_exit;
1495         __inet_hash_nolisten(newsk, NULL);
1496
1497         return newsk;
1498
1499 exit_overflow:
1500         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1501 exit_nonewsk:
1502         dst_release(dst);
1503 exit:
1504         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1505         return NULL;
1506 put_and_exit:
1507         sock_put(newsk);
1508         goto exit;
1509 }
1510 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1511
1512 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1513 {
1514         struct tcphdr *th = tcp_hdr(skb);
1515         const struct iphdr *iph = ip_hdr(skb);
1516         struct sock *nsk;
1517         struct request_sock **prev;
1518         /* Find possible connection requests. */
1519         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1520                                                        iph->saddr, iph->daddr);
1521         if (req)
1522                 return tcp_check_req(sk, skb, req, prev);
1523
1524         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1525                         th->source, iph->daddr, th->dest, inet_iif(skb));
1526
1527         if (nsk) {
1528                 if (nsk->sk_state != TCP_TIME_WAIT) {
1529                         bh_lock_sock(nsk);
1530                         return nsk;
1531                 }
1532                 inet_twsk_put(inet_twsk(nsk));
1533                 return NULL;
1534         }
1535
1536 #ifdef CONFIG_SYN_COOKIES
1537         if (!th->syn)
1538                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1539 #endif
1540         return sk;
1541 }
1542
1543 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1544 {
1545         const struct iphdr *iph = ip_hdr(skb);
1546
1547         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1548                 if (!tcp_v4_check(skb->len, iph->saddr,
1549                                   iph->daddr, skb->csum)) {
1550                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1551                         return 0;
1552                 }
1553         }
1554
1555         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1556                                        skb->len, IPPROTO_TCP, 0);
1557
1558         if (skb->len <= 76) {
1559                 return __skb_checksum_complete(skb);
1560         }
1561         return 0;
1562 }
1563
1564
1565 /* The socket must have it's spinlock held when we get
1566  * here.
1567  *
1568  * We have a potential double-lock case here, so even when
1569  * doing backlog processing we use the BH locking scheme.
1570  * This is because we cannot sleep with the original spinlock
1571  * held.
1572  */
1573 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1574 {
1575         struct sock *rsk;
1576 #ifdef CONFIG_TCP_MD5SIG
1577         /*
1578          * We really want to reject the packet as early as possible
1579          * if:
1580          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1581          *  o There is an MD5 option and we're not expecting one
1582          */
1583         if (tcp_v4_inbound_md5_hash(sk, skb))
1584                 goto discard;
1585 #endif
1586
1587         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1588                 sock_rps_save_rxhash(sk, skb->rxhash);
1589                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1590                         rsk = sk;
1591                         goto reset;
1592                 }
1593                 return 0;
1594         }
1595
1596         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1597                 goto csum_err;
1598
1599         if (sk->sk_state == TCP_LISTEN) {
1600                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1601                 if (!nsk)
1602                         goto discard;
1603
1604                 if (nsk != sk) {
1605                         sock_rps_save_rxhash(nsk, skb->rxhash);
1606                         if (tcp_child_process(sk, nsk, skb)) {
1607                                 rsk = nsk;
1608                                 goto reset;
1609                         }
1610                         return 0;
1611                 }
1612         } else
1613                 sock_rps_save_rxhash(sk, skb->rxhash);
1614
1615         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1616                 rsk = sk;
1617                 goto reset;
1618         }
1619         return 0;
1620
1621 reset:
1622         tcp_v4_send_reset(rsk, skb);
1623 discard:
1624         kfree_skb(skb);
1625         /* Be careful here. If this function gets more complicated and
1626          * gcc suffers from register pressure on the x86, sk (in %ebx)
1627          * might be destroyed here. This current version compiles correctly,
1628          * but you have been warned.
1629          */
1630         return 0;
1631
1632 csum_err:
1633         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1634         goto discard;
1635 }
1636 EXPORT_SYMBOL(tcp_v4_do_rcv);
1637
1638 /*
1639  *      From tcp_input.c
1640  */
1641
/*
 * tcp_v4_rcv - receive entry point for one IPv4 TCP segment.
 *
 * Validates the TCP header, fills in TCP_SKB_CB(skb), looks up the socket
 * for the segment and then, with the socket BH-locked, either processes
 * the segment (tcp_v4_do_rcv), offers it to tcp_prequeue(), or adds it to
 * the socket backlog when the socket is owned by user context.  Sockets
 * found in TCP_TIME_WAIT are handled separately at do_time_wait.
 *
 * Every path that does not hand the skb to one of those routines frees it
 * with kfree_skb().
 *
 * Returns the value of tcp_v4_do_rcv() when it was called, otherwise 0.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;
        struct sock *sk;
        int ret;
        struct net *net = dev_net(skb->dev);

        /* Only packets addressed to this host are accepted. */
        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

        /* Make sure the fixed part of the TCP header is accessible. */
        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = tcp_hdr(skb);

        /* doff is in 32-bit words: reject headers shorter than the
         * fixed TCP header, then make the whole header (including
         * options) accessible.
         */
        if (th->doff < sizeof(struct tcphdr) / 4)
                goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */
        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
                goto bad_packet;

        /* Re-read the header pointers after the pulls above.
         * NOTE(review): presumably because pskb_may_pull() may relocate
         * the header data - confirm.
         */
        th = tcp_hdr(skb);
        iph = ip_hdr(skb);
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        /* end_seq = seq + SYN flag + FIN flag + payload length */
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when    = 0;
        /* The IP TOS byte is stored in the control block's flags field. */
        TCP_SKB_CB(skb)->flags   = iph->tos;
        TCP_SKB_CB(skb)->sacked  = 0;

        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
        if (!sk)
                goto no_tcp_socket;

        /* Also entered from do_time_wait with sk replaced by a listener. */
process:
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        /* Drop segments whose TTL is below the socket's configured minimum. */
        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
                goto discard_and_relse;
        }

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;
        nf_reset(skb);

        if (sk_filter(sk, skb))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock_nested(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
                struct tcp_sock *tp = tcp_sk(sk);
                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                        tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
                /* With a DMA channel the prequeue is bypassed. */
                if (tp->ucopy.dma_chan)
                        ret = tcp_v4_do_rcv(sk, skb);
                else
#endif
                {
                        /* Process directly unless tcp_prequeue() took it. */
                        if (!tcp_prequeue(sk, skb))
                                ret = tcp_v4_do_rcv(sk, skb);
                }
        } else if (unlikely(sk_add_backlog(sk, skb))) {
                /* Socket is owned by user context and the backlog add failed. */
                bh_unlock_sock(sk);
                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
                goto discard_and_relse;
        }
        bh_unlock_sock(sk);

        /* NOTE(review): presumably drops the reference taken by the lookup. */
        sock_put(sk);

        return ret;

no_tcp_socket:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        /* bad_packet sits inside the if-body so that the early header and
         * checksum failures above share the INERRS accounting and skip the
         * reset that is sent for well-formed segments without a socket.
         */
        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(NULL, skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sock_put(sk);
        goto discard_it;

do_time_wait:
        /* sk is a timewait socket here; it is released with inet_twsk_put()
         * rather than sock_put().
         */
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put(inet_twsk(sk));
                goto discard_it;
        }

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
                inet_twsk_put(inet_twsk(sk));
                goto discard_it;
        }
        /* NOTE(review): the TW_ACK, TW_RST and TW_SUCCESS paths below do
         * not call inet_twsk_put() here; presumably the callee drops the
         * reference in those cases - confirm.
         */
        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
        case TCP_TW_SYN: {
                /* Look for a listener on the destination address/port;
                 * if one exists, retire the timewait socket and process
                 * the segment against the listener instead.
                 */
                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                        &tcp_hashinfo,
                                                        iph->daddr, th->dest,
                                                        inet_iif(skb));
                if (sk2) {
                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
                        inet_twsk_put(inet_twsk(sk));
                        sk = sk2;
                        goto process;
                }
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                goto no_tcp_socket;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}
1785
1786 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1787 {
1788         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1789         struct inet_sock *inet = inet_sk(sk);
1790         struct inet_peer *peer;
1791
1792         if (!rt ||
1793             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1794                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1795                 *release_it = true;
1796         } else {
1797                 if (!rt->peer)
1798                         rt_bind_peer(rt, inet->inet_daddr, 1);
1799                 peer = rt->peer;
1800                 *release_it = false;
1801         }
1802
1803         return peer;
1804 }
1805 EXPORT_SYMBOL(tcp_v4_get_peer);
1806
1807 void *tcp_v4_tw_get_peer(struct sock *sk)
1808 {
1809         struct inet_timewait_sock *tw = inet_twsk(sk);
1810
1811         return inet_getpeer_v4(tw->tw_daddr, 1);
1812 }
1813 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1814
1815 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1816         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1817         .twsk_unique    = tcp_twsk_unique,
1818         .twsk_destructor= tcp_twsk_destructor,
1819         .twsk_getpeer   = tcp_v4_tw_get_peer,
1820 };
1821
1822 const struct inet_connection_sock_af_ops ipv4_specific = {
1823         .queue_xmit        = ip_queue_xmit,
1824         .send_check        = tcp_v4_send_check,
1825         .rebuild_header    = inet_sk_rebuild_header,
1826         .conn_request      = tcp_v4_conn_request,
1827         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1828         .get_peer          = tcp_v4_get_peer,
1829         .net_header_len    = sizeof(struct iphdr),
1830         .setsockopt        = ip_setsockopt,
1831         .getsockopt        = ip_getsockopt,
1832         .addr2sockaddr     = inet_csk_addr2sockaddr,
1833         .sockaddr_len      = sizeof(struct sockaddr_in),
1834         .bind_conflict     = inet_csk_bind_conflict,
1835 #ifdef CONFIG_COMPAT
1836         .compat_setsockopt = compat_ip_setsockopt,
1837         .compat_getsockopt = compat_ip_getsockopt,
1838 #endif
1839 };
1840 EXPORT_SYMBOL(ipv4_specific);
1841
#ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5-signature operations; installed as tp->af_specific by
 * tcp_v4_init_sock() when CONFIG_TCP_MD5SIG is enabled.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        /* key management */
        .md5_lookup    = tcp_v4_md5_lookup,
        .md5_add       = tcp_v4_md5_add_func,
        .md5_parse     = tcp_v4_parse_md5_keys,

        /* hashing */
        .calc_md5_hash = tcp_v4_md5_hash_skb,
};
#endif
1850
1851 /* NOTE: A lot of things set to zero explicitly by call to
1852  *       sk_alloc() so need not be done here.
1853  */
1854 static int tcp_v4_init_sock(struct sock *sk)
1855 {
1856         struct inet_connection_sock *icsk = inet_csk(sk);
1857         struct tcp_sock *tp = tcp_sk(sk);
1858
1859         skb_queue_head_init(&tp->out_of_order_queue);
1860         tcp_init_xmit_timers(sk);
1861         tcp_prequeue_init(tp);
1862
1863         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1864         tp->mdev = TCP_TIMEOUT_INIT;
1865
1866         /* So many TCP implementations out there (incorrectly) count the
1867          * initial SYN frame in their delayed-ACK and congestion control
1868          * algorithms that we must have the following bandaid to talk
1869          * efficiently to them.  -DaveM
1870          */
1871         tp->snd_cwnd = TCP_INIT_CWND;
1872
1873         /* See draft-stevens-tcpca-spec-01 for discussion of the
1874          * initialization of these values.
1875          */
1876         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1877         tp->snd_cwnd_clamp = ~0;
1878         tp->mss_cache = TCP_MSS_DEFAULT;
1879
1880         tp->reordering = sysctl_tcp_reordering;
1881         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1882
1883         sk->sk_state = TCP_CLOSE;
1884
1885         sk->sk_write_space = sk_stream_write_space;
1886         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1887
1888         icsk->icsk_af_ops = &ipv4_specific;
1889         icsk->icsk_sync_mss = tcp_sync_mss;
1890 #ifdef CONFIG_TCP_MD5SIG
1891         tp->af_specific = &tcp_sock_ipv4_specific;
1892 #endif
1893
1894         /* TCP Cookie Transactions */
1895         if (sysctl_tcp_cookie_size > 0) {
1896                 /* Default, cookies without s_data_payload. */
1897                 tp->cookie_values =
1898                         kzalloc(sizeof(*tp->cookie_values),
1899                                 sk->sk_allocation);
1900                 if (tp->cookie_values != NULL)
1901                         kref_init(&tp->cookie_values->kref);
1902         }
1903         /* Presumed zeroed, in order of appearance:
1904          *      cookie_in_always, cookie_out_never,
1905          *      s_data_constant, s_data_in, s_data_out
1906          */
1907         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1908         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1909
1910         local_bh_disable();
1911         percpu_counter_inc(&tcp_sockets_allocated);
1912         local_bh_enable();
1913
1914         return 0;
1915 }
1916
1917 void tcp_v4_destroy_sock(struct sock *sk)
1918 {
1919         struct tcp_sock *tp = tcp_sk(sk);
1920
1921         tcp_clear_xmit_timers(sk);
1922
1923         tcp_cleanup_congestion_control(sk);
1924
1925         /* Cleanup up the write buffer. */
1926         tcp_write_queue_purge(sk);
1927
1928         /* Cleans up our, hopefully empty, out_of_order_queue. */
1929         __skb_queue_purge(&tp->out_of_order_queue);
1930
1931 #ifdef CONFIG_TCP_MD5SIG
1932         /* Clean up the MD5 key list, if any */
1933         if (tp->md5sig_info) {
1934                 tcp_v4_clear_md5_list(sk);
1935                 kfree(tp->md5sig_info);
1936                 tp->md5sig_info = NULL;
1937         }
1938 #endif
1939
1940 #ifdef CONFIG_NET_DMA
1941         /* Cleans up our sk_async_wait_queue */
1942         __skb_queue_purge(&sk->sk_async_wait_queue);
1943 #endif
1944
1945         /* Clean prequeue, it must be empty really */
1946         __skb_queue_purge(&tp->ucopy.prequeue);
1947
1948         /* Clean up a referenced TCP bind bucket. */
1949         if (inet_csk(sk)->icsk_bind_hash)
1950                 inet_put_port(sk);
1951
1952         /*
1953          * If sendmsg cached page exists, toss it.
1954          */
1955         if (sk->sk_sndmsg_page) {
1956                 __free_page(sk->sk_sndmsg_page);
1957                 sk->sk_sndmsg_page = NULL;
1958         }
1959
1960         /* TCP Cookie Transactions */
1961         if (tp->cookie_values != NULL) {
1962                 kref_put(&tp->cookie_values->kref,
1963                          tcp_cookie_values_release);
1964                 tp->cookie_values = NULL;
1965         }
1966
1967         percpu_counter_dec(&tcp_sockets_allocated);
1968 }
1969 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1970
1971 #ifdef CONFIG_PROC_FS
1972 /* Proc filesystem TCP sock list dumping. */
1973
1974 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1975 {
1976         return hlist_nulls_empty(head) ? NULL :
1977                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1978 }
1979
1980 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1981 {
1982         return !is_a_nulls(tw->tw_node.next) ?
1983                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1984 }
1985
/*
 * Get the next listener entry following cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 *
 * The walk yields listening sockets and, for listeners with a non-empty
 * request queue, their request_sock entries.  While requests are being
 * yielded st->state is TCP_SEQ_STATE_OPENREQ and cur is a request_sock;
 * otherwise cur is a struct sock.
 *
 * Locking, as visible here: a non-NULL return leaves the current bucket's
 * ilb->lock held, plus the listener's syn_wait_lock read-held while in
 * TCP_SEQ_STATE_OPENREQ (tcp_seq_stop() releases them).  NULL is returned
 * once every bucket has been exhausted, with both locks dropped.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
        struct inet_connection_sock *icsk;
        struct hlist_nulls_node *node;
        struct sock *sk = cur;
        struct inet_listen_hashbucket *ilb;
        struct tcp_iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);

        /* Fresh start: lock the bucket and scan from its head. */
        if (!sk) {
                ilb = &tcp_hashinfo.listening_hash[st->bucket];
                spin_lock_bh(&ilb->lock);
                sk = sk_nulls_head(&ilb->head);
                st->offset = 0;
                goto get_sk;
        }
        /* Continuing: the bucket lock is still held from the previous call. */
        ilb = &tcp_hashinfo.listening_hash[st->bucket];
        ++st->num;
        ++st->offset;

        if (st->state == TCP_SEQ_STATE_OPENREQ) {
                /* cur is a request of listener st->syn_wait_sk. */
                struct request_sock *req = cur;

                icsk = inet_csk(st->syn_wait_sk);
                req = req->dl_next;
                while (1) {
                        /* Rest of the current syn_table chain. */
                        while (req) {
                                if (req->rsk_ops->family == st->family) {
                                        cur = req;
                                        goto out;
                                }
                                req = req->dl_next;
                        }
                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
                                break;
                        /* start_req jumps here with sbucket reset to 0 and
                         * syn_wait_lock read-held.
                         */
get_req:
                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
                }
                /* Requests exhausted: resume with the next listener. */
                sk        = sk_nulls_next(st->syn_wait_sk);
                st->state = TCP_SEQ_STATE_LISTENING;
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        } else {
                /* cur was a listener that has just been reported: visit
                 * its pending requests, if any, before moving on.
                 */
                icsk = inet_csk(sk);
                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&icsk->icsk_accept_queue))
                        goto start_req;
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                sk = sk_nulls_next(sk);
        }
get_sk:
        sk_nulls_for_each_from(sk, node) {
                if (!net_eq(sock_net(sk), net))
                        continue;
                if (sk->sk_family == st->family) {
                        cur = sk;
                        goto out;
                }
                /* A listener of another family is not reported itself,
                 * but its request queue is still scanned; requests are
                 * filtered by req->rsk_ops->family above.
                 */
                icsk = inet_csk(sk);
                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
                        st->uid         = sock_i_uid(sk);
                        st->syn_wait_sk = sk;
                        st->state       = TCP_SEQ_STATE_OPENREQ;
                        st->sbucket     = 0;
                        goto get_req;
                }
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        }
        /* Bucket exhausted: unlock it and move on to the next one. */
        spin_unlock_bh(&ilb->lock);
        st->offset = 0;
        if (++st->bucket < INET_LHTABLE_SIZE) {
                ilb = &tcp_hashinfo.listening_hash[st->bucket];
                spin_lock_bh(&ilb->lock);
                sk = sk_nulls_head(&ilb->head);
                goto get_sk;
        }
        cur = NULL;
out:
        return cur;
}
2072
2073 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2074 {
2075         struct tcp_iter_state *st = seq->private;
2076         void *rc;
2077
2078         st->bucket = 0;
2079         st->offset = 0;
2080         rc = listening_get_next(seq, NULL);
2081
2082         while (rc && *pos) {
2083                 rc = listening_get_next(seq, rc);
2084                 --*pos;
2085         }
2086         return rc;
2087 }
2088
2089 static inline int empty_bucket(struct tcp_iter_state *st)
2090 {
2091         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2092                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2093 }
2094
2095 /*
2096  * Get first established socket starting from bucket given in st->bucket.
2097  * If st->bucket is zero, the very first socket in the hash is returned.
2098  */
2099 static void *established_get_first(struct seq_file *seq)
2100 {
2101         struct tcp_iter_state *st = seq->private;
2102         struct net *net = seq_file_net(seq);
2103         void *rc = NULL;
2104
2105         st->offset = 0;
2106         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2107                 struct sock *sk;
2108                 struct hlist_nulls_node *node;
2109                 struct inet_timewait_sock *tw;
2110                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2111
2112                 /* Lockless fast path for the common case of empty buckets */
2113                 if (empty_bucket(st))
2114                         continue;
2115
2116                 spin_lock_bh(lock);
2117                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2118                         if (sk->sk_family != st->family ||
2119                             !net_eq(sock_net(sk), net)) {
2120                                 continue;
2121                         }
2122                         rc = sk;
2123                         goto out;
2124                 }
2125                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2126                 inet_twsk_for_each(tw, node,
2127                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2128                         if (tw->tw_family != st->family ||
2129                             !net_eq(twsk_net(tw), net)) {
2130                                 continue;
2131                         }
2132                         rc = tw;
2133                         goto out;
2134                 }
2135                 spin_unlock_bh(lock);
2136                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2137         }
2138 out:
2139         return rc;
2140 }
2141
/*
 * Advance from cur to the next matching entry of the established hash.
 *
 * cur is a struct sock while st->state is TCP_SEQ_STATE_ESTABLISHED and an
 * inet_timewait_sock while it is TCP_SEQ_STATE_TIME_WAIT.  Within a bucket
 * the established chain is exhausted first, then the timewait chain, then
 * the walk moves on to the next non-empty bucket.
 *
 * Called with the lock of bucket st->bucket held.  A non-NULL return
 * leaves the lock of the (possibly new) st->bucket held; NULL is returned
 * after the last bucket with no lock held.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
        struct sock *sk = cur;
        struct inet_timewait_sock *tw;
        struct hlist_nulls_node *node;
        struct tcp_iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);

        ++st->num;
        ++st->offset;

        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
                tw = cur;
                tw = tw_next(tw);
                /* Also entered from below once the established chain of
                 * the current bucket has been exhausted.
                 */
get_tw:
                /* Skip timewait sockets of another family or namespace. */
                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
                        tw = tw_next(tw);
                }
                if (tw) {
                        cur = tw;
                        goto out;
                }
                /* Bucket finished: drop its lock before moving on. */
                spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
                st->state = TCP_SEQ_STATE_ESTABLISHED;

                /* Look for next non empty bucket */
                st->offset = 0;
                while (++st->bucket <= tcp_hashinfo.ehash_mask &&
                                empty_bucket(st))
                        ;
                if (st->bucket > tcp_hashinfo.ehash_mask)
                        return NULL;

                spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
                sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
        } else
                sk = sk_nulls_next(sk);

        sk_nulls_for_each_from(sk, node) {
                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
                        goto found;
        }

        /* Established chain exhausted: switch to the timewait chain. */
        st->state = TCP_SEQ_STATE_TIME_WAIT;
        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
        goto get_tw;
found:
        cur = sk;
out:
        return cur;
}
2193
2194 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2195 {
2196         struct tcp_iter_state *st = seq->private;
2197         void *rc;
2198
2199         st->bucket = 0;
2200         rc = established_get_first(seq);
2201
2202         while (rc && pos) {
2203                 rc = established_get_next(seq, rc);
2204                 --pos;
2205         }
2206         return rc;
2207 }
2208
2209 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2210 {
2211         void *rc;
2212         struct tcp_iter_state *st = seq->private;
2213
2214         st->state = TCP_SEQ_STATE_LISTENING;
2215         rc        = listening_get_idx(seq, &pos);
2216
2217         if (!rc) {
2218                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2219                 rc        = established_get_idx(seq, pos);
2220         }
2221
2222         return rc;
2223 }
2224
/*
 * Re-seek to the position recorded in the iterator state: restart at the
 * head of bucket st->bucket and step forward st->offset entries, instead
 * of walking the tables from the very beginning.
 *
 * The get_next helpers increment st->num on every step, so the original
 * value is saved on entry and restored before returning.
 *
 * Returns the entry found, or NULL when the recorded position could not
 * be reached.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
        struct tcp_iter_state *st = seq->private;
        int offset = st->offset;
        int orig_num = st->num;
        void *rc = NULL;

        switch (st->state) {
        case TCP_SEQ_STATE_OPENREQ:
        case TCP_SEQ_STATE_LISTENING:
                if (st->bucket >= INET_LHTABLE_SIZE)
                        break;
                st->state = TCP_SEQ_STATE_LISTENING;
                rc = listening_get_next(seq, NULL);
                while (offset-- && rc)
                        rc = listening_get_next(seq, rc);
                if (rc)
                        break;
                /* Listening table exhausted: continue with the
                 * established hash from its first bucket.
                 * NOTE(review): offset is carried over as the loop above
                 * left it (it can be -1 here) - confirm this is intended.
                 */
                st->bucket = 0;
                /* Fallthrough */
        case TCP_SEQ_STATE_ESTABLISHED:
        case TCP_SEQ_STATE_TIME_WAIT:
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                if (st->bucket > tcp_hashinfo.ehash_mask)
                        break;
                rc = established_get_first(seq);
                while (offset-- && rc)
                        rc = established_get_next(seq, rc);
        }

        st->num = orig_num;

        return rc;
}
2259
2260 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2261 {
2262         struct tcp_iter_state *st = seq->private;
2263         void *rc;
2264
2265         if (*pos && *pos == st->last_pos) {
2266                 rc = tcp_seek_last_pos(seq);
2267                 if (rc)
2268                         goto out;
2269         }
2270
2271         st->state = TCP_SEQ_STATE_LISTENING;
2272         st->num = 0;
2273         st->bucket = 0;
2274         st->offset = 0;
2275         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2276
2277 out:
2278         st->last_pos = *pos;
2279         return rc;
2280 }
2281
2282 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2283 {
2284         struct tcp_iter_state *st = seq->private;
2285         void *rc = NULL;
2286
2287         if (v == SEQ_START_TOKEN) {
2288                 rc = tcp_get_idx(seq, 0);
2289                 goto out;
2290         }
2291
2292         switch (st->state) {
2293         case TCP_SEQ_STATE_OPENREQ:
2294         case TCP_SEQ_STATE_LISTENING:
2295                 rc = listening_get_next(seq, v);
2296                 if (!rc) {
2297                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2298                         st->bucket = 0;
2299                         st->offset = 0;
2300                         rc        = established_get_first(seq);
2301                 }
2302                 break;
2303         case TCP_SEQ_STATE_ESTABLISHED:
2304         case TCP_SEQ_STATE_TIME_WAIT:
2305                 rc = established_get_next(seq, v);
2306                 break;
2307         }
2308 out:
2309         ++*pos;
2310         st->last_pos = *pos;
2311         return rc;
2312 }
2313
/*
 * seq_file ->stop handler: release whatever locks the iterator left held
 * for the current entry v, as determined by st->state.
 */
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
        struct tcp_iter_state *st = seq->private;

        switch (st->state) {
        case TCP_SEQ_STATE_OPENREQ:
                /* A request entry is returned with the listener's
                 * syn_wait_lock read-held.
                 */
                if (v) {
                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                }
                /* No break: the listening bucket lock is held as well. */
        case TCP_SEQ_STATE_LISTENING:
                /* SEQ_START_TOKEN is returned without any lock taken. */
                if (v != SEQ_START_TOKEN)
                        spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
                break;
        case TCP_SEQ_STATE_TIME_WAIT:
        case TCP_SEQ_STATE_ESTABLISHED:
                /* A NULL entry means the ehash walk already dropped its lock. */
                if (v)
                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
                break;
        }
}
2336 static int tcp_seq_open(struct inode *inode, struct file *file)
2337 {
2338         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2339         struct tcp_iter_state *s;
2340         int err;
2341
2342         err = seq_open_net(inode, file, &afinfo->seq_ops,
2343                           sizeof(struct tcp_iter_state));
2344         if (err < 0)
2345                 return err;
2346
2347         s = ((struct seq_file *)file->private_data)->private;
2348         s->family               = afinfo->family;
2349         s->last_pos             = 0;
2350         return 0;
2351 }
2352
2353 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2354 {
2355         int rc = 0;
2356         struct proc_dir_entry *p;
2357
2358         afinfo->seq_fops.open           = tcp_seq_open;
2359         afinfo->seq_fops.read           = seq_read;
2360         afinfo->seq_fops.llseek         = seq_lseek;
2361         afinfo->seq_fops.release        = seq_release_net;
2362
2363         afinfo->seq_ops.start           = tcp_seq_start;
2364         afinfo->seq_ops.next            = tcp_seq_next;
2365         afinfo->seq_ops.stop            = tcp_seq_stop;
2366
2367         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2368                              &afinfo->seq_fops, afinfo);
2369         if (!p)
2370                 rc = -ENOMEM;
2371         return rc;
2372 }
2373 EXPORT_SYMBOL(tcp_proc_register);
2374
/*
 * Remove the proc entry named afinfo->name from the given network
 * namespace; the counterpart of tcp_proc_register().
 */
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
        proc_net_remove(net, afinfo->name);
}
EXPORT_SYMBOL(tcp_proc_unregister);
2380
2381 static void get_openreq4(struct sock *sk, struct request_sock *req,
2382                          struct seq_file *f, int i, int uid, int *len)
2383 {
2384         const struct inet_request_sock *ireq = inet_rsk(req);
2385         int ttd = req->expires - jiffies;
2386
2387         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2388                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2389                 i,
2390                 ireq->loc_addr,
2391                 ntohs(inet_sk(sk)->inet_sport),
2392                 ireq->rmt_addr,
2393                 ntohs(ireq->rmt_port),
2394                 TCP_SYN_RECV,
2395                 0, 0, /* could print option size, but that is af dependent. */
2396                 1,    /* timers active (only the expire timer) */
2397                 jiffies_to_clock_t(ttd),
2398                 req->retrans,
2399                 uid,
2400                 0,  /* non standard timer */
2401                 0, /* open_requests have no inode */
2402                 atomic_read(&sk->sk_refcnt),
2403                 req,
2404                 len);
2405 }
2406
2407 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2408 {
2409         int timer_active;
2410         unsigned long timer_expires;
2411         struct tcp_sock *tp = tcp_sk(sk);
2412         const struct inet_connection_sock *icsk = inet_csk(sk);
2413         struct inet_sock *inet = inet_sk(sk);
2414         __be32 dest = inet->inet_daddr;
2415         __be32 src = inet->inet_rcv_saddr;
2416         __u16 destp = ntohs(inet->inet_dport);
2417         __u16 srcp = ntohs(inet->inet_sport);
2418         int rx_queue;
2419
2420         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2421                 timer_active    = 1;
2422                 timer_expires   = icsk->icsk_timeout;
2423         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2424                 timer_active    = 4;
2425                 timer_expires   = icsk->icsk_timeout;
2426         } else if (timer_pending(&sk->sk_timer)) {
2427                 timer_active    = 2;
2428                 timer_expires   = sk->sk_timer.expires;
2429         } else {
2430                 timer_active    = 0;
2431                 timer_expires = jiffies;
2432         }
2433
2434         if (sk->sk_state == TCP_LISTEN)
2435                 rx_queue = sk->sk_ack_backlog;
2436         else
2437                 /*
2438                  * because we dont lock socket, we might find a transient negative value
2439                  */
2440                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2441
2442         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2443                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2444                 i, src, srcp, dest, destp, sk->sk_state,
2445                 tp->write_seq - tp->snd_una,
2446                 rx_queue,
2447                 timer_active,
2448                 jiffies_to_clock_t(timer_expires - jiffies),
2449                 icsk->icsk_retransmits,
2450                 sock_i_uid(sk),
2451                 icsk->icsk_probes_out,
2452                 sock_i_ino(sk),
2453                 atomic_read(&sk->sk_refcnt), sk,
2454                 jiffies_to_clock_t(icsk->icsk_rto),
2455                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2456                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2457                 tp->snd_cwnd,
2458                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2459                 len);
2460 }
2461
2462 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2463                                struct seq_file *f, int i, int *len)
2464 {
2465         __be32 dest, src;
2466         __u16 destp, srcp;
2467         int ttd = tw->tw_ttd - jiffies;
2468
2469         if (ttd < 0)
2470                 ttd = 0;
2471
2472         dest  = tw->tw_daddr;
2473         src   = tw->tw_rcv_saddr;
2474         destp = ntohs(tw->tw_dport);
2475         srcp  = ntohs(tw->tw_sport);
2476
2477         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2478                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2479                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2480                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2481                 atomic_read(&tw->tw_refcnt), tw, len);
2482 }
2483
2484 #define TMPSZ 150
2485
2486 static int tcp4_seq_show(struct seq_file *seq, void *v)
2487 {
2488         struct tcp_iter_state *st;
2489         int len;
2490
2491         if (v == SEQ_START_TOKEN) {
2492                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2493                            "  sl  local_address rem_address   st tx_queue "
2494                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2495                            "inode");
2496                 goto out;
2497         }
2498         st = seq->private;
2499
2500         switch (st->state) {
2501         case TCP_SEQ_STATE_LISTENING:
2502         case TCP_SEQ_STATE_ESTABLISHED:
2503                 get_tcp4_sock(v, seq, st->num, &len);
2504                 break;
2505         case TCP_SEQ_STATE_OPENREQ:
2506                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2507                 break;
2508         case TCP_SEQ_STATE_TIME_WAIT:
2509                 get_timewait4_sock(v, seq, st->num, &len);
2510                 break;
2511         }
2512         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2513 out:
2514         return 0;
2515 }
2516
2517 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2518         .name           = "tcp",
2519         .family         = AF_INET,
2520         .seq_fops       = {
2521                 .owner          = THIS_MODULE,
2522         },
2523         .seq_ops        = {
2524                 .show           = tcp4_seq_show,
2525         },
2526 };
2527
2528 static int __net_init tcp4_proc_init_net(struct net *net)
2529 {
2530         return tcp_proc_register(net, &tcp4_seq_afinfo);
2531 }
2532
2533 static void __net_exit tcp4_proc_exit_net(struct net *net)
2534 {
2535         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2536 }
2537
2538 static struct pernet_operations tcp4_net_ops = {
2539         .init = tcp4_proc_init_net,
2540         .exit = tcp4_proc_exit_net,
2541 };
2542
2543 int __init tcp4_proc_init(void)
2544 {
2545         return register_pernet_subsys(&tcp4_net_ops);
2546 }
2547
2548 void tcp4_proc_exit(void)
2549 {
2550         unregister_pernet_subsys(&tcp4_net_ops);
2551 }
2552 #endif /* CONFIG_PROC_FS */
2553
2554 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2555 {
2556         const struct iphdr *iph = skb_gro_network_header(skb);
2557
2558         switch (skb->ip_summed) {
2559         case CHECKSUM_COMPLETE:
2560                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2561                                   skb->csum)) {
2562                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2563                         break;
2564                 }
2565
2566                 /* fall through */
2567         case CHECKSUM_NONE:
2568                 NAPI_GRO_CB(skb)->flush = 1;
2569                 return NULL;
2570         }
2571
2572         return tcp_gro_receive(head, skb);
2573 }
2574
2575 int tcp4_gro_complete(struct sk_buff *skb)
2576 {
2577         const struct iphdr *iph = ip_hdr(skb);
2578         struct tcphdr *th = tcp_hdr(skb);
2579
2580         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2581                                   iph->saddr, iph->daddr, 0);
2582         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2583
2584         return tcp_gro_complete(skb);
2585 }
2586
2587 struct proto tcp_prot = {
2588         .name                   = "TCP",
2589         .owner                  = THIS_MODULE,
2590         .close                  = tcp_close,
2591         .connect                = tcp_v4_connect,
2592         .disconnect             = tcp_disconnect,
2593         .accept                 = inet_csk_accept,
2594         .ioctl                  = tcp_ioctl,
2595         .init                   = tcp_v4_init_sock,
2596         .destroy                = tcp_v4_destroy_sock,
2597         .shutdown               = tcp_shutdown,
2598         .setsockopt             = tcp_setsockopt,
2599         .getsockopt             = tcp_getsockopt,
2600         .recvmsg                = tcp_recvmsg,
2601         .sendmsg                = tcp_sendmsg,
2602         .sendpage               = tcp_sendpage,
2603         .backlog_rcv            = tcp_v4_do_rcv,
2604         .hash                   = inet_hash,
2605         .unhash                 = inet_unhash,
2606         .get_port               = inet_csk_get_port,
2607         .enter_memory_pressure  = tcp_enter_memory_pressure,
2608         .sockets_allocated      = &tcp_sockets_allocated,
2609         .orphan_count           = &tcp_orphan_count,
2610         .memory_allocated       = &tcp_memory_allocated,
2611         .memory_pressure        = &tcp_memory_pressure,
2612         .sysctl_mem             = sysctl_tcp_mem,
2613         .sysctl_wmem            = sysctl_tcp_wmem,
2614         .sysctl_rmem            = sysctl_tcp_rmem,
2615         .max_header             = MAX_TCP_HEADER,
2616         .obj_size               = sizeof(struct tcp_sock),
2617         .slab_flags             = SLAB_DESTROY_BY_RCU,
2618         .twsk_prot              = &tcp_timewait_sock_ops,
2619         .rsk_prot               = &tcp_request_sock_ops,
2620         .h.hashinfo             = &tcp_hashinfo,
2621         .no_autobind            = true,
2622 #ifdef CONFIG_COMPAT
2623         .compat_setsockopt      = compat_tcp_setsockopt,
2624         .compat_getsockopt      = compat_tcp_getsockopt,
2625 #endif
2626 };
2627 EXPORT_SYMBOL(tcp_prot);
2628
2629
2630 static int __net_init tcp_sk_init(struct net *net)
2631 {
2632         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2633                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2634 }
2635
2636 static void __net_exit tcp_sk_exit(struct net *net)
2637 {
2638         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2639 }
2640
2641 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2642 {
2643         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2644 }
2645
2646 static struct pernet_operations __net_initdata tcp_sk_ops = {
2647        .init       = tcp_sk_init,
2648        .exit       = tcp_sk_exit,
2649        .exit_batch = tcp_sk_exit_batch,
2650 };
2651
2652 void __init tcp_v4_init(void)
2653 {
2654         inet_hashinfo_init(&tcp_hashinfo);
2655         if (register_pernet_subsys(&tcp_sk_ops))
2656                 panic("Failed to create the TCP control socket.\n");
2657 }