net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63 #include <linux/slab.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75
  76 #include <linux/inet.h>
  77 #include <linux/ipv6.h>
  78 #include <linux/stddef.h>
  79 #include <linux/proc_fs.h>
  80 #include <linux/seq_file.h>
  81
  82 #include <linux/crypto.h>
  83 #include <linux/scatterlist.h>
  84
  85 int sysctl_tcp_tw_reuse __read_mostly;
  86 int sysctl_tcp_low_latency __read_mostly;
  87
  88
  89 #ifdef CONFIG_TCP_MD5SIG
  90 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  91                                                    __be32 addr);
  92 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  93                                __be32 daddr, __be32 saddr, struct tcphdr *th);
  94 #else
  95 static inline
  96 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  97 {
  98         return NULL;
  99 }
 100 #endif
 101
 102 struct inet_hashinfo tcp_hashinfo;
 103
 104 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 105 {
 106         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 107                                           ip_hdr(skb)->saddr,
 108                                           tcp_hdr(skb)->dest,
 109                                           tcp_hdr(skb)->source);
 110 }
 111
 112 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 113 {
 114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115         struct tcp_sock *tp = tcp_sk(sk);
 116
 117         /* With PAWS, it is safe from the viewpoint
 118            of data integrity. Even without PAWS it is safe provided sequence
 119            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 120
 121            Actually, the idea is close to VJ's one, only timestamp cache is
 122            held not per host, but per port pair and TW bucket is used as state
 123            holder.
 124
 125            If TW bucket has been already destroyed we fall back to VJ's scheme
 126            and use initial timestamp retrieved from peer table.
 127          */
 128         if (tcptw->tw_ts_recent_stamp &&
 129             (twp == NULL || (sysctl_tcp_tw_reuse &&
 130                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 131                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 132                 if (tp->write_seq == 0)
 133                         tp->write_seq = 1;
 134                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 135                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 136                 sock_hold(sktw);
 137                 return 1;
 138         }
 139
 140         return 0;
 141 }
 142
 143 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 144
 145 /* This will initiate an outgoing connection. */
 146 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 147 {
 148         struct inet_sock *inet = inet_sk(sk);
 149         struct tcp_sock *tp = tcp_sk(sk);
 150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 151         struct rtable *rt;
 152         __be32 daddr, nexthop;
 153         int tmp;
 154         int err;
 155
 156         if (addr_len < sizeof(struct sockaddr_in))
 157                 return -EINVAL;
 158
 159         if (usin->sin_family != AF_INET)
 160                 return -EAFNOSUPPORT;
 161
 162         nexthop = daddr = usin->sin_addr.s_addr;
 163         if (inet->opt && inet->opt->srr) {
 164                 if (!daddr)
 165                         return -EINVAL;
 166                 nexthop = inet->opt->faddr;
 167         }
 168
 169         tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
 170                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 171                                IPPROTO_TCP,
 172                                inet->inet_sport, usin->sin_port, sk, 1);
 173         if (tmp < 0) {
 174                 if (tmp == -ENETUNREACH)
 175                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 176                 return tmp;
 177         }
 178
 179         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 180                 ip_rt_put(rt);
 181                 return -ENETUNREACH;
 182         }
 183
 184         if (!inet->opt || !inet->opt->srr)
 185                 daddr = rt->rt_dst;
 186
 187         if (!inet->inet_saddr)
 188                 inet->inet_saddr = rt->rt_src;
 189         inet->inet_rcv_saddr = inet->inet_saddr;
 190
 191         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 192                 /* Reset inherited state */
 193                 tp->rx_opt.ts_recent       = 0;
 194                 tp->rx_opt.ts_recent_stamp = 0;
 195                 tp->write_seq              = 0;
 196         }
 197
 198         if (tcp_death_row.sysctl_tw_recycle &&
 199             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 200                 struct inet_peer *peer = rt_get_peer(rt);
 201                 /*
 202                  * VJ's idea. We save last timestamp seen from
 203                  * the destination in peer table, when entering state
 204                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 205                  * when trying new connection.
 206                  */
 207                 if (peer != NULL &&
 208                     (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 209                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 210                         tp->rx_opt.ts_recent = peer->tcp_ts;
 211                 }
 212         }
 213
 214         inet->inet_dport = usin->sin_port;
 215         inet->inet_daddr = daddr;
 216
 217         inet_csk(sk)->icsk_ext_hdr_len = 0;
 218         if (inet->opt)
 219                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 220
 221         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 222
 223         /* Socket identity is still unknown (sport may be zero).
 224          * However we set state to SYN-SENT and not releasing socket
 225          * lock select source port, enter ourselves into the hash tables and
 226          * complete initialization after this.
 227          */
 228         tcp_set_state(sk, TCP_SYN_SENT);
 229         err = inet_hash_connect(&tcp_death_row, sk);
 230         if (err)
 231                 goto failure;
 232
 233         err = ip_route_newports(&rt, IPPROTO_TCP,
 234                                 inet->inet_sport, inet->inet_dport, sk);
 235         if (err)
 236                 goto failure;
 237
 238         /* OK, now commit destination to socket.  */
 239         sk->sk_gso_type = SKB_GSO_TCPV4;
 240         sk_setup_caps(sk, &rt->u.dst);
 241
 242         if (!tp->write_seq)
 243                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 244                                                            inet->inet_daddr,
 245                                                            inet->inet_sport,
 246                                                            usin->sin_port);
 247
 248         inet->inet_id = tp->write_seq ^ jiffies;
 249
 250         err = tcp_connect(sk);
 251         rt = NULL;
 252         if (err)
 253                 goto failure;
 254
 255         return 0;
 256
 257 failure:
 258         /*
 259          * This unhashes the socket and releases the local port,
 260          * if necessary.
 261          */
 262         tcp_set_state(sk, TCP_CLOSE);
 263         ip_rt_put(rt);
 264         sk->sk_route_caps = 0;
 265         inet->inet_dport = 0;
 266         return err;
 267 }
 268
 269 /*
 270  * This routine does path mtu discovery as defined in RFC1191.
 271  */
 272 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 273 {
 274         struct dst_entry *dst;
 275         struct inet_sock *inet = inet_sk(sk);
 276
 277         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 278          * send out by Linux are always <576bytes so they should go through
 279          * unfragmented).
 280          */
 281         if (sk->sk_state == TCP_LISTEN)
 282                 return;
 283
 284         /* We don't check in the destentry if pmtu discovery is forbidden
 285          * on this route. We just assume that no packet_to_big packets
 286          * are send back when pmtu discovery is not active.
 287          * There is a small race when the user changes this flag in the
 288          * route, but I think that's acceptable.
 289          */
 290         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 291                 return;
 292
 293         dst->ops->update_pmtu(dst, mtu);
 294
 295         /* Something is about to be wrong... Remember soft error
 296          * for the case, if this connection will not able to recover.
 297          */
 298         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 299                 sk->sk_err_soft = EMSGSIZE;
 300
 301         mtu = dst_mtu(dst);
 302
 303         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 304             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 305                 tcp_sync_mss(sk, mtu);
 306
 307                 /* Resend the TCP packet because it's
 308                  * clear that the old packet has been
 309                  * dropped. This is the new "fast" path mtu
 310                  * discovery.
 311                  */
 312                 tcp_simple_retransmit(sk);
 313         } /* else let the usual retransmit timer handle it */
 314 }
 315
 316 /*
 317  * This routine is called by the ICMP module when it gets some
 318  * sort of error condition.  If err < 0 then the socket should
 319  * be closed and the error returned to the user.  If err > 0
 320  * it's just the icmp type << 8 | icmp code.  After adjustment
 321  * header points to the first 8 bytes of the tcp header.  We need
 322  * to find the appropriate port.
 323  *
 324  * The locking strategy used here is very "optimistic". When
 325  * someone else accesses the socket the ICMP is just dropped
 326  * and for some paths there is no check at all.
 327  * A more general error queue to queue errors for later handling
 328  * is probably better.
 329  *
 330  */
 331
 332 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 333 {
 334         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
 335         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 336         struct inet_connection_sock *icsk;
 337         struct tcp_sock *tp;
 338         struct inet_sock *inet;
 339         const int type = icmp_hdr(icmp_skb)->type;
 340         const int code = icmp_hdr(icmp_skb)->code;
 341         struct sock *sk;
 342         struct sk_buff *skb;
 343         __u32 seq;
 344         __u32 remaining;
 345         int err;
 346         struct net *net = dev_net(icmp_skb->dev);
 347
 348         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 349                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 350                 return;
 351         }
 352
 353         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 354                         iph->saddr, th->source, inet_iif(icmp_skb));
 355         if (!sk) {
 356                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 357                 return;
 358         }
 359         if (sk->sk_state == TCP_TIME_WAIT) {
 360                 inet_twsk_put(inet_twsk(sk));
 361                 return;
 362         }
 363
 364         bh_lock_sock(sk);
 365         /* If too many ICMPs get dropped on busy
 366          * servers this needs to be solved differently.
 367          */
 368         if (sock_owned_by_user(sk))
 369                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 370
 371         if (sk->sk_state == TCP_CLOSE)
 372                 goto out;
 373
 374         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 375                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 376                 goto out;
 377         }
 378
 379         icsk = inet_csk(sk);
 380         tp = tcp_sk(sk);
 381         seq = ntohl(th->seq);
 382         if (sk->sk_state != TCP_LISTEN &&
 383             !between(seq, tp->snd_una, tp->snd_nxt)) {
 384                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 385                 goto out;
 386         }
 387
 388         switch (type) {
 389         case ICMP_SOURCE_QUENCH:
 390                 /* Just silently ignore these. */
 391                 goto out;
 392         case ICMP_PARAMETERPROB:
 393                 err = EPROTO;
 394                 break;
 395         case ICMP_DEST_UNREACH:
 396                 if (code > NR_ICMP_UNREACH)
 397                         goto out;
 398
 399                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 400                         if (!sock_owned_by_user(sk))
 401                                 do_pmtu_discovery(sk, iph, info);
 402                         goto out;
 403                 }
 404
 405                 err = icmp_err_convert[code].errno;
 406                 /* check if icmp_skb allows revert of backoff
 407                  * (see draft-zimmermann-tcp-lcd) */
 408                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 409                         break;
 410                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 411                     !icsk->icsk_backoff)
 412                         break;
 413
 414                 if (sock_owned_by_user(sk))
 415                         break;
 416
 417                 icsk->icsk_backoff--;
 418                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 419                                          icsk->icsk_backoff;
 420                 tcp_bound_rto(sk);
 421
 422                 skb = tcp_write_queue_head(sk);
 423                 BUG_ON(!skb);
 424
 425                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 426                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 427
 428                 if (remaining) {
 429                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 430                                                   remaining, TCP_RTO_MAX);
 431                 } else {
 432                         /* RTO revert clocked out retransmission.
 433                          * Will retransmit now */
 434                         tcp_retransmit_timer(sk);
 435                 }
 436
 437                 break;
 438         case ICMP_TIME_EXCEEDED:
 439                 err = EHOSTUNREACH;
 440                 break;
 441         default:
 442                 goto out;
 443         }
 444
 445         switch (sk->sk_state) {
 446                 struct request_sock *req, **prev;
 447         case TCP_LISTEN:
 448                 if (sock_owned_by_user(sk))
 449                         goto out;
 450
 451                 req = inet_csk_search_req(sk, &prev, th->dest,
 452                                           iph->daddr, iph->saddr);
 453                 if (!req)
 454                         goto out;
 455
 456                 /* ICMPs are not backlogged, hence we cannot get
 457                    an established socket here.
 458                  */
 459                 WARN_ON(req->sk);
 460
 461                 if (seq != tcp_rsk(req)->snt_isn) {
 462                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 463                         goto out;
 464                 }
 465
 466                 /*
 467                  * Still in SYN_RECV, just remove it silently.
 468                  * There is no good way to pass the error to the newly
 469                  * created socket, and POSIX does not want network
 470                  * errors returned from accept().
 471                  */
 472                 inet_csk_reqsk_queue_drop(sk, req, prev);
 473                 goto out;
 474
 475         case TCP_SYN_SENT:
 476         case TCP_SYN_RECV:  /* Cannot happen.
 477                                It can f.e. if SYNs crossed.
 478                              */
 479                 if (!sock_owned_by_user(sk)) {
 480                         sk->sk_err = err;
 481
 482                         sk->sk_error_report(sk);
 483
 484                         tcp_done(sk);
 485                 } else {
 486                         sk->sk_err_soft = err;
 487                 }
 488                 goto out;
 489         }
 490
 491         /* If we've already connected we will keep trying
 492          * until we time out, or the user gives up.
 493          *
 494          * rfc1122 4.2.3.9 allows to consider as hard errors
 495          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 496          * but it is obsoleted by pmtu discovery).
 497          *
 498          * Note, that in modern internet, where routing is unreliable
 499          * and in each dark corner broken firewalls sit, sending random
 500          * errors ordered by their masters even this two messages finally lose
 501          * their original sense (even Linux sends invalid PORT_UNREACHs)
 502          *
 503          * Now we are in compliance with RFCs.
 504          *                                                      --ANK (980905)
 505          */
 506
 507         inet = inet_sk(sk);
 508         if (!sock_owned_by_user(sk) && inet->recverr) {
 509                 sk->sk_err = err;
 510                 sk->sk_error_report(sk);
 511         } else  { /* Only an error on timeout */
 512                 sk->sk_err_soft = err;
 513         }
 514
 515 out:
 516         bh_unlock_sock(sk);
 517         sock_put(sk);
 518 }
 519
 520 static void __tcp_v4_send_check(struct sk_buff *skb,
 521                                 __be32 saddr, __be32 daddr)
 522 {
 523         struct tcphdr *th = tcp_hdr(skb);
 524
 525         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 526                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 527                 skb->csum_start = skb_transport_header(skb) - skb->head;
 528                 skb->csum_offset = offsetof(struct tcphdr, check);
 529         } else {
 530                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 531                                          csum_partial(th,
 532                                                       th->doff << 2,
 533                                                       skb->csum));
 534         }
 535 }
 536
 537 /* This routine computes an IPv4 TCP checksum. */
 538 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 539 {
 540         struct inet_sock *inet = inet_sk(sk);
 541
 542         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 543 }
 544
 545 int tcp_v4_gso_send_check(struct sk_buff *skb)
 546 {
 547         const struct iphdr *iph;
 548         struct tcphdr *th;
 549
 550         if (!pskb_may_pull(skb, sizeof(*th)))
 551                 return -EINVAL;
 552
 553         iph = ip_hdr(skb);
 554         th = tcp_hdr(skb);
 555
 556         th->check = 0;
 557         skb->ip_summed = CHECKSUM_PARTIAL;
 558         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 559         return 0;
 560 }
 561
 562 /*
 563  *      This routine will send an RST to the other tcp.
 564  *
 565  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 566  *                    for reset.
 567  *      Answer: if a packet caused RST, it is not for a socket
 568  *              existing in our system, if it is matched to a socket,
 569  *              it is just duplicate segment or bug in other side's TCP.
 570  *              So that we build reply only basing on parameters
 571  *              arrived with segment.
 572  *      Exception: precedence violation. We do not implement it in any case.
 573  */
 574
 575 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 576 {
 577         struct tcphdr *th = tcp_hdr(skb);
 578         struct {
 579                 struct tcphdr th;
 580 #ifdef CONFIG_TCP_MD5SIG
 581                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 582 #endif
 583         } rep;
 584         struct ip_reply_arg arg;
 585 #ifdef CONFIG_TCP_MD5SIG
 586         struct tcp_md5sig_key *key;
 587 #endif
 588         struct net *net;
 589
 590         /* Never send a reset in response to a reset. */
 591         if (th->rst)
 592                 return;
 593
 594         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 595                 return;
 596
 597         /* Swap the send and the receive. */
 598         memset(&rep, 0, sizeof(rep));
 599         rep.th.dest   = th->source;
 600         rep.th.source = th->dest;
 601         rep.th.doff   = sizeof(struct tcphdr) / 4;
 602         rep.th.rst    = 1;
 603
 604         if (th->ack) {
 605                 rep.th.seq = th->ack_seq;
 606         } else {
 607                 rep.th.ack = 1;
 608                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 609                                        skb->len - (th->doff << 2));
 610         }
 611
 612         memset(&arg, 0, sizeof(arg));
 613         arg.iov[0].iov_base = (unsigned char *)&rep;
 614         arg.iov[0].iov_len  = sizeof(rep.th);
 615
 616 #ifdef CONFIG_TCP_MD5SIG
 617         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 618         if (key) {
 619                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 620                                    (TCPOPT_NOP << 16) |
 621                                    (TCPOPT_MD5SIG << 8) |
 622                                    TCPOLEN_MD5SIG);
 623                 /* Update length and the length the header thinks exists */
 624                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 625                 rep.th.doff = arg.iov[0].iov_len / 4;
 626
 627                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 628                                      key, ip_hdr(skb)->saddr,
 629                                      ip_hdr(skb)->daddr, &rep.th);
 630         }
 631 #endif
 632         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 633                                       ip_hdr(skb)->saddr, /* XXX */
 634                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 635         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 636         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 637
 638         net = dev_net(skb_dst(skb)->dev);
 639         ip_send_reply(net->ipv4.tcp_sock, skb,
 640                       &arg, arg.iov[0].iov_len);
 641
 642         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 643         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 644 }
 645
 646 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 647    outside socket context is ugly, certainly. What can I do?
 648  */
 649
 650 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 651                             u32 win, u32 ts, int oif,
 652                             struct tcp_md5sig_key *key,
 653                             int reply_flags)
 654 {
 655         struct tcphdr *th = tcp_hdr(skb);
 656         struct {
 657                 struct tcphdr th;
 658                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 659 #ifdef CONFIG_TCP_MD5SIG
 660                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 661 #endif
 662                         ];
 663         } rep;
 664         struct ip_reply_arg arg;
 665         struct net *net = dev_net(skb_dst(skb)->dev);
 666
 667         memset(&rep.th, 0, sizeof(struct tcphdr));
 668         memset(&arg, 0, sizeof(arg));
 669
 670         arg.iov[0].iov_base = (unsigned char *)&rep;
 671         arg.iov[0].iov_len  = sizeof(rep.th);
 672         if (ts) {
 673                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 674                                    (TCPOPT_TIMESTAMP << 8) |
 675                                    TCPOLEN_TIMESTAMP);
 676                 rep.opt[1] = htonl(tcp_time_stamp);
 677                 rep.opt[2] = htonl(ts);
 678                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 679         }
 680
 681         /* Swap the send and the receive. */
 682         rep.th.dest    = th->source;
 683         rep.th.source  = th->dest;
 684         rep.th.doff    = arg.iov[0].iov_len / 4;
 685         rep.th.seq     = htonl(seq);
 686         rep.th.ack_seq = htonl(ack);
 687         rep.th.ack     = 1;
 688         rep.th.window  = htons(win);
 689
 690 #ifdef CONFIG_TCP_MD5SIG
 691         if (key) {
 692                 int offset = (ts) ? 3 : 0;
 693
 694                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 695                                           (TCPOPT_NOP << 16) |
 696                                           (TCPOPT_MD5SIG << 8) |
 697                                           TCPOLEN_MD5SIG);
 698                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 699                 rep.th.doff = arg.iov[0].iov_len/4;
 700
 701                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 702                                     key, ip_hdr(skb)->saddr,
 703                                     ip_hdr(skb)->daddr, &rep.th);
 704         }
 705 #endif
 706         arg.flags = reply_flags;
 707         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 708                                       ip_hdr(skb)->saddr, /* XXX */
 709                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 710         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 711         if (oif)
 712                 arg.bound_dev_if = oif;
 713
 714         ip_send_reply(net->ipv4.tcp_sock, skb,
 715                       &arg, arg.iov[0].iov_len);
 716
 717         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 718 }
 719
 720 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 721 {
 722         struct inet_timewait_sock *tw = inet_twsk(sk);
 723         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 724
 725         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 726                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 727                         tcptw->tw_ts_recent,
 728                         tw->tw_bound_dev_if,
 729                         tcp_twsk_md5_key(tcptw),
 730                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 731                         );
 732
 733         inet_twsk_put(tw);
 734 }
 735
 736 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 737                                   struct request_sock *req)
 738 {
 739         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 740                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 741                         req->ts_recent,
 742                         0,
 743                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 744                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 745 }
 746
 747 /*
 748  *      Send a SYN-ACK after having received a SYN.
 749  *      This still operates on a request_sock only, not on a big
 750  *      socket.
 751  */
 752 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 753                               struct request_sock *req,
 754                               struct request_values *rvp)
 755 {
 756         const struct inet_request_sock *ireq = inet_rsk(req);
 757         int err = -1;
 758         struct sk_buff * skb;
 759
 760         /* First, grab a route. */
 761         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 762                 return -1;
 763
 764         skb = tcp_make_synack(sk, dst, req, rvp);
 765
 766         if (skb) {
 767                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 768
 769                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 770                                             ireq->rmt_addr,
 771                                             ireq->opt);
 772                 err = net_xmit_eval(err);
 773         }
 774
 775         dst_release(dst);
 776         return err;
 777 }
 778
 779 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 780                               struct request_values *rvp)
 781 {
 782         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 783         return tcp_v4_send_synack(sk, NULL, req, rvp);
 784 }
 785
 786 /*
 787  *      IPv4 request_sock destructor.
 788  */
 789 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 790 {
 791         kfree(inet_rsk(req)->opt);
 792 }
 793
 794 #ifdef CONFIG_SYN_COOKIES
 795 static void syn_flood_warning(struct sk_buff *skb)
 796 {
 797         static unsigned long warntime;
 798
 799         if (time_after(jiffies, (warntime + HZ * 60))) {
 800                 warntime = jiffies;
 801                 printk(KERN_INFO
 802                        "possible SYN flooding on port %d. Sending cookies.\n",
 803                        ntohs(tcp_hdr(skb)->dest));
 804         }
 805 }
 806 #endif
 807
 808 /*
 809  * Save and compile IPv4 options into the request_sock if needed.
 810  */
 811 static struct ip_options *tcp_v4_save_options(struct sock *sk,
 812                                               struct sk_buff *skb)
 813 {
 814         struct ip_options *opt = &(IPCB(skb)->opt);
 815         struct ip_options *dopt = NULL;
 816
 817         if (opt && opt->optlen) {
 818                 int opt_size = optlength(opt);
 819                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 820                 if (dopt) {
 821                         if (ip_options_echo(dopt, skb)) {
 822                                 kfree(dopt);
 823                                 dopt = NULL;
 824                         }
 825                 }
 826         }
 827         return dopt;
 828 }
 829
 830 #ifdef CONFIG_TCP_MD5SIG
 831 /*
 832  * RFC2385 MD5 checksumming requires a mapping of
 833  * IP address->MD5 Key.
 834  * We need to maintain these in the sk structure.
 835  */
 836
 837 /* Find the Key structure for an address.  */
 838 static struct tcp_md5sig_key *
 839                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 840 {
 841         struct tcp_sock *tp = tcp_sk(sk);
 842         int i;
 843
 844         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 845                 return NULL;
 846         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 847                 if (tp->md5sig_info->keys4[i].addr == addr)
 848                         return &tp->md5sig_info->keys4[i].base;
 849         }
 850         return NULL;
 851 }
 852
 853 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 854                                          struct sock *addr_sk)
 855 {
 856         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 857 }
 858
 859 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 860
 861 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 862                                                       struct request_sock *req)
 863 {
 864         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 865 }
 866
 867 /* This can be called on a newly created socket, from other files */
 868 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 869                       u8 *newkey, u8 newkeylen)
 870 {
 871         /* Add Key to the list */
 872         struct tcp_md5sig_key *key;
 873         struct tcp_sock *tp = tcp_sk(sk);
 874         struct tcp4_md5sig_key *keys;
 875
 876         key = tcp_v4_md5_do_lookup(sk, addr);
 877         if (key) {
 878                 /* Pre-existing entry - just update that one. */
 879                 kfree(key->key);
 880                 key->key = newkey;
 881                 key->keylen = newkeylen;
 882         } else {
 883                 struct tcp_md5sig_info *md5sig;
 884
 885                 if (!tp->md5sig_info) {
 886                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 887                                                   GFP_ATOMIC);
 888                         if (!tp->md5sig_info) {
 889                                 kfree(newkey);
 890                                 return -ENOMEM;
 891                         }
 892                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 893                 }
 894                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 895                         kfree(newkey);
 896                         return -ENOMEM;
 897                 }
 898                 md5sig = tp->md5sig_info;
 899
 900                 if (md5sig->alloced4 == md5sig->entries4) {
 901                         keys = kmalloc((sizeof(*keys) *
 902                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 903                         if (!keys) {
 904                                 kfree(newkey);
 905                                 tcp_free_md5sig_pool();
 906                                 return -ENOMEM;
 907                         }
 908
 909                         if (md5sig->entries4)
 910                                 memcpy(keys, md5sig->keys4,
 911                                        sizeof(*keys) * md5sig->entries4);
 912
 913                         /* Free old key list, and reference new one */
 914                         kfree(md5sig->keys4);
 915                         md5sig->keys4 = keys;
 916                         md5sig->alloced4++;
 917                 }
 918                 md5sig->entries4++;
 919                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 920                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 921                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 922         }
 923         return 0;
 924 }
 925
 926 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 927
 928 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 929                                u8 *newkey, u8 newkeylen)
 930 {
 931         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 932                                  newkey, newkeylen);
 933 }
 934
 935 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 936 {
 937         struct tcp_sock *tp = tcp_sk(sk);
 938         int i;
 939
 940         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 941                 if (tp->md5sig_info->keys4[i].addr == addr) {
 942                         /* Free the key */
 943                         kfree(tp->md5sig_info->keys4[i].base.key);
 944                         tp->md5sig_info->entries4--;
 945
 946                         if (tp->md5sig_info->entries4 == 0) {
 947                                 kfree(tp->md5sig_info->keys4);
 948                                 tp->md5sig_info->keys4 = NULL;
 949                                 tp->md5sig_info->alloced4 = 0;
 950                         } else if (tp->md5sig_info->entries4 != i) {
 951                                 /* Need to do some manipulation */
 952                                 memmove(&tp->md5sig_info->keys4[i],
 953                                         &tp->md5sig_info->keys4[i+1],
 954                                         (tp->md5sig_info->entries4 - i) *
 955                                          sizeof(struct tcp4_md5sig_key));
 956                         }
 957                         tcp_free_md5sig_pool();
 958                         return 0;
 959                 }
 960         }
 961         return -ENOENT;
 962 }
 963
 964 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 965
 966 static void tcp_v4_clear_md5_list(struct sock *sk)
 967 {
 968         struct tcp_sock *tp = tcp_sk(sk);
 969
 970         /* Free each key, then the set of key keys,
 971          * the crypto element, and then decrement our
 972          * hold on the last resort crypto.
 973          */
 974         if (tp->md5sig_info->entries4) {
 975                 int i;
 976                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 977                         kfree(tp->md5sig_info->keys4[i].base.key);
 978                 tp->md5sig_info->entries4 = 0;
 979                 tcp_free_md5sig_pool();
 980         }
 981         if (tp->md5sig_info->keys4) {
 982                 kfree(tp->md5sig_info->keys4);
 983                 tp->md5sig_info->keys4 = NULL;
 984                 tp->md5sig_info->alloced4  = 0;
 985         }
 986 }
 987
 988 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 989                                  int optlen)
 990 {
 991         struct tcp_md5sig cmd;
 992         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 993         u8 *newkey;
 994
 995         if (optlen < sizeof(cmd))
 996                 return -EINVAL;
 997
 998         if (copy_from_user(&cmd, optval, sizeof(cmd)))
 999                 return -EFAULT;
1000
1001         if (sin->sin_family != AF_INET)
1002                 return -EINVAL;
1003
1004         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1005                 if (!tcp_sk(sk)->md5sig_info)
1006                         return -ENOENT;
1007                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1008         }
1009
1010         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1011                 return -EINVAL;
1012
1013         if (!tcp_sk(sk)->md5sig_info) {
1014                 struct tcp_sock *tp = tcp_sk(sk);
1015                 struct tcp_md5sig_info *p;
1016
1017                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1018                 if (!p)
1019                         return -EINVAL;
1020
1021                 tp->md5sig_info = p;
1022                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1023         }
1024
1025         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1026         if (!newkey)
1027                 return -ENOMEM;
1028         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1029                                  newkey, cmd.tcpm_keylen);
1030 }
1031
1032 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1033                                         __be32 daddr, __be32 saddr, int nbytes)
1034 {
1035         struct tcp4_pseudohdr *bp;
1036         struct scatterlist sg;
1037
1038         bp = &hp->md5_blk.ip4;
1039
1040         /*
1041          * 1. the TCP pseudo-header (in the order: source IP address,
1042          * destination IP address, zero-padded protocol number, and
1043          * segment length)
1044          */
1045         bp->saddr = saddr;
1046         bp->daddr = daddr;
1047         bp->pad = 0;
1048         bp->protocol = IPPROTO_TCP;
1049         bp->len = cpu_to_be16(nbytes);
1050
1051         sg_init_one(&sg, bp, sizeof(*bp));
1052         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1053 }
1054
1055 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1056                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1057 {
1058         struct tcp_md5sig_pool *hp;
1059         struct hash_desc *desc;
1060
1061         hp = tcp_get_md5sig_pool();
1062         if (!hp)
1063                 goto clear_hash_noput;
1064         desc = &hp->md5_desc;
1065
1066         if (crypto_hash_init(desc))
1067                 goto clear_hash;
1068         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1069                 goto clear_hash;
1070         if (tcp_md5_hash_header(hp, th))
1071                 goto clear_hash;
1072         if (tcp_md5_hash_key(hp, key))
1073                 goto clear_hash;
1074         if (crypto_hash_final(desc, md5_hash))
1075                 goto clear_hash;
1076
1077         tcp_put_md5sig_pool();
1078         return 0;
1079
1080 clear_hash:
1081         tcp_put_md5sig_pool();
1082 clear_hash_noput:
1083         memset(md5_hash, 0, 16);
1084         return 1;
1085 }
1086
1087 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1088                         struct sock *sk, struct request_sock *req,
1089                         struct sk_buff *skb)
1090 {
1091         struct tcp_md5sig_pool *hp;
1092         struct hash_desc *desc;
1093         struct tcphdr *th = tcp_hdr(skb);
1094         __be32 saddr, daddr;
1095
1096         if (sk) {
1097                 saddr = inet_sk(sk)->inet_saddr;
1098                 daddr = inet_sk(sk)->inet_daddr;
1099         } else if (req) {
1100                 saddr = inet_rsk(req)->loc_addr;
1101                 daddr = inet_rsk(req)->rmt_addr;
1102         } else {
1103                 const struct iphdr *iph = ip_hdr(skb);
1104                 saddr = iph->saddr;
1105                 daddr = iph->daddr;
1106         }
1107
1108         hp = tcp_get_md5sig_pool();
1109         if (!hp)
1110                 goto clear_hash_noput;
1111         desc = &hp->md5_desc;
1112
1113         if (crypto_hash_init(desc))
1114                 goto clear_hash;
1115
1116         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1117                 goto clear_hash;
1118         if (tcp_md5_hash_header(hp, th))
1119                 goto clear_hash;
1120         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1121                 goto clear_hash;
1122         if (tcp_md5_hash_key(hp, key))
1123                 goto clear_hash;
1124         if (crypto_hash_final(desc, md5_hash))
1125                 goto clear_hash;
1126
1127         tcp_put_md5sig_pool();
1128         return 0;
1129
1130 clear_hash:
1131         tcp_put_md5sig_pool();
1132 clear_hash_noput:
1133         memset(md5_hash, 0, 16);
1134         return 1;
1135 }
1136
1137 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1138
1139 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1140 {
1141         /*
1142          * This gets called for each TCP segment that arrives
1143          * so we want to be efficient.
1144          * We have 3 drop cases:
1145          * o No MD5 hash and one expected.
1146          * o MD5 hash and we're not expecting one.
1147          * o MD5 hash and its wrong.
1148          */
1149         __u8 *hash_location = NULL;
1150         struct tcp_md5sig_key *hash_expected;
1151         const struct iphdr *iph = ip_hdr(skb);
1152         struct tcphdr *th = tcp_hdr(skb);
1153         int genhash;
1154         unsigned char newhash[16];
1155
1156         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1157         hash_location = tcp_parse_md5sig_option(th);
1158
1159         /* We've parsed the options - do we have a hash? */
1160         if (!hash_expected && !hash_location)
1161                 return 0;
1162
1163         if (hash_expected && !hash_location) {
1164                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1165                 return 1;
1166         }
1167
1168         if (!hash_expected && hash_location) {
1169                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1170                 return 1;
1171         }
1172
1173         /* Okay, so this is hash_expected and hash_location -
1174          * so we need to calculate the checksum.
1175          */
1176         genhash = tcp_v4_md5_hash_skb(newhash,
1177                                       hash_expected,
1178                                       NULL, NULL, skb);
1179
1180         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1181                 if (net_ratelimit()) {
1182                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1183                                &iph->saddr, ntohs(th->source),
1184                                &iph->daddr, ntohs(th->dest),
1185                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1186                 }
1187                 return 1;
1188         }
1189         return 0;
1190 }
1191
1192 #endif
1193
1194 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1195         .family         =       PF_INET,
1196         .obj_size       =       sizeof(struct tcp_request_sock),
1197         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1198         .send_ack       =       tcp_v4_reqsk_send_ack,
1199         .destructor     =       tcp_v4_reqsk_destructor,
1200         .send_reset     =       tcp_v4_send_reset,
1201         .syn_ack_timeout =      tcp_syn_ack_timeout,
1202 };
1203
1204 #ifdef CONFIG_TCP_MD5SIG
1205 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1206         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1207         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1208 };
1209 #endif
1210
1211 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1212         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1213         .twsk_unique    = tcp_twsk_unique,
1214         .twsk_destructor= tcp_twsk_destructor,
1215 };
1216
1217 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1218 {
1219         struct tcp_extend_values tmp_ext;
1220         struct tcp_options_received tmp_opt;
1221         u8 *hash_location;
1222         struct request_sock *req;
1223         struct inet_request_sock *ireq;
1224         struct tcp_sock *tp = tcp_sk(sk);
1225         struct dst_entry *dst = NULL;
1226         __be32 saddr = ip_hdr(skb)->saddr;
1227         __be32 daddr = ip_hdr(skb)->daddr;
1228         __u32 isn = TCP_SKB_CB(skb)->when;
1229 #ifdef CONFIG_SYN_COOKIES
1230         int want_cookie = 0;
1231 #else
1232 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1233 #endif
1234
1235         /* Never answer to SYNs send to broadcast or multicast */
1236         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1237                 goto drop;
1238
1239         /* TW buckets are converted to open requests without
1240          * limitations, they conserve resources and peer is
1241          * evidently real one.
1242          */
1243         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1244 #ifdef CONFIG_SYN_COOKIES
1245                 if (sysctl_tcp_syncookies) {
1246                         want_cookie = 1;
1247                 } else
1248 #endif
1249                 goto drop;
1250         }
1251
1252         /* Accept backlog is full. If we have already queued enough
1253          * of warm entries in syn queue, drop request. It is better than
1254          * clogging syn queue with openreqs with exponentially increasing
1255          * timeout.
1256          */
1257         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1258                 goto drop;
1259
1260         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1261         if (!req)
1262                 goto drop;
1263
1264 #ifdef CONFIG_TCP_MD5SIG
1265         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1266 #endif
1267
1268         tcp_clear_options(&tmp_opt);
1269         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1270         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1271         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1272
1273         if (tmp_opt.cookie_plus > 0 &&
1274             tmp_opt.saw_tstamp &&
1275             !tp->rx_opt.cookie_out_never &&
1276             (sysctl_tcp_cookie_size > 0 ||
1277              (tp->cookie_values != NULL &&
1278               tp->cookie_values->cookie_desired > 0))) {
1279                 u8 *c;
1280                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1281                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1282
1283                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1284                         goto drop_and_release;
1285
1286                 /* Secret recipe starts with IP addresses */
1287                 *mess++ ^= (__force u32)daddr;
1288                 *mess++ ^= (__force u32)saddr;
1289
1290                 /* plus variable length Initiator Cookie */
1291                 c = (u8 *)mess;
1292                 while (l-- > 0)
1293                         *c++ ^= *hash_location++;
1294
1295 #ifdef CONFIG_SYN_COOKIES
1296                 want_cookie = 0;        /* not our kind of cookie */
1297 #endif
1298                 tmp_ext.cookie_out_never = 0; /* false */
1299                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1300         } else if (!tp->rx_opt.cookie_in_always) {
1301                 /* redundant indications, but ensure initialization. */
1302                 tmp_ext.cookie_out_never = 1; /* true */
1303                 tmp_ext.cookie_plus = 0;
1304         } else {
1305                 goto drop_and_release;
1306         }
1307         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1308
1309         if (want_cookie && !tmp_opt.saw_tstamp)
1310                 tcp_clear_options(&tmp_opt);
1311
1312         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1313         tcp_openreq_init(req, &tmp_opt, skb);
1314
1315         ireq = inet_rsk(req);
1316         ireq->loc_addr = daddr;
1317         ireq->rmt_addr = saddr;
1318         ireq->no_srccheck = inet_sk(sk)->transparent;
1319         ireq->opt = tcp_v4_save_options(sk, skb);
1320
1321         if (security_inet_conn_request(sk, skb, req))
1322                 goto drop_and_free;
1323
1324         if (!want_cookie)
1325                 TCP_ECN_create_request(req, tcp_hdr(skb));
1326
1327         if (want_cookie) {
1328 #ifdef CONFIG_SYN_COOKIES
1329                 syn_flood_warning(skb);
1330                 req->cookie_ts = tmp_opt.tstamp_ok;
1331 #endif
1332                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1333         } else if (!isn) {
1334                 struct inet_peer *peer = NULL;
1335
1336                 /* VJ's idea. We save last timestamp seen
1337                  * from the destination in peer table, when entering
1338                  * state TIME-WAIT, and check against it before
1339                  * accepting new connection request.
1340                  *
1341                  * If "isn" is not zero, this request hit alive
1342                  * timewait bucket, so that all the necessary checks
1343                  * are made in the function processing timewait state.
1344                  */
1345                 if (tmp_opt.saw_tstamp &&
1346                     tcp_death_row.sysctl_tw_recycle &&
1347                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1348                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1349                     peer->v4daddr == saddr) {
1350                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1351                             (s32)(peer->tcp_ts - req->ts_recent) >
1352                                                         TCP_PAWS_WINDOW) {
1353                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1354                                 goto drop_and_release;
1355                         }
1356                 }
1357                 /* Kill the following clause, if you dislike this way. */
1358                 else if (!sysctl_tcp_syncookies &&
1359                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1360                           (sysctl_max_syn_backlog >> 2)) &&
1361                          (!peer || !peer->tcp_ts_stamp) &&
1362                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1363                         /* Without syncookies last quarter of
1364                          * backlog is filled with destinations,
1365                          * proven to be alive.
1366                          * It means that we continue to communicate
1367                          * to destinations, already remembered
1368                          * to the moment of synflood.
1369                          */
1370                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1371                                        &saddr, ntohs(tcp_hdr(skb)->source));
1372                         goto drop_and_release;
1373                 }
1374
1375                 isn = tcp_v4_init_sequence(skb);
1376         }
1377         tcp_rsk(req)->snt_isn = isn;
1378
1379         if (tcp_v4_send_synack(sk, dst, req,
1380                                (struct request_values *)&tmp_ext) ||
1381             want_cookie)
1382                 goto drop_and_free;
1383
1384         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1385         return 0;
1386
1387 drop_and_release:
1388         dst_release(dst);
1389 drop_and_free:
1390         reqsk_free(req);
1391 drop:
1392         return 0;
1393 }
1394
1395
1396 /*
1397  * The three way handshake has completed - we got a valid synack -
1398  * now create the new socket.
1399  */
1400 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1401                                   struct request_sock *req,
1402                                   struct dst_entry *dst)
1403 {
1404         struct inet_request_sock *ireq;
1405         struct inet_sock *newinet;
1406         struct tcp_sock *newtp;
1407         struct sock *newsk;
1408 #ifdef CONFIG_TCP_MD5SIG
1409         struct tcp_md5sig_key *key;
1410 #endif
1411
1412         if (sk_acceptq_is_full(sk))
1413                 goto exit_overflow;
1414
1415         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1416                 goto exit;
1417
1418         newsk = tcp_create_openreq_child(sk, req, skb);
1419         if (!newsk)
1420                 goto exit;
1421
1422         newsk->sk_gso_type = SKB_GSO_TCPV4;
1423         sk_setup_caps(newsk, dst);
1424
1425         newtp                 = tcp_sk(newsk);
1426         newinet               = inet_sk(newsk);
1427         ireq                  = inet_rsk(req);
1428         newinet->inet_daddr   = ireq->rmt_addr;
1429         newinet->inet_rcv_saddr = ireq->loc_addr;
1430         newinet->inet_saddr           = ireq->loc_addr;
1431         newinet->opt          = ireq->opt;
1432         ireq->opt             = NULL;
1433         newinet->mc_index     = inet_iif(skb);
1434         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1435         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1436         if (newinet->opt)
1437                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1438         newinet->inet_id = newtp->write_seq ^ jiffies;
1439
1440         tcp_mtup_init(newsk);
1441         tcp_sync_mss(newsk, dst_mtu(dst));
1442         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1443         if (tcp_sk(sk)->rx_opt.user_mss &&
1444             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1445                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1446
1447         tcp_initialize_rcv_mss(newsk);
1448
1449 #ifdef CONFIG_TCP_MD5SIG
1450         /* Copy over the MD5 key from the original socket */
1451         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1452         if (key != NULL) {
1453                 /*
1454                  * We're using one, so create a matching key
1455                  * on the newsk structure. If we fail to get
1456                  * memory, then we end up not copying the key
1457                  * across. Shucks.
1458                  */
1459                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1460                 if (newkey != NULL)
1461                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1462                                           newkey, key->keylen);
1463                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1464         }
1465 #endif
1466
1467         __inet_hash_nolisten(newsk, NULL);
1468         __inet_inherit_port(sk, newsk);
1469
1470         return newsk;
1471
1472 exit_overflow:
1473         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1474 exit:
1475         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1476         dst_release(dst);
1477         return NULL;
1478 }
1479
1480 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1481 {
1482         struct tcphdr *th = tcp_hdr(skb);
1483         const struct iphdr *iph = ip_hdr(skb);
1484         struct sock *nsk;
1485         struct request_sock **prev;
1486         /* Find possible connection requests. */
1487         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1488                                                        iph->saddr, iph->daddr);
1489         if (req)
1490                 return tcp_check_req(sk, skb, req, prev);
1491
1492         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1493                         th->source, iph->daddr, th->dest, inet_iif(skb));
1494
1495         if (nsk) {
1496                 if (nsk->sk_state != TCP_TIME_WAIT) {
1497                         bh_lock_sock(nsk);
1498                         return nsk;
1499                 }
1500                 inet_twsk_put(inet_twsk(nsk));
1501                 return NULL;
1502         }
1503
1504 #ifdef CONFIG_SYN_COOKIES
1505         if (!th->rst && !th->syn && th->ack)
1506                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1507 #endif
1508         return sk;
1509 }
1510
1511 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1512 {
1513         const struct iphdr *iph = ip_hdr(skb);
1514
1515         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1516                 if (!tcp_v4_check(skb->len, iph->saddr,
1517                                   iph->daddr, skb->csum)) {
1518                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1519                         return 0;
1520                 }
1521         }
1522
1523         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1524                                        skb->len, IPPROTO_TCP, 0);
1525
1526         if (skb->len <= 76) {
1527                 return __skb_checksum_complete(skb);
1528         }
1529         return 0;
1530 }
1531
1532
1533 /* The socket must have it's spinlock held when we get
1534  * here.
1535  *
1536  * We have a potential double-lock case here, so even when
1537  * doing backlog processing we use the BH locking scheme.
1538  * This is because we cannot sleep with the original spinlock
1539  * held.
1540  */
1541 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1542 {
1543         struct sock *rsk;
1544 #ifdef CONFIG_TCP_MD5SIG
1545         /*
1546          * We really want to reject the packet as early as possible
1547          * if:
1548          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1549          *  o There is an MD5 option and we're not expecting one
1550          */
1551         if (tcp_v4_inbound_md5_hash(sk, skb))
1552                 goto discard;
1553 #endif
1554
1555         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1556                 sock_rps_save_rxhash(sk, skb->rxhash);
1557                 TCP_CHECK_TIMER(sk);
1558                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1559                         rsk = sk;
1560                         goto reset;
1561                 }
1562                 TCP_CHECK_TIMER(sk);
1563                 return 0;
1564         }
1565
1566         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1567                 goto csum_err;
1568
1569         if (sk->sk_state == TCP_LISTEN) {
1570                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1571                 if (!nsk)
1572                         goto discard;
1573
1574                 if (nsk != sk) {
1575                         if (tcp_child_process(sk, nsk, skb)) {
1576                                 rsk = nsk;
1577                                 goto reset;
1578                         }
1579                         return 0;
1580                 }
1581         } else
1582                 sock_rps_save_rxhash(sk, skb->rxhash);
1583
1584
1585         TCP_CHECK_TIMER(sk);
1586         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1587                 rsk = sk;
1588                 goto reset;
1589         }
1590         TCP_CHECK_TIMER(sk);
1591         return 0;
1592
1593 reset:
1594         tcp_v4_send_reset(rsk, skb);
1595 discard:
1596         kfree_skb(skb);
1597         /* Be careful here. If this function gets more complicated and
1598          * gcc suffers from register pressure on the x86, sk (in %ebx)
1599          * might be destroyed here. This current version compiles correctly,
1600          * but you have been warned.
1601          */
1602         return 0;
1603
1604 csum_err:
1605         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1606         goto discard;
1607 }
1608
1609 /*
1610  *      From tcp_input.c
1611  */
1612
1613 int tcp_v4_rcv(struct sk_buff *skb)
1614 {
1615         const struct iphdr *iph;
1616         struct tcphdr *th;
1617         struct sock *sk;
1618         int ret;
1619         struct net *net = dev_net(skb->dev);
1620
1621         if (skb->pkt_type != PACKET_HOST)
1622                 goto discard_it;
1623
1624         /* Count it even if it's bad */
1625         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1626
1627         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1628                 goto discard_it;
1629
1630         th = tcp_hdr(skb);
1631
1632         if (th->doff < sizeof(struct tcphdr) / 4)
1633                 goto bad_packet;
1634         if (!pskb_may_pull(skb, th->doff * 4))
1635                 goto discard_it;
1636
1637         /* An explanation is required here, I think.
1638          * Packet length and doff are validated by header prediction,
1639          * provided case of th->doff==0 is eliminated.
1640          * So, we defer the checks. */
1641         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1642                 goto bad_packet;
1643
1644         th = tcp_hdr(skb);
1645         iph = ip_hdr(skb);
1646         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1647         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1648                                     skb->len - th->doff * 4);
1649         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1650         TCP_SKB_CB(skb)->when    = 0;
1651         TCP_SKB_CB(skb)->flags   = iph->tos;
1652         TCP_SKB_CB(skb)->sacked  = 0;
1653
1654         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1655         if (!sk)
1656                 goto no_tcp_socket;
1657
1658 process:
1659         if (sk->sk_state == TCP_TIME_WAIT)
1660                 goto do_time_wait;
1661
1662         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1663                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1664                 goto discard_and_relse;
1665         }
1666
1667         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1668                 goto discard_and_relse;
1669         nf_reset(skb);
1670
1671         if (sk_filter(sk, skb))
1672                 goto discard_and_relse;
1673
1674         skb->dev = NULL;
1675
1676         bh_lock_sock_nested(sk);
1677         ret = 0;
1678         if (!sock_owned_by_user(sk)) {
1679 #ifdef CONFIG_NET_DMA
1680                 struct tcp_sock *tp = tcp_sk(sk);
1681                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1682                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1683                 if (tp->ucopy.dma_chan)
1684                         ret = tcp_v4_do_rcv(sk, skb);
1685                 else
1686 #endif
1687                 {
1688                         if (!tcp_prequeue(sk, skb))
1689                                 ret = tcp_v4_do_rcv(sk, skb);
1690                 }
1691         } else if (unlikely(sk_add_backlog(sk, skb))) {
1692                 bh_unlock_sock(sk);
1693                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1694                 goto discard_and_relse;
1695         }
1696         bh_unlock_sock(sk);
1697
1698         sock_put(sk);
1699
1700         return ret;
1701
1702 no_tcp_socket:
1703         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1704                 goto discard_it;
1705
1706         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1707 bad_packet:
1708                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1709         } else {
1710                 tcp_v4_send_reset(NULL, skb);
1711         }
1712
1713 discard_it:
1714         /* Discard frame. */
1715         kfree_skb(skb);
1716         return 0;
1717
1718 discard_and_relse:
1719         sock_put(sk);
1720         goto discard_it;
1721
1722 do_time_wait:
1723         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1724                 inet_twsk_put(inet_twsk(sk));
1725                 goto discard_it;
1726         }
1727
1728         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1729                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1730                 inet_twsk_put(inet_twsk(sk));
1731                 goto discard_it;
1732         }
1733         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1734         case TCP_TW_SYN: {
1735                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1736                                                         &tcp_hashinfo,
1737                                                         iph->daddr, th->dest,
1738                                                         inet_iif(skb));
1739                 if (sk2) {
1740                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1741                         inet_twsk_put(inet_twsk(sk));
1742                         sk = sk2;
1743                         goto process;
1744                 }
1745                 /* Fall through to ACK */
1746         }
1747         case TCP_TW_ACK:
1748                 tcp_v4_timewait_ack(sk, skb);
1749                 break;
1750         case TCP_TW_RST:
1751                 goto no_tcp_socket;
1752         case TCP_TW_SUCCESS:;
1753         }
1754         goto discard_it;
1755 }
1756
1757 /* VJ's idea. Save last timestamp seen from this destination
1758  * and hold it at least for normal timewait interval to use for duplicate
1759  * segment detection in subsequent connections, before they enter synchronized
1760  * state.
1761  */
1762
1763 int tcp_v4_remember_stamp(struct sock *sk)
1764 {
1765         struct inet_sock *inet = inet_sk(sk);
1766         struct tcp_sock *tp = tcp_sk(sk);
1767         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1768         struct inet_peer *peer = NULL;
1769         int release_it = 0;
1770
1771         if (!rt || rt->rt_dst != inet->inet_daddr) {
1772                 peer = inet_getpeer(inet->inet_daddr, 1);
1773                 release_it = 1;
1774         } else {
1775                 if (!rt->peer)
1776                         rt_bind_peer(rt, 1);
1777                 peer = rt->peer;
1778         }
1779
1780         if (peer) {
1781                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1782                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1783                      peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1784                         peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1785                         peer->tcp_ts = tp->rx_opt.ts_recent;
1786                 }
1787                 if (release_it)
1788                         inet_putpeer(peer);
1789                 return 1;
1790         }
1791
1792         return 0;
1793 }
1794
1795 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1796 {
1797         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1798
1799         if (peer) {
1800                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1801
1802                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1803                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1804                      peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1805                         peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1806                         peer->tcp_ts       = tcptw->tw_ts_recent;
1807                 }
1808                 inet_putpeer(peer);
1809                 return 1;
1810         }
1811
1812         return 0;
1813 }
1814
1815 const struct inet_connection_sock_af_ops ipv4_specific = {
1816         .queue_xmit        = ip_queue_xmit,
1817         .send_check        = tcp_v4_send_check,
1818         .rebuild_header    = inet_sk_rebuild_header,
1819         .conn_request      = tcp_v4_conn_request,
1820         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1821         .remember_stamp    = tcp_v4_remember_stamp,
1822         .net_header_len    = sizeof(struct iphdr),
1823         .setsockopt        = ip_setsockopt,
1824         .getsockopt        = ip_getsockopt,
1825         .addr2sockaddr     = inet_csk_addr2sockaddr,
1826         .sockaddr_len      = sizeof(struct sockaddr_in),
1827         .bind_conflict     = inet_csk_bind_conflict,
1828 #ifdef CONFIG_COMPAT
1829         .compat_setsockopt = compat_ip_setsockopt,
1830         .compat_getsockopt = compat_ip_getsockopt,
1831 #endif
1832 };
1833
1834 #ifdef CONFIG_TCP_MD5SIG
1835 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1836         .md5_lookup             = tcp_v4_md5_lookup,
1837         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1838         .md5_add                = tcp_v4_md5_add_func,
1839         .md5_parse              = tcp_v4_parse_md5_keys,
1840 };
1841 #endif
1842
1843 /* NOTE: A lot of things set to zero explicitly by call to
1844  *       sk_alloc() so need not be done here.
1845  */
1846 static int tcp_v4_init_sock(struct sock *sk)
1847 {
1848         struct inet_connection_sock *icsk = inet_csk(sk);
1849         struct tcp_sock *tp = tcp_sk(sk);
1850
1851         skb_queue_head_init(&tp->out_of_order_queue);
1852         tcp_init_xmit_timers(sk);
1853         tcp_prequeue_init(tp);
1854
1855         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1856         tp->mdev = TCP_TIMEOUT_INIT;
1857
1858         /* So many TCP implementations out there (incorrectly) count the
1859          * initial SYN frame in their delayed-ACK and congestion control
1860          * algorithms that we must have the following bandaid to talk
1861          * efficiently to them.  -DaveM
1862          */
1863         tp->snd_cwnd = 2;
1864
1865         /* See draft-stevens-tcpca-spec-01 for discussion of the
1866          * initialization of these values.
1867          */
1868         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1869         tp->snd_cwnd_clamp = ~0;
1870         tp->mss_cache = TCP_MSS_DEFAULT;
1871
1872         tp->reordering = sysctl_tcp_reordering;
1873         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1874
1875         sk->sk_state = TCP_CLOSE;
1876
1877         sk->sk_write_space = sk_stream_write_space;
1878         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1879
1880         icsk->icsk_af_ops = &ipv4_specific;
1881         icsk->icsk_sync_mss = tcp_sync_mss;
1882 #ifdef CONFIG_TCP_MD5SIG
1883         tp->af_specific = &tcp_sock_ipv4_specific;
1884 #endif
1885
1886         /* TCP Cookie Transactions */
1887         if (sysctl_tcp_cookie_size > 0) {
1888                 /* Default, cookies without s_data_payload. */
1889                 tp->cookie_values =
1890                         kzalloc(sizeof(*tp->cookie_values),
1891                                 sk->sk_allocation);
1892                 if (tp->cookie_values != NULL)
1893                         kref_init(&tp->cookie_values->kref);
1894         }
1895         /* Presumed zeroed, in order of appearance:
1896          *      cookie_in_always, cookie_out_never,
1897          *      s_data_constant, s_data_in, s_data_out
1898          */
1899         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1900         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1901
1902         local_bh_disable();
1903         percpu_counter_inc(&tcp_sockets_allocated);
1904         local_bh_enable();
1905
1906         return 0;
1907 }
1908
1909 void tcp_v4_destroy_sock(struct sock *sk)
1910 {
1911         struct tcp_sock *tp = tcp_sk(sk);
1912
1913         tcp_clear_xmit_timers(sk);
1914
1915         tcp_cleanup_congestion_control(sk);
1916
1917         /* Cleanup up the write buffer. */
1918         tcp_write_queue_purge(sk);
1919
1920         /* Cleans up our, hopefully empty, out_of_order_queue. */
1921         __skb_queue_purge(&tp->out_of_order_queue);
1922
1923 #ifdef CONFIG_TCP_MD5SIG
1924         /* Clean up the MD5 key list, if any */
1925         if (tp->md5sig_info) {
1926                 tcp_v4_clear_md5_list(sk);
1927                 kfree(tp->md5sig_info);
1928                 tp->md5sig_info = NULL;
1929         }
1930 #endif
1931
1932 #ifdef CONFIG_NET_DMA
1933         /* Cleans up our sk_async_wait_queue */
1934         __skb_queue_purge(&sk->sk_async_wait_queue);
1935 #endif
1936
1937         /* Clean prequeue, it must be empty really */
1938         __skb_queue_purge(&tp->ucopy.prequeue);
1939
1940         /* Clean up a referenced TCP bind bucket. */
1941         if (inet_csk(sk)->icsk_bind_hash)
1942                 inet_put_port(sk);
1943
1944         /*
1945          * If sendmsg cached page exists, toss it.
1946          */
1947         if (sk->sk_sndmsg_page) {
1948                 __free_page(sk->sk_sndmsg_page);
1949                 sk->sk_sndmsg_page = NULL;
1950         }
1951
1952         /* TCP Cookie Transactions */
1953         if (tp->cookie_values != NULL) {
1954                 kref_put(&tp->cookie_values->kref,
1955                          tcp_cookie_values_release);
1956                 tp->cookie_values = NULL;
1957         }
1958
1959         percpu_counter_dec(&tcp_sockets_allocated);
1960 }
1961
1962 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1963
1964 #ifdef CONFIG_PROC_FS
1965 /* Proc filesystem TCP sock list dumping. */
1966
1967 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1968 {
1969         return hlist_nulls_empty(head) ? NULL :
1970                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1971 }
1972
1973 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1974 {
1975         return !is_a_nulls(tw->tw_node.next) ?
1976                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1977 }
1978
1979 static void *listening_get_next(struct seq_file *seq, void *cur)
1980 {
1981         struct inet_connection_sock *icsk;
1982         struct hlist_nulls_node *node;
1983         struct sock *sk = cur;
1984         struct inet_listen_hashbucket *ilb;
1985         struct tcp_iter_state *st = seq->private;
1986         struct net *net = seq_file_net(seq);
1987
1988         if (!sk) {
1989                 st->bucket = 0;
1990                 ilb = &tcp_hashinfo.listening_hash[0];
1991                 spin_lock_bh(&ilb->lock);
1992                 sk = sk_nulls_head(&ilb->head);
1993                 goto get_sk;
1994         }
1995         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1996         ++st->num;
1997
1998         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1999                 struct request_sock *req = cur;
2000
2001                 icsk = inet_csk(st->syn_wait_sk);
2002                 req = req->dl_next;
2003                 while (1) {
2004                         while (req) {
2005                                 if (req->rsk_ops->family == st->family) {
2006                                         cur = req;
2007                                         goto out;
2008                                 }
2009                                 req = req->dl_next;
2010                         }
2011                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2012                                 break;
2013 get_req:
2014                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2015                 }
2016                 sk        = sk_next(st->syn_wait_sk);
2017                 st->state = TCP_SEQ_STATE_LISTENING;
2018                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2019         } else {
2020                 icsk = inet_csk(sk);
2021                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2022                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2023                         goto start_req;
2024                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2025                 sk = sk_next(sk);
2026         }
2027 get_sk:
2028         sk_nulls_for_each_from(sk, node) {
2029                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2030                         cur = sk;
2031                         goto out;
2032                 }
2033                 icsk = inet_csk(sk);
2034                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2035                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2036 start_req:
2037                         st->uid         = sock_i_uid(sk);
2038                         st->syn_wait_sk = sk;
2039                         st->state       = TCP_SEQ_STATE_OPENREQ;
2040                         st->sbucket     = 0;
2041                         goto get_req;
2042                 }
2043                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2044         }
2045         spin_unlock_bh(&ilb->lock);
2046         if (++st->bucket < INET_LHTABLE_SIZE) {
2047                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2048                 spin_lock_bh(&ilb->lock);
2049                 sk = sk_nulls_head(&ilb->head);
2050                 goto get_sk;
2051         }
2052         cur = NULL;
2053 out:
2054         return cur;
2055 }
2056
2057 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2058 {
2059         void *rc = listening_get_next(seq, NULL);
2060
2061         while (rc && *pos) {
2062                 rc = listening_get_next(seq, rc);
2063                 --*pos;
2064         }
2065         return rc;
2066 }
2067
2068 static inline int empty_bucket(struct tcp_iter_state *st)
2069 {
2070         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2071                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2072 }
2073
2074 static void *established_get_first(struct seq_file *seq)
2075 {
2076         struct tcp_iter_state *st = seq->private;
2077         struct net *net = seq_file_net(seq);
2078         void *rc = NULL;
2079
2080         for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2081                 struct sock *sk;
2082                 struct hlist_nulls_node *node;
2083                 struct inet_timewait_sock *tw;
2084                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2085
2086                 /* Lockless fast path for the common case of empty buckets */
2087                 if (empty_bucket(st))
2088                         continue;
2089
2090                 spin_lock_bh(lock);
2091                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2092                         if (sk->sk_family != st->family ||
2093                             !net_eq(sock_net(sk), net)) {
2094                                 continue;
2095                         }
2096                         rc = sk;
2097                         goto out;
2098                 }
2099                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2100                 inet_twsk_for_each(tw, node,
2101                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2102                         if (tw->tw_family != st->family ||
2103                             !net_eq(twsk_net(tw), net)) {
2104                                 continue;
2105                         }
2106                         rc = tw;
2107                         goto out;
2108                 }
2109                 spin_unlock_bh(lock);
2110                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2111         }
2112 out:
2113         return rc;
2114 }
2115
2116 static void *established_get_next(struct seq_file *seq, void *cur)
2117 {
2118         struct sock *sk = cur;
2119         struct inet_timewait_sock *tw;
2120         struct hlist_nulls_node *node;
2121         struct tcp_iter_state *st = seq->private;
2122         struct net *net = seq_file_net(seq);
2123
2124         ++st->num;
2125
2126         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2127                 tw = cur;
2128                 tw = tw_next(tw);
2129 get_tw:
2130                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2131                         tw = tw_next(tw);
2132                 }
2133                 if (tw) {
2134                         cur = tw;
2135                         goto out;
2136                 }
2137                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2138                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2139
2140                 /* Look for next non empty bucket */
2141                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2142                                 empty_bucket(st))
2143                         ;
2144                 if (st->bucket > tcp_hashinfo.ehash_mask)
2145                         return NULL;
2146
2147                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2148                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2149         } else
2150                 sk = sk_nulls_next(sk);
2151
2152         sk_nulls_for_each_from(sk, node) {
2153                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2154                         goto found;
2155         }
2156
2157         st->state = TCP_SEQ_STATE_TIME_WAIT;
2158         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2159         goto get_tw;
2160 found:
2161         cur = sk;
2162 out:
2163         return cur;
2164 }
2165
2166 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2167 {
2168         void *rc = established_get_first(seq);
2169
2170         while (rc && pos) {
2171                 rc = established_get_next(seq, rc);
2172                 --pos;
2173         }
2174         return rc;
2175 }
2176
2177 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2178 {
2179         void *rc;
2180         struct tcp_iter_state *st = seq->private;
2181
2182         st->state = TCP_SEQ_STATE_LISTENING;
2183         rc        = listening_get_idx(seq, &pos);
2184
2185         if (!rc) {
2186                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2187                 rc        = established_get_idx(seq, pos);
2188         }
2189
2190         return rc;
2191 }
2192
2193 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2194 {
2195         struct tcp_iter_state *st = seq->private;
2196         st->state = TCP_SEQ_STATE_LISTENING;
2197         st->num = 0;
2198         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2199 }
2200
2201 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2202 {
2203         void *rc = NULL;
2204         struct tcp_iter_state *st;
2205
2206         if (v == SEQ_START_TOKEN) {
2207                 rc = tcp_get_idx(seq, 0);
2208                 goto out;
2209         }
2210         st = seq->private;
2211
2212         switch (st->state) {
2213         case TCP_SEQ_STATE_OPENREQ:
2214         case TCP_SEQ_STATE_LISTENING:
2215                 rc = listening_get_next(seq, v);
2216                 if (!rc) {
2217                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2218                         rc        = established_get_first(seq);
2219                 }
2220                 break;
2221         case TCP_SEQ_STATE_ESTABLISHED:
2222         case TCP_SEQ_STATE_TIME_WAIT:
2223                 rc = established_get_next(seq, v);
2224                 break;
2225         }
2226 out:
2227         ++*pos;
2228         return rc;
2229 }
2230
2231 static void tcp_seq_stop(struct seq_file *seq, void *v)
2232 {
2233         struct tcp_iter_state *st = seq->private;
2234
2235         switch (st->state) {
2236         case TCP_SEQ_STATE_OPENREQ:
2237                 if (v) {
2238                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2239                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2240                 }
2241         case TCP_SEQ_STATE_LISTENING:
2242                 if (v != SEQ_START_TOKEN)
2243                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2244                 break;
2245         case TCP_SEQ_STATE_TIME_WAIT:
2246         case TCP_SEQ_STATE_ESTABLISHED:
2247                 if (v)
2248                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2249                 break;
2250         }
2251 }
2252
2253 static int tcp_seq_open(struct inode *inode, struct file *file)
2254 {
2255         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2256         struct tcp_iter_state *s;
2257         int err;
2258
2259         err = seq_open_net(inode, file, &afinfo->seq_ops,
2260                           sizeof(struct tcp_iter_state));
2261         if (err < 0)
2262                 return err;
2263
2264         s = ((struct seq_file *)file->private_data)->private;
2265         s->family               = afinfo->family;
2266         return 0;
2267 }
2268
2269 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2270 {
2271         int rc = 0;
2272         struct proc_dir_entry *p;
2273
2274         afinfo->seq_fops.open           = tcp_seq_open;
2275         afinfo->seq_fops.read           = seq_read;
2276         afinfo->seq_fops.llseek         = seq_lseek;
2277         afinfo->seq_fops.release        = seq_release_net;
2278
2279         afinfo->seq_ops.start           = tcp_seq_start;
2280         afinfo->seq_ops.next            = tcp_seq_next;
2281         afinfo->seq_ops.stop            = tcp_seq_stop;
2282
2283         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2284                              &afinfo->seq_fops, afinfo);
2285         if (!p)
2286                 rc = -ENOMEM;
2287         return rc;
2288 }
2289
2290 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2291 {
2292         proc_net_remove(net, afinfo->name);
2293 }
2294
2295 static void get_openreq4(struct sock *sk, struct request_sock *req,
2296                          struct seq_file *f, int i, int uid, int *len)
2297 {
2298         const struct inet_request_sock *ireq = inet_rsk(req);
2299         int ttd = req->expires - jiffies;
2300
2301         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2302                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2303                 i,
2304                 ireq->loc_addr,
2305                 ntohs(inet_sk(sk)->inet_sport),
2306                 ireq->rmt_addr,
2307                 ntohs(ireq->rmt_port),
2308                 TCP_SYN_RECV,
2309                 0, 0, /* could print option size, but that is af dependent. */
2310                 1,    /* timers active (only the expire timer) */
2311                 jiffies_to_clock_t(ttd),
2312                 req->retrans,
2313                 uid,
2314                 0,  /* non standard timer */
2315                 0, /* open_requests have no inode */
2316                 atomic_read(&sk->sk_refcnt),
2317                 req,
2318                 len);
2319 }
2320
2321 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2322 {
2323         int timer_active;
2324         unsigned long timer_expires;
2325         struct tcp_sock *tp = tcp_sk(sk);
2326         const struct inet_connection_sock *icsk = inet_csk(sk);
2327         struct inet_sock *inet = inet_sk(sk);
2328         __be32 dest = inet->inet_daddr;
2329         __be32 src = inet->inet_rcv_saddr;
2330         __u16 destp = ntohs(inet->inet_dport);
2331         __u16 srcp = ntohs(inet->inet_sport);
2332         int rx_queue;
2333
2334         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2335                 timer_active    = 1;
2336                 timer_expires   = icsk->icsk_timeout;
2337         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2338                 timer_active    = 4;
2339                 timer_expires   = icsk->icsk_timeout;
2340         } else if (timer_pending(&sk->sk_timer)) {
2341                 timer_active    = 2;
2342                 timer_expires   = sk->sk_timer.expires;
2343         } else {
2344                 timer_active    = 0;
2345                 timer_expires = jiffies;
2346         }
2347
2348         if (sk->sk_state == TCP_LISTEN)
2349                 rx_queue = sk->sk_ack_backlog;
2350         else
2351                 /*
2352                  * because we dont lock socket, we might find a transient negative value
2353                  */
2354                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2355
2356         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2357                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2358                 i, src, srcp, dest, destp, sk->sk_state,
2359                 tp->write_seq - tp->snd_una,
2360                 rx_queue,
2361                 timer_active,
2362                 jiffies_to_clock_t(timer_expires - jiffies),
2363                 icsk->icsk_retransmits,
2364                 sock_i_uid(sk),
2365                 icsk->icsk_probes_out,
2366                 sock_i_ino(sk),
2367                 atomic_read(&sk->sk_refcnt), sk,
2368                 jiffies_to_clock_t(icsk->icsk_rto),
2369                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2370                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2371                 tp->snd_cwnd,
2372                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2373                 len);
2374 }
2375
2376 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2377                                struct seq_file *f, int i, int *len)
2378 {
2379         __be32 dest, src;
2380         __u16 destp, srcp;
2381         int ttd = tw->tw_ttd - jiffies;
2382
2383         if (ttd < 0)
2384                 ttd = 0;
2385
2386         dest  = tw->tw_daddr;
2387         src   = tw->tw_rcv_saddr;
2388         destp = ntohs(tw->tw_dport);
2389         srcp  = ntohs(tw->tw_sport);
2390
2391         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2392                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2393                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2394                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2395                 atomic_read(&tw->tw_refcnt), tw, len);
2396 }
2397
2398 #define TMPSZ 150
2399
2400 static int tcp4_seq_show(struct seq_file *seq, void *v)
2401 {
2402         struct tcp_iter_state *st;
2403         int len;
2404
2405         if (v == SEQ_START_TOKEN) {
2406                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2407                            "  sl  local_address rem_address   st tx_queue "
2408                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2409                            "inode");
2410                 goto out;
2411         }
2412         st = seq->private;
2413
2414         switch (st->state) {
2415         case TCP_SEQ_STATE_LISTENING:
2416         case TCP_SEQ_STATE_ESTABLISHED:
2417                 get_tcp4_sock(v, seq, st->num, &len);
2418                 break;
2419         case TCP_SEQ_STATE_OPENREQ:
2420                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2421                 break;
2422         case TCP_SEQ_STATE_TIME_WAIT:
2423                 get_timewait4_sock(v, seq, st->num, &len);
2424                 break;
2425         }
2426         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2427 out:
2428         return 0;
2429 }
2430
2431 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2432         .name           = "tcp",
2433         .family         = AF_INET,
2434         .seq_fops       = {
2435                 .owner          = THIS_MODULE,
2436         },
2437         .seq_ops        = {
2438                 .show           = tcp4_seq_show,
2439         },
2440 };
2441
2442 static int __net_init tcp4_proc_init_net(struct net *net)
2443 {
2444         return tcp_proc_register(net, &tcp4_seq_afinfo);
2445 }
2446
2447 static void __net_exit tcp4_proc_exit_net(struct net *net)
2448 {
2449         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2450 }
2451
2452 static struct pernet_operations tcp4_net_ops = {
2453         .init = tcp4_proc_init_net,
2454         .exit = tcp4_proc_exit_net,
2455 };
2456
2457 int __init tcp4_proc_init(void)
2458 {
2459         return register_pernet_subsys(&tcp4_net_ops);
2460 }
2461
2462 void tcp4_proc_exit(void)
2463 {
2464         unregister_pernet_subsys(&tcp4_net_ops);
2465 }
2466 #endif /* CONFIG_PROC_FS */
2467
2468 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2469 {
2470         struct iphdr *iph = skb_gro_network_header(skb);
2471
2472         switch (skb->ip_summed) {
2473         case CHECKSUM_COMPLETE:
2474                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2475                                   skb->csum)) {
2476                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2477                         break;
2478                 }
2479
2480                 /* fall through */
2481         case CHECKSUM_NONE:
2482                 NAPI_GRO_CB(skb)->flush = 1;
2483                 return NULL;
2484         }
2485
2486         return tcp_gro_receive(head, skb);
2487 }
2488 EXPORT_SYMBOL(tcp4_gro_receive);
2489
2490 int tcp4_gro_complete(struct sk_buff *skb)
2491 {
2492         struct iphdr *iph = ip_hdr(skb);
2493         struct tcphdr *th = tcp_hdr(skb);
2494
2495         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2496                                   iph->saddr, iph->daddr, 0);
2497         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2498
2499         return tcp_gro_complete(skb);
2500 }
2501 EXPORT_SYMBOL(tcp4_gro_complete);
2502
2503 struct proto tcp_prot = {
2504         .name                   = "TCP",
2505         .owner                  = THIS_MODULE,
2506         .close                  = tcp_close,
2507         .connect                = tcp_v4_connect,
2508         .disconnect             = tcp_disconnect,
2509         .accept                 = inet_csk_accept,
2510         .ioctl                  = tcp_ioctl,
2511         .init                   = tcp_v4_init_sock,
2512         .destroy                = tcp_v4_destroy_sock,
2513         .shutdown               = tcp_shutdown,
2514         .setsockopt             = tcp_setsockopt,
2515         .getsockopt             = tcp_getsockopt,
2516         .recvmsg                = tcp_recvmsg,
2517         .backlog_rcv            = tcp_v4_do_rcv,
2518         .hash                   = inet_hash,
2519         .unhash                 = inet_unhash,
2520         .get_port               = inet_csk_get_port,
2521         .enter_memory_pressure  = tcp_enter_memory_pressure,
2522         .sockets_allocated      = &tcp_sockets_allocated,
2523         .orphan_count           = &tcp_orphan_count,
2524         .memory_allocated       = &tcp_memory_allocated,
2525         .memory_pressure        = &tcp_memory_pressure,
2526         .sysctl_mem             = sysctl_tcp_mem,
2527         .sysctl_wmem            = sysctl_tcp_wmem,
2528         .sysctl_rmem            = sysctl_tcp_rmem,
2529         .max_header             = MAX_TCP_HEADER,
2530         .obj_size               = sizeof(struct tcp_sock),
2531         .slab_flags             = SLAB_DESTROY_BY_RCU,
2532         .twsk_prot              = &tcp_timewait_sock_ops,
2533         .rsk_prot               = &tcp_request_sock_ops,
2534         .h.hashinfo             = &tcp_hashinfo,
2535 #ifdef CONFIG_COMPAT
2536         .compat_setsockopt      = compat_tcp_setsockopt,
2537         .compat_getsockopt      = compat_tcp_getsockopt,
2538 #endif
2539 };
2540
2541
2542 static int __net_init tcp_sk_init(struct net *net)
2543 {
2544         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2545                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2546 }
2547
2548 static void __net_exit tcp_sk_exit(struct net *net)
2549 {
2550         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2551 }
2552
2553 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2554 {
2555         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2556 }
2557
2558 static struct pernet_operations __net_initdata tcp_sk_ops = {
2559        .init       = tcp_sk_init,
2560        .exit       = tcp_sk_exit,
2561        .exit_batch = tcp_sk_exit_batch,
2562 };
2563
2564 void __init tcp_v4_init(void)
2565 {
2566         inet_hashinfo_init(&tcp_hashinfo);
2567         if (register_pernet_subsys(&tcp_sk_ops))
2568                 panic("Failed to create the TCP control socket.\n");
2569 }
2570
2571 EXPORT_SYMBOL(ipv4_specific);
2572 EXPORT_SYMBOL(tcp_hashinfo);
2573 EXPORT_SYMBOL(tcp_prot);
2574 EXPORT_SYMBOL(tcp_v4_conn_request);
2575 EXPORT_SYMBOL(tcp_v4_connect);
2576 EXPORT_SYMBOL(tcp_v4_do_rcv);
2577 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2578 EXPORT_SYMBOL(tcp_v4_send_check);
2579 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2580
2581 #ifdef CONFIG_PROC_FS
2582 EXPORT_SYMBOL(tcp_proc_register);
2583 EXPORT_SYMBOL(tcp_proc_unregister);
2584 #endif
2585 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2586