release/src-rt-6.x.4708/linux/linux-2.6.36/net/ipv4/tcp_ipv4.c
1 /* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
12 * code split from:
13 * linux/ipv4/tcp.c
14 * linux/ipv4/tcp_input.c
15 * linux/ipv4/tcp_output.c
17 * See tcp.c for author information
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
26 * Changes:
27 * David S. Miller : New socket lookup architecture.
28 * This code is dedicated to John Dyson.
29 * David S. Miller : Change semantics of established hash,
30 * half is devoted to TIME_WAIT sockets
31 * and the rest go in the other half.
32 * Andi Kleen : Add support for syncookies and fixed
33 * some bugs: ip options weren't passed to
34 * the TCP layer, missed a check for an
35 * ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * request_sock handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year
46 * coma.
47 * Andi Kleen : Fix new listen.
48 * Andi Kleen : Fix accept error reporting.
49 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
50 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
51 * a single port at the same time.
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
77 #include <linux/inet.h>
78 #include <linux/ipv6.h>
79 #include <linux/stddef.h>
80 #include <linux/proc_fs.h>
81 #include <linux/seq_file.h>
83 #include <linux/crypto.h>
84 #include <linux/scatterlist.h>
86 #include <typedefs.h>
87 #include <bcmdefs.h>
89 int sysctl_tcp_tw_reuse __read_mostly;
90 int sysctl_tcp_low_latency __read_mostly;
91 EXPORT_SYMBOL(sysctl_tcp_low_latency);
94 #ifdef CONFIG_TCP_MD5SIG
95 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
96 __be32 addr);
97 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
98 __be32 daddr, __be32 saddr, struct tcphdr *th);
99 #else
100 static inline
101 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
103 return NULL;
105 #endif
107 struct inet_hashinfo tcp_hashinfo;
108 EXPORT_SYMBOL(tcp_hashinfo);
110 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
112 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
113 ip_hdr(skb)->saddr,
114 tcp_hdr(skb)->dest,
115 tcp_hdr(skb)->source);
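/* The ISN is a keyed hash of the connection 4-tuple with a clock
 * component mixed in (cf. RFC 1948), so it is hard to predict off-path
 * but still advances over time for a given address/port pair.
 */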
118 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
120 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
121 struct tcp_sock *tp = tcp_sk(sk);
123 /* With PAWS, it is safe from the viewpoint
124 of data integrity. Even without PAWS it is safe provided sequence
125 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
127 Actually, the idea is close to VJ's one, only the timestamp cache is
128 held not per host but per port pair, and the TW bucket is used as the
129 state holder.
131 If the TW bucket has already been destroyed we fall back to VJ's scheme
132 and use the initial timestamp retrieved from the peer table.
134 if (tcptw->tw_ts_recent_stamp &&
135 (twp == NULL || (sysctl_tcp_tw_reuse &&
136 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
137 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
138 if (tp->write_seq == 0)
139 tp->write_seq = 1;
140 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
141 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
142 sock_hold(sktw);
143 return 1;
146 return 0;
148 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
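/* Summary of the reuse rule above: a TIME-WAIT port pair may be taken
 * over for a new outgoing connection when the caller passed no tw bucket
 * pointer (twp == NULL), or when sysctl_tcp_tw_reuse is set and the last
 * timestamp seen from the peer is more than a second old. The new
 * write_seq starts at tw_snd_nxt + 65535 + 2, past anything the old
 * incarnation could have had in flight, and ts_recent is inherited so
 * PAWS keeps protecting against stray old segments.
 */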
150 /* This will initiate an outgoing connection. */
151 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
153 struct inet_sock *inet = inet_sk(sk);
154 struct tcp_sock *tp = tcp_sk(sk);
155 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
156 struct rtable *rt;
157 __be32 daddr, nexthop;
158 int tmp;
159 int err;
161 if (addr_len < sizeof(struct sockaddr_in))
162 return -EINVAL;
164 if (usin->sin_family != AF_INET)
165 return -EAFNOSUPPORT;
167 nexthop = daddr = usin->sin_addr.s_addr;
168 if (inet->opt && inet->opt->srr) {
169 if (!daddr)
170 return -EINVAL;
171 nexthop = inet->opt->faddr;
174 tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
175 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
176 IPPROTO_TCP,
177 inet->inet_sport, usin->sin_port, sk, 1);
178 if (tmp < 0) {
179 if (tmp == -ENETUNREACH)
180 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
181 return tmp;
184 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
185 ip_rt_put(rt);
186 return -ENETUNREACH;
189 if (!inet->opt || !inet->opt->srr)
190 daddr = rt->rt_dst;
192 if (!inet->inet_saddr)
193 inet->inet_saddr = rt->rt_src;
194 inet->inet_rcv_saddr = inet->inet_saddr;
196 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
197 /* Reset inherited state */
198 tp->rx_opt.ts_recent = 0;
199 tp->rx_opt.ts_recent_stamp = 0;
200 tp->write_seq = 0;
203 if (tcp_death_row.sysctl_tw_recycle &&
204 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
205 struct inet_peer *peer = rt_get_peer(rt);
207 * VJ's idea. We save the last timestamp seen from
208 * the destination in the peer table when entering
209 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
210 * when trying a new connection.
212 if (peer) {
213 inet_peer_refcheck(peer);
214 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
215 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
216 tp->rx_opt.ts_recent = peer->tcp_ts;
221 inet->inet_dport = usin->sin_port;
222 inet->inet_daddr = daddr;
224 inet_csk(sk)->icsk_ext_hdr_len = 0;
225 if (inet->opt)
226 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
228 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
230 /* Socket identity is still unknown (sport may be zero).
231 * However we set the state to SYN-SENT and, without releasing the socket
232 * lock, select a source port, enter ourselves into the hash tables and
233 * complete initialization after this.
235 tcp_set_state(sk, TCP_SYN_SENT);
236 err = inet_hash_connect(&tcp_death_row, sk);
237 if (err)
238 goto failure;
240 err = ip_route_newports(&rt, IPPROTO_TCP,
241 inet->inet_sport, inet->inet_dport, sk);
242 if (err)
243 goto failure;
245 /* OK, now commit destination to socket. */
246 sk->sk_gso_type = SKB_GSO_TCPV4;
247 sk_setup_caps(sk, &rt->dst);
249 if (!tp->write_seq)
250 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
251 inet->inet_daddr,
252 inet->inet_sport,
253 usin->sin_port);
255 inet->inet_id = tp->write_seq ^ jiffies;
257 err = tcp_connect(sk);
258 rt = NULL;
259 if (err)
260 goto failure;
262 return 0;
264 failure:
266 * This unhashes the socket and releases the local port,
267 * if necessary.
269 tcp_set_state(sk, TCP_CLOSE);
270 ip_rt_put(rt);
271 sk->sk_route_caps = 0;
272 inet->inet_dport = 0;
273 return err;
275 EXPORT_SYMBOL(tcp_v4_connect);
278 * This routine does path mtu discovery as defined in RFC1191.
280 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
282 struct dst_entry *dst;
283 struct inet_sock *inet = inet_sk(sk);
285 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
286 * sent out by Linux are always < 576 bytes, so they should go through
287 * unfragmented).
289 if (sk->sk_state == TCP_LISTEN)
290 return;
292 /* We don't check in the dst_entry if pmtu discovery is forbidden
293 * on this route. We just assume that no packet-too-big packets
294 * are sent back when pmtu discovery is not active.
295 * There is a small race when the user changes this flag in the
296 * route, but I think that's acceptable.
298 if ((dst = __sk_dst_check(sk, 0)) == NULL)
299 return;
301 dst->ops->update_pmtu(dst, mtu);
303 /* Something is about to go wrong... Remember the soft error
304 * for the case that this connection will not be able to recover.
306 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
307 sk->sk_err_soft = EMSGSIZE;
309 mtu = dst_mtu(dst);
311 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
312 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
313 tcp_sync_mss(sk, mtu);
315 /* Resend the TCP packet because it's
316 * clear that the old packet has been
317 * dropped. This is the new "fast" path mtu
318 * discovery.
320 tcp_simple_retransmit(sk);
321 } /* else let the usual retransmit timer handle it */
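/* In short: when the ICMP-reported MTU undercuts the cached path MTU,
 * tcp_sync_mss() shrinks the cached MSS and tcp_simple_retransmit()
 * resends the now-oversized segments immediately instead of waiting
 * for the retransmission timer to expire.
 */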
325 * This routine is called by the ICMP module when it gets some
326 * sort of error condition. If err < 0 then the socket should
327 * be closed and the error returned to the user. If err > 0
328 * it's just the icmp type << 8 | icmp code. After adjustment
329 * header points to the first 8 bytes of the tcp header. We need
330 * to find the appropriate port.
332 * The locking strategy used here is very "optimistic". When
333 * someone else accesses the socket the ICMP is just dropped
334 * and for some paths there is no check at all.
335 * A more general error queue to queue errors for later handling
336 * is probably better.
340 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
342 struct iphdr *iph = (struct iphdr *)icmp_skb->data;
343 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
344 struct inet_connection_sock *icsk;
345 struct tcp_sock *tp;
346 struct inet_sock *inet;
347 const int type = icmp_hdr(icmp_skb)->type;
348 const int code = icmp_hdr(icmp_skb)->code;
349 struct sock *sk;
350 struct sk_buff *skb;
351 __u32 seq;
352 __u32 remaining;
353 int err;
354 struct net *net = dev_net(icmp_skb->dev);
356 if (icmp_skb->len < (iph->ihl << 2) + 8) {
357 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
358 return;
361 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
362 iph->saddr, th->source, inet_iif(icmp_skb));
363 if (!sk) {
364 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
365 return;
367 if (sk->sk_state == TCP_TIME_WAIT) {
368 inet_twsk_put(inet_twsk(sk));
369 return;
372 bh_lock_sock(sk);
373 /* If too many ICMPs get dropped on busy
374 * servers this needs to be solved differently.
376 if (sock_owned_by_user(sk))
377 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
379 if (sk->sk_state == TCP_CLOSE)
380 goto out;
382 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
383 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
384 goto out;
387 icsk = inet_csk(sk);
388 tp = tcp_sk(sk);
389 seq = ntohl(th->seq);
390 if (sk->sk_state != TCP_LISTEN &&
391 !between(seq, tp->snd_una, tp->snd_nxt)) {
392 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 goto out;
396 switch (type) {
397 case ICMP_SOURCE_QUENCH:
398 /* Just silently ignore these. */
399 goto out;
400 case ICMP_PARAMETERPROB:
401 err = EPROTO;
402 break;
403 case ICMP_DEST_UNREACH:
404 if (code > NR_ICMP_UNREACH)
405 goto out;
407 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
408 if (!sock_owned_by_user(sk))
409 do_pmtu_discovery(sk, iph, info);
410 goto out;
413 err = icmp_err_convert[code].errno;
414 /* check if icmp_skb allows revert of backoff
415 * (see draft-zimmermann-tcp-lcd) */
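/* Rationale: a net/host unreachable ICMP that matches the oldest
 * unacknowledged segment while we are in exponential backoff hints at
 * a transient connectivity disruption rather than congestion, so one
 * backoff step is undone below and the retransmission is either
 * rearmed for the remaining time or fired immediately.
 */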
416 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
417 break;
418 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
419 !icsk->icsk_backoff)
420 break;
422 if (sock_owned_by_user(sk))
423 break;
425 icsk->icsk_backoff--;
426 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
427 icsk->icsk_backoff;
428 tcp_bound_rto(sk);
430 skb = tcp_write_queue_head(sk);
431 BUG_ON(!skb);
433 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
434 tcp_time_stamp - TCP_SKB_CB(skb)->when);
436 if (remaining) {
437 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438 remaining, TCP_RTO_MAX);
439 } else {
440 /* RTO revert clocked out retransmission.
441 * Will retransmit now */
442 tcp_retransmit_timer(sk);
445 break;
446 case ICMP_TIME_EXCEEDED:
447 err = EHOSTUNREACH;
448 break;
449 default:
450 goto out;
453 switch (sk->sk_state) {
454 struct request_sock *req, **prev;
455 case TCP_LISTEN:
456 if (sock_owned_by_user(sk))
457 goto out;
459 req = inet_csk_search_req(sk, &prev, th->dest,
460 iph->daddr, iph->saddr);
461 if (!req)
462 goto out;
464 /* ICMPs are not backlogged, hence we cannot get
465 an established socket here.
467 WARN_ON(req->sk);
469 if (seq != tcp_rsk(req)->snt_isn) {
470 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
471 goto out;
475 * Still in SYN_RECV, just remove it silently.
476 * There is no good way to pass the error to the newly
477 * created socket, and POSIX does not want network
478 * errors returned from accept().
480 inet_csk_reqsk_queue_drop(sk, req, prev);
481 goto out;
483 case TCP_SYN_SENT:
484 case TCP_SYN_RECV: /* Cannot happen normally.
485 It can, e.g., if SYNs crossed.
487 if (!sock_owned_by_user(sk)) {
488 sk->sk_err = err;
490 sk->sk_error_report(sk);
492 tcp_done(sk);
493 } else {
494 sk->sk_err_soft = err;
496 goto out;
499 /* If we've already connected we will keep trying
500 * until we time out, or the user gives up.
502 * RFC 1122 4.2.3.9 allows considering only PROTO_UNREACH and
503 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
504 * but it is obsoleted by pmtu discovery).
506 * Note that in the modern internet, where routing is unreliable
507 * and broken firewalls sit in every dark corner sending random
508 * errors ordered by their masters, even these two messages finally lose
509 * their original sense (even Linux sends invalid PORT_UNREACHs).
511 * Now we are in compliance with RFCs.
512 * --ANK (980905)
515 inet = inet_sk(sk);
516 if (!sock_owned_by_user(sk) && inet->recverr) {
517 sk->sk_err = err;
518 sk->sk_error_report(sk);
519 } else { /* Only an error on timeout */
520 sk->sk_err_soft = err;
523 out:
524 bh_unlock_sock(sk);
525 sock_put(sk);
528 static void __tcp_v4_send_check(struct sk_buff *skb,
529 __be32 saddr, __be32 daddr)
531 struct tcphdr *th = tcp_hdr(skb);
533 if (skb->ip_summed == CHECKSUM_PARTIAL) {
534 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
535 skb->csum_start = skb_transport_header(skb) - skb->head;
536 skb->csum_offset = offsetof(struct tcphdr, check);
537 } else {
538 th->check = tcp_v4_check(skb->len, saddr, daddr,
539 csum_partial(th,
540 th->doff << 2,
541 skb->csum));
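/* With checksum offload (CHECKSUM_PARTIAL) only the pseudo-header sum
 * is stored in th->check; csum_start/csum_offset tell the device (or
 * skb_checksum_help()) where to finish the job. Otherwise the full
 * checksum over header and payload is computed here in software.
 */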
545 /* This routine computes an IPv4 TCP checksum. */
546 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
548 struct inet_sock *inet = inet_sk(sk);
550 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
552 EXPORT_SYMBOL(tcp_v4_send_check);
554 int tcp_v4_gso_send_check(struct sk_buff *skb)
556 const struct iphdr *iph;
557 struct tcphdr *th;
559 if (!pskb_may_pull(skb, sizeof(*th)))
560 return -EINVAL;
562 iph = ip_hdr(skb);
563 th = tcp_hdr(skb);
565 th->check = 0;
566 skb->ip_summed = CHECKSUM_PARTIAL;
567 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
568 return 0;
572 * This routine will send an RST to the other tcp.
574 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
575 * for the reset?
576 * Answer: if a packet caused an RST, it is not for a socket
577 * existing in our system; if it is matched to a socket,
578 * it is just a duplicate segment or a bug in the other side's TCP.
579 * So we build the reply based only on the parameters
580 * that arrived with the segment.
581 * Exception: precedence violation. We do not implement it in any case.
584 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
586 struct tcphdr *th = tcp_hdr(skb);
587 struct {
588 struct tcphdr th;
589 #ifdef CONFIG_TCP_MD5SIG
590 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
591 #endif
592 } rep;
593 struct ip_reply_arg arg;
594 #ifdef CONFIG_TCP_MD5SIG
595 struct tcp_md5sig_key *key;
596 #endif
597 struct net *net;
599 /* Never send a reset in response to a reset. */
600 if (th->rst)
601 return;
603 if (skb_rtable(skb)->rt_type != RTN_LOCAL)
604 return;
606 /* Swap the send and the receive. */
607 memset(&rep, 0, sizeof(rep));
608 rep.th.dest = th->source;
609 rep.th.source = th->dest;
610 rep.th.doff = sizeof(struct tcphdr) / 4;
611 rep.th.rst = 1;
613 if (th->ack) {
614 rep.th.seq = th->ack_seq;
615 } else {
616 rep.th.ack = 1;
617 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
618 skb->len - (th->doff << 2));
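/* Sequence selection follows RFC 793 reset generation: if the incoming
 * segment carried an ACK, the RST takes its sequence number from that
 * ack field and needs no ACK of its own; otherwise seq stays 0 and the
 * RST acks everything the segment occupied (payload plus SYN and FIN).
 */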
621 memset(&arg, 0, sizeof(arg));
622 arg.iov[0].iov_base = (unsigned char *)&rep;
623 arg.iov[0].iov_len = sizeof(rep.th);
625 #ifdef CONFIG_TCP_MD5SIG
626 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
627 if (key) {
628 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
629 (TCPOPT_NOP << 16) |
630 (TCPOPT_MD5SIG << 8) |
631 TCPOLEN_MD5SIG);
632 /* Update length and the length the header thinks exists */
633 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
634 rep.th.doff = arg.iov[0].iov_len / 4;
636 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
637 key, ip_hdr(skb)->saddr,
638 ip_hdr(skb)->daddr, &rep.th);
640 #endif
641 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
642 ip_hdr(skb)->saddr,
643 arg.iov[0].iov_len, IPPROTO_TCP, 0);
644 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
645 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
647 net = dev_net(skb_dst(skb)->dev);
648 ip_send_reply(net->ipv4.tcp_sock, skb,
649 &arg, arg.iov[0].iov_len);
651 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
652 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
655 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
656 outside socket context is ugly, certainly. What can I do?
659 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
660 u32 win, u32 ts, int oif,
661 struct tcp_md5sig_key *key,
662 int reply_flags)
664 struct tcphdr *th = tcp_hdr(skb);
665 struct {
666 struct tcphdr th;
667 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
668 #ifdef CONFIG_TCP_MD5SIG
669 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
670 #endif
672 } rep;
673 struct ip_reply_arg arg;
674 struct net *net = dev_net(skb_dst(skb)->dev);
676 memset(&rep.th, 0, sizeof(struct tcphdr));
677 memset(&arg, 0, sizeof(arg));
679 arg.iov[0].iov_base = (unsigned char *)&rep;
680 arg.iov[0].iov_len = sizeof(rep.th);
681 if (ts) {
682 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
683 (TCPOPT_TIMESTAMP << 8) |
684 TCPOLEN_TIMESTAMP);
685 rep.opt[1] = htonl(tcp_time_stamp);
686 rep.opt[2] = htonl(ts);
687 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
690 /* Swap the send and the receive. */
691 rep.th.dest = th->source;
692 rep.th.source = th->dest;
693 rep.th.doff = arg.iov[0].iov_len / 4;
694 rep.th.seq = htonl(seq);
695 rep.th.ack_seq = htonl(ack);
696 rep.th.ack = 1;
697 rep.th.window = htons(win);
699 #ifdef CONFIG_TCP_MD5SIG
700 if (key) {
701 int offset = (ts) ? 3 : 0;
703 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
704 (TCPOPT_NOP << 16) |
705 (TCPOPT_MD5SIG << 8) |
706 TCPOLEN_MD5SIG);
707 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
708 rep.th.doff = arg.iov[0].iov_len/4;
710 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
711 key, ip_hdr(skb)->saddr,
712 ip_hdr(skb)->daddr, &rep.th);
714 #endif
715 arg.flags = reply_flags;
716 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
717 ip_hdr(skb)->saddr,
718 arg.iov[0].iov_len, IPPROTO_TCP, 0);
719 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
720 if (oif)
721 arg.bound_dev_if = oif;
723 ip_send_reply(net->ipv4.tcp_sock, skb,
724 &arg, arg.iov[0].iov_len);
726 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
729 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
731 struct inet_timewait_sock *tw = inet_twsk(sk);
732 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
734 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
735 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
736 tcptw->tw_ts_recent,
737 tw->tw_bound_dev_if,
738 tcp_twsk_md5_key(tcptw),
739 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
742 inet_twsk_put(tw);
745 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
746 struct request_sock *req)
748 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
749 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
750 req->ts_recent,
752 tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
753 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
757 * Send a SYN-ACK after having received a SYN.
758 * This still operates on a request_sock only, not on a big
759 * socket.
761 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
762 struct request_sock *req,
763 struct request_values *rvp)
765 const struct inet_request_sock *ireq = inet_rsk(req);
766 int err = -1;
767 struct sk_buff * skb;
769 /* First, grab a route. */
770 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
771 return -1;
773 skb = tcp_make_synack(sk, dst, req, rvp);
775 if (skb) {
776 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
778 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
779 ireq->rmt_addr,
780 ireq->opt);
781 err = net_xmit_eval(err);
784 dst_release(dst);
785 return err;
788 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
789 struct request_values *rvp)
791 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
792 return tcp_v4_send_synack(sk, NULL, req, rvp);
796 * IPv4 request_sock destructor.
798 static void tcp_v4_reqsk_destructor(struct request_sock *req)
800 kfree(inet_rsk(req)->opt);
803 static void syn_flood_warning(const struct sk_buff *skb)
805 const char *msg;
807 #ifdef CONFIG_SYN_COOKIES
808 if (sysctl_tcp_syncookies)
809 msg = "Sending cookies";
810 else
811 #endif
812 msg = "Dropping request";
814 pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
815 ntohs(tcp_hdr(skb)->dest), msg);
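/* With CONFIG_SYN_COOKIES the listener still answers when the request
 * queue overflows: cookie_v4_init_sequence() encodes the connection
 * parameters into the SYN-ACK's sequence number and cookie_v4_check()
 * later rebuilds the request from the returning ACK, so no per-request
 * state has to be kept while under flood.
 */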
819 * Save and compile IPv4 options into the request_sock if needed.
821 static struct ip_options *tcp_v4_save_options(struct sock *sk,
822 struct sk_buff *skb)
824 struct ip_options *opt = &(IPCB(skb)->opt);
825 struct ip_options *dopt = NULL;
827 if (opt && opt->optlen) {
828 int opt_size = optlength(opt);
829 dopt = kmalloc(opt_size, GFP_ATOMIC);
830 if (dopt) {
831 if (ip_options_echo(dopt, skb)) {
832 kfree(dopt);
833 dopt = NULL;
837 return dopt;
840 #ifdef CONFIG_TCP_MD5SIG
842 * RFC2385 MD5 checksumming requires a mapping of
843 * IP address->MD5 Key.
844 * We need to maintain these in the sk structure.
847 /* Find the Key structure for an address. */
848 static struct tcp_md5sig_key *
849 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
851 struct tcp_sock *tp = tcp_sk(sk);
852 int i;
854 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
855 return NULL;
856 for (i = 0; i < tp->md5sig_info->entries4; i++) {
857 if (tp->md5sig_info->keys4[i].addr == addr)
858 return &tp->md5sig_info->keys4[i].base;
860 return NULL;
863 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
864 struct sock *addr_sk)
866 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
868 EXPORT_SYMBOL(tcp_v4_md5_lookup);
870 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
871 struct request_sock *req)
873 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
876 /* This can be called on a newly created socket, from other files */
877 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
878 u8 *newkey, u8 newkeylen)
880 /* Add Key to the list */
881 struct tcp_md5sig_key *key;
882 struct tcp_sock *tp = tcp_sk(sk);
883 struct tcp4_md5sig_key *keys;
885 key = tcp_v4_md5_do_lookup(sk, addr);
886 if (key) {
887 /* Pre-existing entry - just update that one. */
888 kfree(key->key);
889 key->key = newkey;
890 key->keylen = newkeylen;
891 } else {
892 struct tcp_md5sig_info *md5sig;
894 if (!tp->md5sig_info) {
895 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
896 GFP_ATOMIC);
897 if (!tp->md5sig_info) {
898 kfree(newkey);
899 return -ENOMEM;
901 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
903 if (tcp_alloc_md5sig_pool(sk) == NULL) {
904 kfree(newkey);
905 return -ENOMEM;
907 md5sig = tp->md5sig_info;
909 if (md5sig->alloced4 == md5sig->entries4) {
910 keys = kmalloc((sizeof(*keys) *
911 (md5sig->entries4 + 1)), GFP_ATOMIC);
912 if (!keys) {
913 kfree(newkey);
914 tcp_free_md5sig_pool();
915 return -ENOMEM;
918 if (md5sig->entries4)
919 memcpy(keys, md5sig->keys4,
920 sizeof(*keys) * md5sig->entries4);
922 /* Free old key list, and reference new one */
923 kfree(md5sig->keys4);
924 md5sig->keys4 = keys;
925 md5sig->alloced4++;
927 md5sig->entries4++;
928 md5sig->keys4[md5sig->entries4 - 1].addr = addr;
929 md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
930 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
932 return 0;
934 EXPORT_SYMBOL(tcp_v4_md5_do_add);
936 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
937 u8 *newkey, u8 newkeylen)
939 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
940 newkey, newkeylen);
943 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
945 struct tcp_sock *tp = tcp_sk(sk);
946 int i;
948 for (i = 0; i < tp->md5sig_info->entries4; i++) {
949 if (tp->md5sig_info->keys4[i].addr == addr) {
950 /* Free the key */
951 kfree(tp->md5sig_info->keys4[i].base.key);
952 tp->md5sig_info->entries4--;
954 if (tp->md5sig_info->entries4 == 0) {
955 kfree(tp->md5sig_info->keys4);
956 tp->md5sig_info->keys4 = NULL;
957 tp->md5sig_info->alloced4 = 0;
958 } else if (tp->md5sig_info->entries4 != i) {
959 /* Need to do some manipulation */
960 memmove(&tp->md5sig_info->keys4[i],
961 &tp->md5sig_info->keys4[i+1],
962 (tp->md5sig_info->entries4 - i) *
963 sizeof(struct tcp4_md5sig_key));
965 tcp_free_md5sig_pool();
966 return 0;
969 return -ENOENT;
971 EXPORT_SYMBOL(tcp_v4_md5_do_del);
973 static void tcp_v4_clear_md5_list(struct sock *sk)
975 struct tcp_sock *tp = tcp_sk(sk);
977 /* Free each key, then the set of keys,
978 * the crypto element, and then decrement our
979 * hold on the last resort crypto.
981 if (tp->md5sig_info->entries4) {
982 int i;
983 for (i = 0; i < tp->md5sig_info->entries4; i++)
984 kfree(tp->md5sig_info->keys4[i].base.key);
985 tp->md5sig_info->entries4 = 0;
986 tcp_free_md5sig_pool();
988 if (tp->md5sig_info->keys4) {
989 kfree(tp->md5sig_info->keys4);
990 tp->md5sig_info->keys4 = NULL;
991 tp->md5sig_info->alloced4 = 0;
995 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
996 int optlen)
998 struct tcp_md5sig cmd;
999 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1000 u8 *newkey;
1002 if (optlen < sizeof(cmd))
1003 return -EINVAL;
1005 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1006 return -EFAULT;
1008 if (sin->sin_family != AF_INET)
1009 return -EINVAL;
1011 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1012 if (!tcp_sk(sk)->md5sig_info)
1013 return -ENOENT;
1014 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1017 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1018 return -EINVAL;
1020 if (!tcp_sk(sk)->md5sig_info) {
1021 struct tcp_sock *tp = tcp_sk(sk);
1022 struct tcp_md5sig_info *p;
1024 p = kzalloc(sizeof(*p), sk->sk_allocation);
1025 if (!p)
1026 return -EINVAL;
1028 tp->md5sig_info = p;
1029 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1032 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1033 if (!newkey)
1034 return -ENOMEM;
1035 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1036 newkey, cmd.tcpm_keylen);
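/* Userspace installs a per-peer key with the TCP_MD5SIG socket option
 * before connect() or listen(); a minimal sketch (error handling and
 * includes omitted, addresses are examples):
 *
 *	struct tcp_md5sig md5;
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	memset(&md5, 0, sizeof(md5));
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * tcp_v4_parse_md5_keys() above handles that request; a zero
 * tcpm_keylen deletes the entry for the given address instead.
 */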
1039 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1040 __be32 daddr, __be32 saddr, int nbytes)
1042 struct tcp4_pseudohdr *bp;
1043 struct scatterlist sg;
1045 bp = &hp->md5_blk.ip4;
1048 * 1. the TCP pseudo-header (in the order: source IP address,
1049 * destination IP address, zero-padded protocol number, and
1050 * segment length)
1052 bp->saddr = saddr;
1053 bp->daddr = daddr;
1054 bp->pad = 0;
1055 bp->protocol = IPPROTO_TCP;
1056 bp->len = cpu_to_be16(nbytes);
1058 sg_init_one(&sg, bp, sizeof(*bp));
1059 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
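/* RFC 2385 digests, in this order: the pseudo-header above, the TCP
 * header with its checksum field zeroed, the segment payload (none for
 * the bare replies built by tcp_v4_md5_hash_hdr()), and finally the
 * key itself.
 */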
1062 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1063 __be32 daddr, __be32 saddr, struct tcphdr *th)
1065 struct tcp_md5sig_pool *hp;
1066 struct hash_desc *desc;
1068 hp = tcp_get_md5sig_pool();
1069 if (!hp)
1070 goto clear_hash_noput;
1071 desc = &hp->md5_desc;
1073 if (crypto_hash_init(desc))
1074 goto clear_hash;
1075 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1076 goto clear_hash;
1077 if (tcp_md5_hash_header(hp, th))
1078 goto clear_hash;
1079 if (tcp_md5_hash_key(hp, key))
1080 goto clear_hash;
1081 if (crypto_hash_final(desc, md5_hash))
1082 goto clear_hash;
1084 tcp_put_md5sig_pool();
1085 return 0;
1087 clear_hash:
1088 tcp_put_md5sig_pool();
1089 clear_hash_noput:
1090 memset(md5_hash, 0, 16);
1091 return 1;
1094 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1095 struct sock *sk, struct request_sock *req,
1096 struct sk_buff *skb)
1098 struct tcp_md5sig_pool *hp;
1099 struct hash_desc *desc;
1100 struct tcphdr *th = tcp_hdr(skb);
1101 __be32 saddr, daddr;
1103 if (sk) {
1104 saddr = inet_sk(sk)->inet_saddr;
1105 daddr = inet_sk(sk)->inet_daddr;
1106 } else if (req) {
1107 saddr = inet_rsk(req)->loc_addr;
1108 daddr = inet_rsk(req)->rmt_addr;
1109 } else {
1110 const struct iphdr *iph = ip_hdr(skb);
1111 saddr = iph->saddr;
1112 daddr = iph->daddr;
1115 hp = tcp_get_md5sig_pool();
1116 if (!hp)
1117 goto clear_hash_noput;
1118 desc = &hp->md5_desc;
1120 if (crypto_hash_init(desc))
1121 goto clear_hash;
1123 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1124 goto clear_hash;
1125 if (tcp_md5_hash_header(hp, th))
1126 goto clear_hash;
1127 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1128 goto clear_hash;
1129 if (tcp_md5_hash_key(hp, key))
1130 goto clear_hash;
1131 if (crypto_hash_final(desc, md5_hash))
1132 goto clear_hash;
1134 tcp_put_md5sig_pool();
1135 return 0;
1137 clear_hash:
1138 tcp_put_md5sig_pool();
1139 clear_hash_noput:
1140 memset(md5_hash, 0, 16);
1141 return 1;
1143 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1145 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1148 * This gets called for each TCP segment that arrives
1149 * so we want to be efficient.
1150 * We have 3 drop cases:
1151 * o No MD5 hash and one expected.
1152 * o MD5 hash and we're not expecting one.
1153 * o MD5 hash and it's wrong.
1155 __u8 *hash_location = NULL;
1156 struct tcp_md5sig_key *hash_expected;
1157 const struct iphdr *iph = ip_hdr(skb);
1158 struct tcphdr *th = tcp_hdr(skb);
1159 int genhash;
1160 unsigned char newhash[16];
1162 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1163 hash_location = tcp_parse_md5sig_option(th);
1165 /* We've parsed the options - do we have a hash? */
1166 if (!hash_expected && !hash_location)
1167 return 0;
1169 if (hash_expected && !hash_location) {
1170 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1171 return 1;
1174 if (!hash_expected && hash_location) {
1175 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1176 return 1;
1179 /* Okay, so this is hash_expected and hash_location -
1180 * so we need to calculate the checksum.
1182 genhash = tcp_v4_md5_hash_skb(newhash,
1183 hash_expected,
1184 NULL, NULL, skb);
1186 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1187 if (net_ratelimit()) {
1188 printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1189 &iph->saddr, ntohs(th->source),
1190 &iph->daddr, ntohs(th->dest),
1191 genhash ? " tcp_v4_calc_md5_hash failed" : "");
1193 return 1;
1195 return 0;
1198 #endif
1200 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1201 .family = PF_INET,
1202 .obj_size = sizeof(struct tcp_request_sock),
1203 .rtx_syn_ack = tcp_v4_rtx_synack,
1204 .send_ack = tcp_v4_reqsk_send_ack,
1205 .destructor = tcp_v4_reqsk_destructor,
1206 .send_reset = tcp_v4_send_reset,
1207 .syn_ack_timeout = tcp_syn_ack_timeout,
1210 #ifdef CONFIG_TCP_MD5SIG
1211 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1212 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1213 .calc_md5_hash = tcp_v4_md5_hash_skb,
1215 #endif
1217 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1218 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1219 .twsk_unique = tcp_twsk_unique,
1220 .twsk_destructor= tcp_twsk_destructor,
1223 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1225 struct tcp_extend_values tmp_ext;
1226 struct tcp_options_received tmp_opt;
1227 u8 *hash_location;
1228 struct request_sock *req;
1229 struct inet_request_sock *ireq;
1230 struct tcp_sock *tp = tcp_sk(sk);
1231 struct dst_entry *dst = NULL;
1232 __be32 saddr = ip_hdr(skb)->saddr;
1233 __be32 daddr = ip_hdr(skb)->daddr;
1234 __u32 isn = TCP_SKB_CB(skb)->when;
1235 #ifdef CONFIG_SYN_COOKIES
1236 int want_cookie = 0;
1237 #else
1238 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1239 #endif
1241 /* Never answer SYNs sent to broadcast or multicast */
1242 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1243 goto drop;
1245 /* TW buckets are converted to open requests without
1246 * limitations: they conserve resources and the peer is
1247 * evidently a real one.
1249 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1250 if (net_ratelimit())
1251 syn_flood_warning(skb);
1252 #ifdef CONFIG_SYN_COOKIES
1253 if (sysctl_tcp_syncookies) {
1254 want_cookie = 1;
1255 } else
1256 #endif
1257 goto drop;
1260 /* Accept backlog is full. If we have already queued enough
1261 * warm entries in the syn queue, drop the request. It is better than
1262 * clogging the syn queue with openreqs with exponentially increasing
1263 * timeout.
1265 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1266 goto drop;
1268 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1269 if (!req)
1270 goto drop;
1272 #ifdef CONFIG_TCP_MD5SIG
1273 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1274 #endif
1276 tcp_clear_options(&tmp_opt);
1277 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1278 tmp_opt.user_mss = tp->rx_opt.user_mss;
1279 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1281 if (tmp_opt.cookie_plus > 0 &&
1282 tmp_opt.saw_tstamp &&
1283 !tp->rx_opt.cookie_out_never &&
1284 (sysctl_tcp_cookie_size > 0 ||
1285 (tp->cookie_values != NULL &&
1286 tp->cookie_values->cookie_desired > 0))) {
1287 u8 *c;
1288 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1289 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1291 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1292 goto drop_and_release;
1294 /* Secret recipe starts with IP addresses */
1295 *mess++ ^= (__force u32)daddr;
1296 *mess++ ^= (__force u32)saddr;
1298 /* plus variable length Initiator Cookie */
1299 c = (u8 *)mess;
1300 while (l-- > 0)
1301 *c++ ^= *hash_location++;
1303 #ifdef CONFIG_SYN_COOKIES
1304 want_cookie = 0; /* not our kind of cookie */
1305 #endif
1306 tmp_ext.cookie_out_never = 0; /* false */
1307 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1308 } else if (!tp->rx_opt.cookie_in_always) {
1309 /* redundant indications, but ensure initialization. */
1310 tmp_ext.cookie_out_never = 1; /* true */
1311 tmp_ext.cookie_plus = 0;
1312 } else {
1313 goto drop_and_release;
1315 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1317 if (want_cookie && !tmp_opt.saw_tstamp)
1318 tcp_clear_options(&tmp_opt);
1320 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1321 tcp_openreq_init(req, &tmp_opt, skb);
1323 ireq = inet_rsk(req);
1324 ireq->loc_addr = daddr;
1325 ireq->rmt_addr = saddr;
1326 ireq->no_srccheck = inet_sk(sk)->transparent;
1327 ireq->opt = tcp_v4_save_options(sk, skb);
1329 if (security_inet_conn_request(sk, skb, req))
1330 goto drop_and_free;
1332 if (!want_cookie || tmp_opt.tstamp_ok)
1333 TCP_ECN_create_request(req, tcp_hdr(skb));
1335 if (want_cookie) {
1336 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1337 req->cookie_ts = tmp_opt.tstamp_ok;
1338 } else if (!isn) {
1339 struct inet_peer *peer = NULL;
1341 /* VJ's idea. We save last timestamp seen
1342 * from the destination in peer table, when entering
1343 * state TIME-WAIT, and check against it before
1344 * accepting new connection request.
1346 * If "isn" is not zero, this request hit alive
1347 * timewait bucket, so that all the necessary checks
1348 * are made in the function processing timewait state.
1350 if (tmp_opt.saw_tstamp &&
1351 tcp_death_row.sysctl_tw_recycle &&
1352 (dst = inet_csk_route_req(sk, req)) != NULL &&
1353 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1354 peer->v4daddr == saddr) {
1355 inet_peer_refcheck(peer);
1356 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1357 (s32)(peer->tcp_ts - req->ts_recent) >
1358 TCP_PAWS_WINDOW) {
1359 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1360 goto drop_and_release;
1363 /* Kill the following clause, if you dislike this way. */
1364 else if (!sysctl_tcp_syncookies &&
1365 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1366 (sysctl_max_syn_backlog >> 2)) &&
1367 (!peer || !peer->tcp_ts_stamp) &&
1368 (!dst || !dst_metric(dst, RTAX_RTT))) {
1369 /* Without syncookies the last quarter of the
1370 * backlog is filled with destinations
1371 * proven to be alive.
1372 * It means that we continue to communicate
1373 * with destinations already remembered
1374 * at the moment of the synflood.
1376 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1377 &saddr, ntohs(tcp_hdr(skb)->source));
1378 goto drop_and_release;
1381 isn = tcp_v4_init_sequence(skb);
1383 tcp_rsk(req)->snt_isn = isn;
1385 if (tcp_v4_send_synack(sk, dst, req,
1386 (struct request_values *)&tmp_ext) ||
1387 want_cookie)
1388 goto drop_and_free;
1390 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1391 return 0;
1393 drop_and_release:
1394 dst_release(dst);
1395 drop_and_free:
1396 reqsk_free(req);
1397 drop:
1398 return 0;
1400 EXPORT_SYMBOL(tcp_v4_conn_request);
1404 * The three way handshake has completed - we got a valid synack -
1405 * now create the new socket.
1407 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1408 struct request_sock *req,
1409 struct dst_entry *dst)
1411 struct inet_request_sock *ireq;
1412 struct inet_sock *newinet;
1413 struct tcp_sock *newtp;
1414 struct sock *newsk;
1415 #ifdef CONFIG_TCP_MD5SIG
1416 struct tcp_md5sig_key *key;
1417 #endif
1419 if (sk_acceptq_is_full(sk))
1420 goto exit_overflow;
1422 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1423 goto exit;
1425 newsk = tcp_create_openreq_child(sk, req, skb);
1426 if (!newsk)
1427 goto exit;
1429 newsk->sk_gso_type = SKB_GSO_TCPV4;
1430 sk_setup_caps(newsk, dst);
1432 newtp = tcp_sk(newsk);
1433 newinet = inet_sk(newsk);
1434 ireq = inet_rsk(req);
1435 newinet->inet_daddr = ireq->rmt_addr;
1436 newinet->inet_rcv_saddr = ireq->loc_addr;
1437 newinet->inet_saddr = ireq->loc_addr;
1438 newinet->opt = ireq->opt;
1439 ireq->opt = NULL;
1440 newinet->mc_index = inet_iif(skb);
1441 newinet->mc_ttl = ip_hdr(skb)->ttl;
1442 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1443 if (newinet->opt)
1444 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1445 newinet->inet_id = newtp->write_seq ^ jiffies;
1447 tcp_mtup_init(newsk);
1448 tcp_sync_mss(newsk, dst_mtu(dst));
1449 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1450 if (tcp_sk(sk)->rx_opt.user_mss &&
1451 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1452 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1454 tcp_initialize_rcv_mss(newsk);
1456 #ifdef CONFIG_TCP_MD5SIG
1457 /* Copy over the MD5 key from the original socket */
1458 key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1459 if (key != NULL) {
1461 * We're using one, so create a matching key
1462 * on the newsk structure. If we fail to get
1463 * memory, then we end up not copying the key
1464 * across. Shucks.
1466 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1467 if (newkey != NULL)
1468 tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1469 newkey, key->keylen);
1470 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1472 #endif
1474 __inet_hash_nolisten(newsk, NULL);
1475 __inet_inherit_port(sk, newsk);
1477 return newsk;
1479 exit_overflow:
1480 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1481 exit:
1482 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1483 dst_release(dst);
1484 return NULL;
1486 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1488 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1490 struct tcphdr *th = tcp_hdr(skb);
1491 const struct iphdr *iph = ip_hdr(skb);
1492 struct sock *nsk;
1493 struct request_sock **prev;
1494 /* Find possible connection requests. */
1495 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1496 iph->saddr, iph->daddr);
1497 if (req)
1498 return tcp_check_req(sk, skb, req, prev);
1500 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1501 th->source, iph->daddr, th->dest, inet_iif(skb));
1503 if (nsk) {
1504 if (nsk->sk_state != TCP_TIME_WAIT) {
1505 bh_lock_sock(nsk);
1506 return nsk;
1508 inet_twsk_put(inet_twsk(nsk));
1509 return NULL;
1512 #ifdef CONFIG_SYN_COOKIES
1513 if (!th->syn)
1514 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1515 #endif
1516 return sk;
1519 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1521 const struct iphdr *iph = ip_hdr(skb);
1523 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1524 if (!tcp_v4_check(skb->len, iph->saddr,
1525 iph->daddr, skb->csum)) {
1526 skb->ip_summed = CHECKSUM_UNNECESSARY;
1527 return 0;
1531 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1532 skb->len, IPPROTO_TCP, 0);
1534 if (skb->len <= 76) {
1535 return __skb_checksum_complete(skb);
1537 return 0;
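/* CHECKSUM_COMPLETE means the device already summed the whole packet,
 * so it is verified against the pseudo-header right here. Otherwise
 * skb->csum is seeded with the pseudo-header sum; short segments
 * (<= 76 bytes) are checked immediately, longer ones are left for
 * later, when the data is copied or checksummed anyway.
 */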
1541 /* The socket must have its spinlock held when we get
1542 * here.
1544 * We have a potential double-lock case here, so even when
1545 * doing backlog processing we use the BH locking scheme.
1546 * This is because we cannot sleep with the original spinlock
1547 * held.
1549 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1551 struct sock *rsk;
1552 #ifdef CONFIG_TCP_MD5SIG
1554 * We really want to reject the packet as early as possible
1555 * if:
1556 * o We're expecting an MD5'd packet and there is no MD5 TCP option
1557 * o There is an MD5 option and we're not expecting one
1559 if (tcp_v4_inbound_md5_hash(sk, skb))
1560 goto discard;
1561 #endif
1563 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1564 sock_rps_save_rxhash(sk, skb->rxhash);
1565 TCP_CHECK_TIMER(sk);
1566 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1567 rsk = sk;
1568 goto reset;
1570 TCP_CHECK_TIMER(sk);
1571 return 0;
1574 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1575 goto csum_err;
1577 if (sk->sk_state == TCP_LISTEN) {
1578 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1579 if (!nsk)
1580 goto discard;
1582 if (nsk != sk) {
1583 if (tcp_child_process(sk, nsk, skb)) {
1584 rsk = nsk;
1585 goto reset;
1587 return 0;
1589 } else
1590 sock_rps_save_rxhash(sk, skb->rxhash);
1593 TCP_CHECK_TIMER(sk);
1594 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1595 rsk = sk;
1596 goto reset;
1598 TCP_CHECK_TIMER(sk);
1599 return 0;
1601 reset:
1602 tcp_v4_send_reset(rsk, skb);
1603 discard:
1604 kfree_skb(skb);
1605 /* Be careful here. If this function gets more complicated and
1606 * gcc suffers from register pressure on the x86, sk (in %ebx)
1607 * might be destroyed here. This current version compiles correctly,
1608 * but you have been warned.
1610 return 0;
1612 csum_err:
1613 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1614 goto discard;
1616 EXPORT_SYMBOL(tcp_v4_do_rcv);
1619 * From tcp_input.c
1622 int BCMFASTPATH_HOST tcp_v4_rcv(struct sk_buff *skb)
1624 const struct iphdr *iph;
1625 struct tcphdr *th;
1626 struct sock *sk;
1627 int ret;
1628 struct net *net = dev_net(skb->dev);
1630 if (skb->pkt_type != PACKET_HOST)
1631 goto discard_it;
1633 /* Count it even if it's bad */
1634 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1636 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1637 goto discard_it;
1639 th = tcp_hdr(skb);
1641 if (th->doff < sizeof(struct tcphdr) / 4)
1642 goto bad_packet;
1643 if (!pskb_may_pull(skb, th->doff * 4))
1644 goto discard_it;
1646 /* An explanation is required here, I think.
1647 * Packet length and doff are validated by header prediction,
1648 * provided case of th->doff==0 is eliminated.
1649 * So, we defer the checks. */
1650 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1651 goto bad_packet;
1653 th = tcp_hdr(skb);
1654 iph = ip_hdr(skb);
1655 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1656 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1657 skb->len - th->doff * 4);
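/* end_seq counts the payload plus one for SYN and one for FIN, since
 * each of those flags consumes a unit of sequence space. */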
1658 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1659 TCP_SKB_CB(skb)->when = 0;
1660 TCP_SKB_CB(skb)->flags = iph->tos;
1661 TCP_SKB_CB(skb)->sacked = 0;
1663 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1664 if (!sk)
1665 goto no_tcp_socket;
1667 process:
1668 if (sk->sk_state == TCP_TIME_WAIT)
1669 goto do_time_wait;
1671 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1672 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1673 goto discard_and_relse;
1676 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1677 goto discard_and_relse;
1678 nf_reset(skb);
1680 if (sk_filter(sk, skb))
1681 goto discard_and_relse;
1683 skb->dev = NULL;
1685 bh_lock_sock_nested(sk);
1686 ret = 0;
1687 if (!sock_owned_by_user(sk)) {
1688 #ifdef CONFIG_NET_DMA
1689 struct tcp_sock *tp = tcp_sk(sk);
1690 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1691 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1692 if (tp->ucopy.dma_chan)
1693 ret = tcp_v4_do_rcv(sk, skb);
1694 else
1695 #endif
1697 if (!tcp_prequeue(sk, skb))
1698 ret = tcp_v4_do_rcv(sk, skb);
1700 } else if (unlikely(sk_add_backlog(sk, skb))) {
1701 bh_unlock_sock(sk);
1702 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1703 goto discard_and_relse;
1705 bh_unlock_sock(sk);
1707 sock_put(sk);
1709 return ret;
1711 no_tcp_socket:
1712 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1713 goto discard_it;
1715 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1716 bad_packet:
1717 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1718 } else {
1719 tcp_v4_send_reset(NULL, skb);
1722 discard_it:
1723 /* Discard frame. */
1724 kfree_skb(skb);
1725 return 0;
1727 discard_and_relse:
1728 sock_put(sk);
1729 goto discard_it;
1731 do_time_wait:
1732 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1733 inet_twsk_put(inet_twsk(sk));
1734 goto discard_it;
1737 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1738 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1739 inet_twsk_put(inet_twsk(sk));
1740 goto discard_it;
1742 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1743 case TCP_TW_SYN: {
1744 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1745 &tcp_hashinfo,
1746 iph->daddr, th->dest,
1747 inet_iif(skb));
1748 if (sk2) {
1749 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1750 inet_twsk_put(inet_twsk(sk));
1751 sk = sk2;
1752 goto process;
1754 /* Fall through to ACK */
1756 case TCP_TW_ACK:
1757 tcp_v4_timewait_ack(sk, skb);
1758 break;
1759 case TCP_TW_RST:
1760 goto no_tcp_socket;
1761 case TCP_TW_SUCCESS:;
1763 goto discard_it;
1766 /* VJ's idea. Save last timestamp seen from this destination
1767 * and hold it at least for normal timewait interval to use for duplicate
1768 * segment detection in subsequent connections, before they enter synchronized
1769 * state.
1772 int tcp_v4_remember_stamp(struct sock *sk)
1774 struct inet_sock *inet = inet_sk(sk);
1775 struct tcp_sock *tp = tcp_sk(sk);
1776 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1777 struct inet_peer *peer = NULL;
1778 int release_it = 0;
1780 if (!rt || rt->rt_dst != inet->inet_daddr) {
1781 peer = inet_getpeer(inet->inet_daddr, 1);
1782 release_it = 1;
1783 } else {
1784 if (!rt->peer)
1785 rt_bind_peer(rt, 1);
1786 peer = rt->peer;
1789 if (peer) {
1790 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1791 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1792 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1793 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1794 peer->tcp_ts = tp->rx_opt.ts_recent;
1796 if (release_it)
1797 inet_putpeer(peer);
1798 return 1;
1801 return 0;
1803 EXPORT_SYMBOL(tcp_v4_remember_stamp);
1805 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1807 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1809 if (peer) {
1810 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1812 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1813 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1814 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1815 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1816 peer->tcp_ts = tcptw->tw_ts_recent;
1818 inet_putpeer(peer);
1819 return 1;
1822 return 0;
1825 const struct inet_connection_sock_af_ops ipv4_specific = {
1826 .queue_xmit = ip_queue_xmit,
1827 .send_check = tcp_v4_send_check,
1828 .rebuild_header = inet_sk_rebuild_header,
1829 .conn_request = tcp_v4_conn_request,
1830 .syn_recv_sock = tcp_v4_syn_recv_sock,
1831 .remember_stamp = tcp_v4_remember_stamp,
1832 .net_header_len = sizeof(struct iphdr),
1833 .setsockopt = ip_setsockopt,
1834 .getsockopt = ip_getsockopt,
1835 .addr2sockaddr = inet_csk_addr2sockaddr,
1836 .sockaddr_len = sizeof(struct sockaddr_in),
1837 .bind_conflict = inet_csk_bind_conflict,
1838 #ifdef CONFIG_COMPAT
1839 .compat_setsockopt = compat_ip_setsockopt,
1840 .compat_getsockopt = compat_ip_getsockopt,
1841 #endif
1843 EXPORT_SYMBOL(ipv4_specific);
1845 #ifdef CONFIG_TCP_MD5SIG
1846 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1847 .md5_lookup = tcp_v4_md5_lookup,
1848 .calc_md5_hash = tcp_v4_md5_hash_skb,
1849 .md5_add = tcp_v4_md5_add_func,
1850 .md5_parse = tcp_v4_parse_md5_keys,
1852 #endif
1854 /* NOTE: A lot of things are set to zero explicitly by the call to
1855 * sk_alloc(), so they need not be done here.
1857 static int tcp_v4_init_sock(struct sock *sk)
1859 struct inet_connection_sock *icsk = inet_csk(sk);
1860 struct tcp_sock *tp = tcp_sk(sk);
1862 skb_queue_head_init(&tp->out_of_order_queue);
1863 tcp_init_xmit_timers(sk);
1864 tcp_prequeue_init(tp);
1866 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1867 tp->mdev = TCP_TIMEOUT_INIT;
1869 /* So many TCP implementations out there (incorrectly) count the
1870 * initial SYN frame in their delayed-ACK and congestion control
1871 * algorithms that we must have the following bandaid to talk
1872 * efficiently to them. -DaveM
1874 tp->snd_cwnd = 2;
1876 /* See draft-stevens-tcpca-spec-01 for discussion of the
1877 * initialization of these values.
1879 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1880 tp->snd_cwnd_clamp = ~0;
1881 tp->mss_cache = TCP_MSS_DEFAULT;
1883 tp->reordering = sysctl_tcp_reordering;
1884 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1886 sk->sk_state = TCP_CLOSE;
1888 sk->sk_write_space = sk_stream_write_space;
1889 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1891 icsk->icsk_af_ops = &ipv4_specific;
1892 icsk->icsk_sync_mss = tcp_sync_mss;
1893 #ifdef CONFIG_TCP_MD5SIG
1894 tp->af_specific = &tcp_sock_ipv4_specific;
1895 #endif
1897 /* TCP Cookie Transactions */
1898 if (sysctl_tcp_cookie_size > 0) {
1899 /* Default, cookies without s_data_payload. */
1900 tp->cookie_values =
1901 kzalloc(sizeof(*tp->cookie_values),
1902 sk->sk_allocation);
1903 if (tp->cookie_values != NULL)
1904 kref_init(&tp->cookie_values->kref);
1906 /* Presumed zeroed, in order of appearance:
1907 * cookie_in_always, cookie_out_never,
1908 * s_data_constant, s_data_in, s_data_out
1910 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1911 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1913 local_bh_disable();
1914 percpu_counter_inc(&tcp_sockets_allocated);
1915 local_bh_enable();
1917 return 0;
1920 void tcp_v4_destroy_sock(struct sock *sk)
1922 struct tcp_sock *tp = tcp_sk(sk);
1924 tcp_clear_xmit_timers(sk);
1926 tcp_cleanup_congestion_control(sk);
1928 /* Clean up the write buffer. */
1929 tcp_write_queue_purge(sk);
1931 /* Cleans up our, hopefully empty, out_of_order_queue. */
1932 __skb_queue_purge(&tp->out_of_order_queue);
1934 #ifdef CONFIG_TCP_MD5SIG
1935 /* Clean up the MD5 key list, if any */
1936 if (tp->md5sig_info) {
1937 tcp_v4_clear_md5_list(sk);
1938 kfree(tp->md5sig_info);
1939 tp->md5sig_info = NULL;
1941 #endif
1943 #ifdef CONFIG_NET_DMA
1944 /* Cleans up our sk_async_wait_queue */
1945 __skb_queue_purge(&sk->sk_async_wait_queue);
1946 #endif
1948 /* Clean the prequeue; it must really be empty */
1949 __skb_queue_purge(&tp->ucopy.prequeue);
1951 /* Clean up a referenced TCP bind bucket. */
1952 if (inet_csk(sk)->icsk_bind_hash)
1953 inet_put_port(sk);
1956 * If sendmsg cached page exists, toss it.
1958 if (sk->sk_sndmsg_page) {
1959 __free_page(sk->sk_sndmsg_page);
1960 sk->sk_sndmsg_page = NULL;
1963 /* TCP Cookie Transactions */
1964 if (tp->cookie_values != NULL) {
1965 kref_put(&tp->cookie_values->kref,
1966 tcp_cookie_values_release);
1967 tp->cookie_values = NULL;
1970 percpu_counter_dec(&tcp_sockets_allocated);
1972 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1974 #ifdef CONFIG_PROC_FS
1975 /* Proc filesystem TCP sock list dumping. */
1977 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1979 return hlist_nulls_empty(head) ? NULL :
1980 list_entry(head->first, struct inet_timewait_sock, tw_node);
1983 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1985 return !is_a_nulls(tw->tw_node.next) ?
1986 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1990 * Get the next listener socket following cur. If cur is NULL, get the first socket
1991 * starting from bucket given in st->bucket; when st->bucket is zero the
1992 * very first socket in the hash table is returned.
1994 static void *listening_get_next(struct seq_file *seq, void *cur)
1996 struct inet_connection_sock *icsk;
1997 struct hlist_nulls_node *node;
1998 struct sock *sk = cur;
1999 struct inet_listen_hashbucket *ilb;
2000 struct tcp_iter_state *st = seq->private;
2001 struct net *net = seq_file_net(seq);
2003 if (!sk) {
2004 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2005 spin_lock_bh(&ilb->lock);
2006 sk = sk_nulls_head(&ilb->head);
2007 st->offset = 0;
2008 goto get_sk;
2010 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2011 ++st->num;
2012 ++st->offset;
2014 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2015 struct request_sock *req = cur;
2017 icsk = inet_csk(st->syn_wait_sk);
2018 req = req->dl_next;
2019 while (1) {
2020 while (req) {
2021 if (req->rsk_ops->family == st->family) {
2022 cur = req;
2023 goto out;
2025 req = req->dl_next;
2027 st->offset = 0;
2028 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2029 break;
2030 get_req:
2031 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2033 sk = sk_next(st->syn_wait_sk);
2034 st->state = TCP_SEQ_STATE_LISTENING;
2035 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2036 } else {
2037 icsk = inet_csk(sk);
2038 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2039 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2040 goto start_req;
2041 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2042 sk = sk_next(sk);
2044 get_sk:
2045 sk_nulls_for_each_from(sk, node) {
2046 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2047 cur = sk;
2048 goto out;
2050 icsk = inet_csk(sk);
2051 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2052 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2053 start_req:
2054 st->uid = sock_i_uid(sk);
2055 st->syn_wait_sk = sk;
2056 st->state = TCP_SEQ_STATE_OPENREQ;
2057 st->sbucket = 0;
2058 goto get_req;
2060 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2062 spin_unlock_bh(&ilb->lock);
2063 st->offset = 0;
2064 if (++st->bucket < INET_LHTABLE_SIZE) {
2065 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2066 spin_lock_bh(&ilb->lock);
2067 sk = sk_nulls_head(&ilb->head);
2068 goto get_sk;
2070 cur = NULL;
2071 out:
2072 return cur;
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static inline int empty_bucket(struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		struct inet_timewait_sock *tw;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		spin_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* Look for the next non-empty bucket */
		st->offset = 0;
		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
		       empty_bucket(st))
			;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			return NULL;

		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
		sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
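/*
 * Each established-hash bucket is scanned in two phases: the regular chain
 * of live sockets first, then the twchain of TIME_WAIT sockets, with
 * st->state recording which phase the iterator is in so that
 * tcp4_seq_show() picks the matching formatting helper.
 */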
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
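/*
 * tcp_seek_last_pos() lets a chunked read of /proc/net/tcp resume from the
 * bucket and offset remembered in the iterator state instead of rescanning
 * the hash tables from the beginning; tcp_seq_start() only uses it when the
 * requested *pos matches the position recorded by the previous read.
 */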
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
		/* fall through: the listening bucket lock is still held */
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family = afinfo->family;
	s->last_pos = 0;
	return 0;
}
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_fops.open		= tcp_seq_open;
	afinfo->seq_fops.read		= seq_read;
	afinfo->seq_fops.llseek		= seq_lseek;
	afinfo->seq_fops.release	= seq_release_net;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     &afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}
EXPORT_SYMBOL(tcp_proc_unregister);
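/*
 * Sketch of how a caller hooks an additional dump into this infrastructure
 * (the "tcp_example" name and my_show_fn are hypothetical, for illustration
 * only; tcp4_seq_afinfo further below is the real AF_INET instance, and
 * tcp_ipv6.c registers a matching tcp6 one):
 *
 *	static struct tcp_seq_afinfo my_afinfo = {
 *		.name	 = "tcp_example",
 *		.family	 = AF_INET,
 *		.seq_ops = { .show = my_show_fn },
 *	};
 *
 *	err = tcp_proc_register(net, &my_afinfo);
 *	...
 *	tcp_proc_unregister(net, &my_afinfo);
 */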
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non-standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}
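/*
 * In the /proc/net/tcp lines emitted by these helpers, the timer ("tr")
 * field follows the usual convention: 0 = no timer pending, 1 = retransmit
 * timer, 2 = another timer such as keepalive (sk_timer), 3 = TIME_WAIT
 * socket (see get_timewait4_sock()), 4 = zero window probe timer.
 */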
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active = 1;
		timer_expires = icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active = 2;
		timer_expires = sk->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket, we might find a
		 * transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
		len);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_INET_GRO
extern atomic_t gro_timer_init;
#endif
struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct iphdr *iph;

#ifdef CONFIG_INET_GRO
	if (atomic_read(&gro_timer_init))
		return tcp_gro_receive(head, skb);
#else
	/* We don't support hw-checksum. Skip this part to do real TCP merge */
	iph = skb_gro_network_header(skb);
	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}

		/* fall through */
	case CHECKSUM_NONE:
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	return tcp_gro_receive(head, skb);
#endif /* CONFIG_INET_GRO */
}
EXPORT_SYMBOL(tcp4_gro_receive);
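/*
 * Once GRO has merged a train of segments, tcp4_gro_complete() below seeds
 * the TCP checksum with the IPv4 pseudo-header (leaving the payload sum to
 * be finished later) and marks the aggregate as SKB_GSO_TCPV4 so it can be
 * resegmented on output.
 */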
int BCMFASTPATH_HOST tcp4_gro_complete(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
				  iph->saddr, iph->daddr, 0);
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}
EXPORT_SYMBOL(tcp4_gro_complete);
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
EXPORT_SYMBOL(tcp_prot);
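/*
 * tcp_prot itself is not registered here: inet_init() in af_inet.c calls
 * proto_register() on it and wires it to the SOCK_STREAM/IPPROTO_TCP entry
 * of the inetsw table.
 */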
static int __net_init tcp_sk_init(struct net *net)
{
	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}

static void __net_exit tcp_sk_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");