1 /* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
14 * linux/ipv4/tcp_input.c
15 * linux/ipv4/tcp_output.c
17 * See tcp.c for author information
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
27 * David S. Miller : New socket lookup architecture.
28 * This code is dedicated to John Dyson.
29 * David S. Miller : Change semantics of established hash,
30 * half is devoted to TIME_WAIT sockets
31 * and the rest go in the other half.
32 * Andi Kleen : Add support for syncookies and fixed
33 * some bugs: ip options weren't passed to
34 * the TCP layer, missed a check for an
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * request_sock handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * Andi Kleen : Fix new listen.
48 * Andi Kleen : Fix accept error reporting.
49 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
50 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
51 * a single port at the same time.
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
68 #include <net/inet_hashtables.h>
70 #include <net/transp_v6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
75 #include <net/netdma.h>
77 #include <linux/inet.h>
78 #include <linux/ipv6.h>
79 #include <linux/stddef.h>
80 #include <linux/proc_fs.h>
81 #include <linux/seq_file.h>
83 #include <linux/crypto.h>
84 #include <linux/scatterlist.h>
89 int sysctl_tcp_tw_reuse __read_mostly
;
90 int sysctl_tcp_low_latency __read_mostly
;
91 EXPORT_SYMBOL(sysctl_tcp_low_latency
);
94 #ifdef CONFIG_TCP_MD5SIG
95 static struct tcp_md5sig_key
*tcp_v4_md5_do_lookup(struct sock
*sk
,
97 static int tcp_v4_md5_hash_hdr(char *md5_hash
, struct tcp_md5sig_key
*key
,
98 __be32 daddr
, __be32 saddr
, struct tcphdr
*th
);
101 struct tcp_md5sig_key
*tcp_v4_md5_do_lookup(struct sock
*sk
, __be32 addr
)
107 struct inet_hashinfo tcp_hashinfo
;
108 EXPORT_SYMBOL(tcp_hashinfo
);
110 static inline __u32
tcp_v4_init_sequence(struct sk_buff
*skb
)
112 return secure_tcp_sequence_number(ip_hdr(skb
)->daddr
,
115 tcp_hdr(skb
)->source
);
118 int tcp_twsk_unique(struct sock
*sk
, struct sock
*sktw
, void *twp
)
120 const struct tcp_timewait_sock
*tcptw
= tcp_twsk(sktw
);
121 struct tcp_sock
*tp
= tcp_sk(sk
);
123 /* With PAWS, it is safe from the viewpoint
124 of data integrity. Even without PAWS it is safe provided sequence
125 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
127 Actually, the idea is close to VJ's one, only timestamp cache is
128 held not per host, but per port pair and TW bucket is used as state
131 If TW bucket has been already destroyed we fall back to VJ's scheme
132 and use initial timestamp retrieved from peer table.
134 if (tcptw
->tw_ts_recent_stamp
&&
135 (twp
== NULL
|| (sysctl_tcp_tw_reuse
&&
136 get_seconds() - tcptw
->tw_ts_recent_stamp
> 1))) {
137 tp
->write_seq
= tcptw
->tw_snd_nxt
+ 65535 + 2;
138 if (tp
->write_seq
== 0)
140 tp
->rx_opt
.ts_recent
= tcptw
->tw_ts_recent
;
141 tp
->rx_opt
.ts_recent_stamp
= tcptw
->tw_ts_recent_stamp
;
148 EXPORT_SYMBOL_GPL(tcp_twsk_unique
);
150 /* This will initiate an outgoing connection. */
151 int tcp_v4_connect(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
153 struct inet_sock
*inet
= inet_sk(sk
);
154 struct tcp_sock
*tp
= tcp_sk(sk
);
155 struct sockaddr_in
*usin
= (struct sockaddr_in
*)uaddr
;
157 __be32 daddr
, nexthop
;
161 if (addr_len
< sizeof(struct sockaddr_in
))
164 if (usin
->sin_family
!= AF_INET
)
165 return -EAFNOSUPPORT
;
167 nexthop
= daddr
= usin
->sin_addr
.s_addr
;
168 if (inet
->opt
&& inet
->opt
->srr
) {
171 nexthop
= inet
->opt
->faddr
;
174 tmp
= ip_route_connect(&rt
, nexthop
, inet
->inet_saddr
,
175 RT_CONN_FLAGS(sk
), sk
->sk_bound_dev_if
,
177 inet
->inet_sport
, usin
->sin_port
, sk
, 1);
179 if (tmp
== -ENETUNREACH
)
180 IP_INC_STATS_BH(sock_net(sk
), IPSTATS_MIB_OUTNOROUTES
);
184 if (rt
->rt_flags
& (RTCF_MULTICAST
| RTCF_BROADCAST
)) {
189 if (!inet
->opt
|| !inet
->opt
->srr
)
192 if (!inet
->inet_saddr
)
193 inet
->inet_saddr
= rt
->rt_src
;
194 inet
->inet_rcv_saddr
= inet
->inet_saddr
;
196 if (tp
->rx_opt
.ts_recent_stamp
&& inet
->inet_daddr
!= daddr
) {
197 /* Reset inherited state */
198 tp
->rx_opt
.ts_recent
= 0;
199 tp
->rx_opt
.ts_recent_stamp
= 0;
203 if (tcp_death_row
.sysctl_tw_recycle
&&
204 !tp
->rx_opt
.ts_recent_stamp
&& rt
->rt_dst
== daddr
) {
205 struct inet_peer
*peer
= rt_get_peer(rt
);
207 * VJ's idea. We save last timestamp seen from
208 * the destination in peer table, when entering state
209 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
210 * when trying new connection.
213 inet_peer_refcheck(peer
);
214 if ((u32
)get_seconds() - peer
->tcp_ts_stamp
<= TCP_PAWS_MSL
) {
215 tp
->rx_opt
.ts_recent_stamp
= peer
->tcp_ts_stamp
;
216 tp
->rx_opt
.ts_recent
= peer
->tcp_ts
;
221 inet
->inet_dport
= usin
->sin_port
;
222 inet
->inet_daddr
= daddr
;
224 inet_csk(sk
)->icsk_ext_hdr_len
= 0;
226 inet_csk(sk
)->icsk_ext_hdr_len
= inet
->opt
->optlen
;
228 tp
->rx_opt
.mss_clamp
= TCP_MSS_DEFAULT
;
230 /* Socket identity is still unknown (sport may be zero).
231 * However we set state to SYN-SENT and not releasing socket
232 * lock select source port, enter ourselves into the hash tables and
233 * complete initialization after this.
235 tcp_set_state(sk
, TCP_SYN_SENT
);
236 err
= inet_hash_connect(&tcp_death_row
, sk
);
240 err
= ip_route_newports(&rt
, IPPROTO_TCP
,
241 inet
->inet_sport
, inet
->inet_dport
, sk
);
245 /* OK, now commit destination to socket. */
246 sk
->sk_gso_type
= SKB_GSO_TCPV4
;
247 sk_setup_caps(sk
, &rt
->dst
);
250 tp
->write_seq
= secure_tcp_sequence_number(inet
->inet_saddr
,
255 inet
->inet_id
= tp
->write_seq
^ jiffies
;
257 err
= tcp_connect(sk
);
266 * This unhashes the socket and releases the local port,
269 tcp_set_state(sk
, TCP_CLOSE
);
271 sk
->sk_route_caps
= 0;
272 inet
->inet_dport
= 0;
275 EXPORT_SYMBOL(tcp_v4_connect
);
278 * This routine does path mtu discovery as defined in RFC1191.
280 static void do_pmtu_discovery(struct sock
*sk
, struct iphdr
*iph
, u32 mtu
)
282 struct dst_entry
*dst
;
283 struct inet_sock
*inet
= inet_sk(sk
);
285 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
286 * sent out by Linux are always <576 bytes so they should go through
289 if (sk
->sk_state
== TCP_LISTEN
)
292 /* We don't check in the destentry if pmtu discovery is forbidden
293 * on this route. We just assume that no packet-too-big packets
294 * are sent back when PMTU discovery is not active.
295 * There is a small race when the user changes this flag in the
296 * route, but I think that's acceptable.
298 if ((dst
= __sk_dst_check(sk
, 0)) == NULL
)
301 dst
->ops
->update_pmtu(dst
, mtu
);
303 /* Something is about to be wrong... Remember soft error
304 * for the case, if this connection will not able to recover.
306 if (mtu
< dst_mtu(dst
) && ip_dont_fragment(sk
, dst
))
307 sk
->sk_err_soft
= EMSGSIZE
;
311 if (inet
->pmtudisc
!= IP_PMTUDISC_DONT
&&
312 inet_csk(sk
)->icsk_pmtu_cookie
> mtu
) {
313 tcp_sync_mss(sk
, mtu
);
315 /* Resend the TCP packet because it's
316 * clear that the old packet has been
317 * dropped. This is the new "fast" path mtu
320 tcp_simple_retransmit(sk
);
321 } /* else let the usual retransmit timer handle it */
325 * This routine is called by the ICMP module when it gets some
326 * sort of error condition. If err < 0 then the socket should
327 * be closed and the error returned to the user. If err > 0
328 * it's just the icmp type << 8 | icmp code. After adjustment
329 * header points to the first 8 bytes of the tcp header. We need
330 * to find the appropriate port.
332 * The locking strategy used here is very "optimistic". When
333 * someone else accesses the socket the ICMP is just dropped
334 * and for some paths there is no check at all.
335 * A more general error queue to queue errors for later handling
336 * is probably better.
340 void tcp_v4_err(struct sk_buff
*icmp_skb
, u32 info
)
342 struct iphdr
*iph
= (struct iphdr
*)icmp_skb
->data
;
343 struct tcphdr
*th
= (struct tcphdr
*)(icmp_skb
->data
+ (iph
->ihl
<< 2));
344 struct inet_connection_sock
*icsk
;
346 struct inet_sock
*inet
;
347 const int type
= icmp_hdr(icmp_skb
)->type
;
348 const int code
= icmp_hdr(icmp_skb
)->code
;
354 struct net
*net
= dev_net(icmp_skb
->dev
);
356 if (icmp_skb
->len
< (iph
->ihl
<< 2) + 8) {
357 ICMP_INC_STATS_BH(net
, ICMP_MIB_INERRORS
);
361 sk
= inet_lookup(net
, &tcp_hashinfo
, iph
->daddr
, th
->dest
,
362 iph
->saddr
, th
->source
, inet_iif(icmp_skb
));
364 ICMP_INC_STATS_BH(net
, ICMP_MIB_INERRORS
);
367 if (sk
->sk_state
== TCP_TIME_WAIT
) {
368 inet_twsk_put(inet_twsk(sk
));
373 /* If too many ICMPs get dropped on busy
374 * servers this needs to be solved differently.
376 if (sock_owned_by_user(sk
))
377 NET_INC_STATS_BH(net
, LINUX_MIB_LOCKDROPPEDICMPS
);
379 if (sk
->sk_state
== TCP_CLOSE
)
382 if (unlikely(iph
->ttl
< inet_sk(sk
)->min_ttl
)) {
383 NET_INC_STATS_BH(net
, LINUX_MIB_TCPMINTTLDROP
);
389 seq
= ntohl(th
->seq
);
390 if (sk
->sk_state
!= TCP_LISTEN
&&
391 !between(seq
, tp
->snd_una
, tp
->snd_nxt
)) {
392 NET_INC_STATS_BH(net
, LINUX_MIB_OUTOFWINDOWICMPS
);
397 case ICMP_SOURCE_QUENCH
:
398 /* Just silently ignore these. */
400 case ICMP_PARAMETERPROB
:
403 case ICMP_DEST_UNREACH
:
404 if (code
> NR_ICMP_UNREACH
)
407 if (code
== ICMP_FRAG_NEEDED
) { /* PMTU discovery (RFC1191) */
408 if (!sock_owned_by_user(sk
))
409 do_pmtu_discovery(sk
, iph
, info
);
413 err
= icmp_err_convert
[code
].errno
;
414 /* check if icmp_skb allows revert of backoff
415 * (see draft-zimmermann-tcp-lcd) */
416 if (code
!= ICMP_NET_UNREACH
&& code
!= ICMP_HOST_UNREACH
)
418 if (seq
!= tp
->snd_una
|| !icsk
->icsk_retransmits
||
422 if (sock_owned_by_user(sk
))
425 icsk
->icsk_backoff
--;
426 inet_csk(sk
)->icsk_rto
= __tcp_set_rto(tp
) <<
430 skb
= tcp_write_queue_head(sk
);
433 remaining
= icsk
->icsk_rto
- min(icsk
->icsk_rto
,
434 tcp_time_stamp
- TCP_SKB_CB(skb
)->when
);
437 inet_csk_reset_xmit_timer(sk
, ICSK_TIME_RETRANS
,
438 remaining
, TCP_RTO_MAX
);
440 /* RTO revert clocked out retransmission.
441 * Will retransmit now */
442 tcp_retransmit_timer(sk
);
446 case ICMP_TIME_EXCEEDED
:
453 switch (sk
->sk_state
) {
454 struct request_sock
*req
, **prev
;
456 if (sock_owned_by_user(sk
))
459 req
= inet_csk_search_req(sk
, &prev
, th
->dest
,
460 iph
->daddr
, iph
->saddr
);
464 /* ICMPs are not backlogged, hence we cannot get
465 an established socket here.
469 if (seq
!= tcp_rsk(req
)->snt_isn
) {
470 NET_INC_STATS_BH(net
, LINUX_MIB_OUTOFWINDOWICMPS
);
475 * Still in SYN_RECV, just remove it silently.
476 * There is no good way to pass the error to the newly
477 * created socket, and POSIX does not want network
478 * errors returned from accept().
480 inet_csk_reqsk_queue_drop(sk
, req
, prev
);
484 case TCP_SYN_RECV
: /* Cannot happen.
485 It can f.e. if SYNs crossed.
487 if (!sock_owned_by_user(sk
)) {
490 sk
->sk_error_report(sk
);
494 sk
->sk_err_soft
= err
;
499 /* If we've already connected we will keep trying
500 * until we time out, or the user gives up.
502 * rfc1122 4.2.3.9 allows to consider as hard errors
503 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
504 * but it is obsoleted by pmtu discovery).
506 * Note, that in modern internet, where routing is unreliable
507 * and in each dark corner broken firewalls sit, sending random
508 * errors ordered by their masters, even these two messages finally lose
509 * their original sense (even Linux sends invalid PORT_UNREACHs)
511 * Now we are in compliance with RFCs.
516 if (!sock_owned_by_user(sk
) && inet
->recverr
) {
518 sk
->sk_error_report(sk
);
519 } else { /* Only an error on timeout */
520 sk
->sk_err_soft
= err
;
528 static void __tcp_v4_send_check(struct sk_buff
*skb
,
529 __be32 saddr
, __be32 daddr
)
531 struct tcphdr
*th
= tcp_hdr(skb
);
533 if (skb
->ip_summed
== CHECKSUM_PARTIAL
) {
534 th
->check
= ~tcp_v4_check(skb
->len
, saddr
, daddr
, 0);
535 skb
->csum_start
= skb_transport_header(skb
) - skb
->head
;
536 skb
->csum_offset
= offsetof(struct tcphdr
, check
);
538 th
->check
= tcp_v4_check(skb
->len
, saddr
, daddr
,
545 /* This routine computes an IPv4 TCP checksum. */
546 void tcp_v4_send_check(struct sock
*sk
, struct sk_buff
*skb
)
548 struct inet_sock
*inet
= inet_sk(sk
);
550 __tcp_v4_send_check(skb
, inet
->inet_saddr
, inet
->inet_daddr
);
552 EXPORT_SYMBOL(tcp_v4_send_check
);
554 int tcp_v4_gso_send_check(struct sk_buff
*skb
)
556 const struct iphdr
*iph
;
559 if (!pskb_may_pull(skb
, sizeof(*th
)))
566 skb
->ip_summed
= CHECKSUM_PARTIAL
;
567 __tcp_v4_send_check(skb
, iph
->saddr
, iph
->daddr
);
572 * This routine will send an RST to the other tcp.
574 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
576 * Answer: if a packet caused RST, it is not for a socket
577 * existing in our system, if it is matched to a socket,
578 * it is just duplicate segment or bug in other side's TCP.
579 * So that we build reply only basing on parameters
580 * arrived with segment.
581 * Exception: precedence violation. We do not implement it in any case.
584 static void tcp_v4_send_reset(struct sock
*sk
, struct sk_buff
*skb
)
586 struct tcphdr
*th
= tcp_hdr(skb
);
589 #ifdef CONFIG_TCP_MD5SIG
590 __be32 opt
[(TCPOLEN_MD5SIG_ALIGNED
>> 2)];
593 struct ip_reply_arg arg
;
594 #ifdef CONFIG_TCP_MD5SIG
595 struct tcp_md5sig_key
*key
;
599 /* Never send a reset in response to a reset. */
603 if (skb_rtable(skb
)->rt_type
!= RTN_LOCAL
)
606 /* Swap the send and the receive. */
607 memset(&rep
, 0, sizeof(rep
));
608 rep
.th
.dest
= th
->source
;
609 rep
.th
.source
= th
->dest
;
610 rep
.th
.doff
= sizeof(struct tcphdr
) / 4;
614 rep
.th
.seq
= th
->ack_seq
;
617 rep
.th
.ack_seq
= htonl(ntohl(th
->seq
) + th
->syn
+ th
->fin
+
618 skb
->len
- (th
->doff
<< 2));
621 memset(&arg
, 0, sizeof(arg
));
622 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
623 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
625 #ifdef CONFIG_TCP_MD5SIG
626 key
= sk
? tcp_v4_md5_do_lookup(sk
, ip_hdr(skb
)->daddr
) : NULL
;
628 rep
.opt
[0] = htonl((TCPOPT_NOP
<< 24) |
630 (TCPOPT_MD5SIG
<< 8) |
632 /* Update length and the length the header thinks exists */
633 arg
.iov
[0].iov_len
+= TCPOLEN_MD5SIG_ALIGNED
;
634 rep
.th
.doff
= arg
.iov
[0].iov_len
/ 4;
636 tcp_v4_md5_hash_hdr((__u8
*) &rep
.opt
[1],
637 key
, ip_hdr(skb
)->saddr
,
638 ip_hdr(skb
)->daddr
, &rep
.th
);
641 arg
.csum
= csum_tcpudp_nofold(ip_hdr(skb
)->daddr
,
643 arg
.iov
[0].iov_len
, IPPROTO_TCP
, 0);
644 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
645 arg
.flags
= (sk
&& inet_sk(sk
)->transparent
) ? IP_REPLY_ARG_NOSRCCHECK
: 0;
647 net
= dev_net(skb_dst(skb
)->dev
);
648 ip_send_reply(net
->ipv4
.tcp_sock
, skb
,
649 &arg
, arg
.iov
[0].iov_len
);
651 TCP_INC_STATS_BH(net
, TCP_MIB_OUTSEGS
);
652 TCP_INC_STATS_BH(net
, TCP_MIB_OUTRSTS
);
655 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
656 outside socket context is ugly, certainly. What can I do?
659 static void tcp_v4_send_ack(struct sk_buff
*skb
, u32 seq
, u32 ack
,
660 u32 win
, u32 ts
, int oif
,
661 struct tcp_md5sig_key
*key
,
664 struct tcphdr
*th
= tcp_hdr(skb
);
667 __be32 opt
[(TCPOLEN_TSTAMP_ALIGNED
>> 2)
668 #ifdef CONFIG_TCP_MD5SIG
669 + (TCPOLEN_MD5SIG_ALIGNED
>> 2)
673 struct ip_reply_arg arg
;
674 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
676 memset(&rep
.th
, 0, sizeof(struct tcphdr
));
677 memset(&arg
, 0, sizeof(arg
));
679 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
680 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
682 rep
.opt
[0] = htonl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16) |
683 (TCPOPT_TIMESTAMP
<< 8) |
685 rep
.opt
[1] = htonl(tcp_time_stamp
);
686 rep
.opt
[2] = htonl(ts
);
687 arg
.iov
[0].iov_len
+= TCPOLEN_TSTAMP_ALIGNED
;
690 /* Swap the send and the receive. */
691 rep
.th
.dest
= th
->source
;
692 rep
.th
.source
= th
->dest
;
693 rep
.th
.doff
= arg
.iov
[0].iov_len
/ 4;
694 rep
.th
.seq
= htonl(seq
);
695 rep
.th
.ack_seq
= htonl(ack
);
697 rep
.th
.window
= htons(win
);
699 #ifdef CONFIG_TCP_MD5SIG
701 int offset
= (ts
) ? 3 : 0;
703 rep
.opt
[offset
++] = htonl((TCPOPT_NOP
<< 24) |
705 (TCPOPT_MD5SIG
<< 8) |
707 arg
.iov
[0].iov_len
+= TCPOLEN_MD5SIG_ALIGNED
;
708 rep
.th
.doff
= arg
.iov
[0].iov_len
/4;
710 tcp_v4_md5_hash_hdr((__u8
*) &rep
.opt
[offset
],
711 key
, ip_hdr(skb
)->saddr
,
712 ip_hdr(skb
)->daddr
, &rep
.th
);
715 arg
.flags
= reply_flags
;
716 arg
.csum
= csum_tcpudp_nofold(ip_hdr(skb
)->daddr
,
718 arg
.iov
[0].iov_len
, IPPROTO_TCP
, 0);
719 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
721 arg
.bound_dev_if
= oif
;
723 ip_send_reply(net
->ipv4
.tcp_sock
, skb
,
724 &arg
, arg
.iov
[0].iov_len
);
726 TCP_INC_STATS_BH(net
, TCP_MIB_OUTSEGS
);
729 static void tcp_v4_timewait_ack(struct sock
*sk
, struct sk_buff
*skb
)
731 struct inet_timewait_sock
*tw
= inet_twsk(sk
);
732 struct tcp_timewait_sock
*tcptw
= tcp_twsk(sk
);
734 tcp_v4_send_ack(skb
, tcptw
->tw_snd_nxt
, tcptw
->tw_rcv_nxt
,
735 tcptw
->tw_rcv_wnd
>> tw
->tw_rcv_wscale
,
738 tcp_twsk_md5_key(tcptw
),
739 tw
->tw_transparent
? IP_REPLY_ARG_NOSRCCHECK
: 0
745 static void tcp_v4_reqsk_send_ack(struct sock
*sk
, struct sk_buff
*skb
,
746 struct request_sock
*req
)
748 tcp_v4_send_ack(skb
, tcp_rsk(req
)->snt_isn
+ 1,
749 tcp_rsk(req
)->rcv_isn
+ 1, req
->rcv_wnd
,
752 tcp_v4_md5_do_lookup(sk
, ip_hdr(skb
)->daddr
),
753 inet_rsk(req
)->no_srccheck
? IP_REPLY_ARG_NOSRCCHECK
: 0);
757 * Send a SYN-ACK after having received a SYN.
758 * This still operates on a request_sock only, not on a big
761 static int tcp_v4_send_synack(struct sock
*sk
, struct dst_entry
*dst
,
762 struct request_sock
*req
,
763 struct request_values
*rvp
)
765 const struct inet_request_sock
*ireq
= inet_rsk(req
);
767 struct sk_buff
* skb
;
769 /* First, grab a route. */
770 if (!dst
&& (dst
= inet_csk_route_req(sk
, req
)) == NULL
)
773 skb
= tcp_make_synack(sk
, dst
, req
, rvp
);
776 __tcp_v4_send_check(skb
, ireq
->loc_addr
, ireq
->rmt_addr
);
778 err
= ip_build_and_send_pkt(skb
, sk
, ireq
->loc_addr
,
781 err
= net_xmit_eval(err
);
788 static int tcp_v4_rtx_synack(struct sock
*sk
, struct request_sock
*req
,
789 struct request_values
*rvp
)
791 TCP_INC_STATS_BH(sock_net(sk
), TCP_MIB_RETRANSSEGS
);
792 return tcp_v4_send_synack(sk
, NULL
, req
, rvp
);
796 * IPv4 request_sock destructor.
798 static void tcp_v4_reqsk_destructor(struct request_sock
*req
)
800 kfree(inet_rsk(req
)->opt
);
803 static void syn_flood_warning(const struct sk_buff
*skb
)
807 #ifdef CONFIG_SYN_COOKIES
808 if (sysctl_tcp_syncookies
)
809 msg
= "Sending cookies";
812 msg
= "Dropping request";
814 pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
815 ntohs(tcp_hdr(skb
)->dest
), msg
);
819 * Save and compile IPv4 options into the request_sock if needed.
821 static struct ip_options
*tcp_v4_save_options(struct sock
*sk
,
824 struct ip_options
*opt
= &(IPCB(skb
)->opt
);
825 struct ip_options
*dopt
= NULL
;
827 if (opt
&& opt
->optlen
) {
828 int opt_size
= optlength(opt
);
829 dopt
= kmalloc(opt_size
, GFP_ATOMIC
);
831 if (ip_options_echo(dopt
, skb
)) {
840 #ifdef CONFIG_TCP_MD5SIG
842 * RFC2385 MD5 checksumming requires a mapping of
843 * IP address->MD5 Key.
844 * We need to maintain these in the sk structure.
847 /* Find the Key structure for an address. */
848 static struct tcp_md5sig_key
*
849 tcp_v4_md5_do_lookup(struct sock
*sk
, __be32 addr
)
851 struct tcp_sock
*tp
= tcp_sk(sk
);
854 if (!tp
->md5sig_info
|| !tp
->md5sig_info
->entries4
)
856 for (i
= 0; i
< tp
->md5sig_info
->entries4
; i
++) {
857 if (tp
->md5sig_info
->keys4
[i
].addr
== addr
)
858 return &tp
->md5sig_info
->keys4
[i
].base
;
863 struct tcp_md5sig_key
*tcp_v4_md5_lookup(struct sock
*sk
,
864 struct sock
*addr_sk
)
866 return tcp_v4_md5_do_lookup(sk
, inet_sk(addr_sk
)->inet_daddr
);
868 EXPORT_SYMBOL(tcp_v4_md5_lookup
);
870 static struct tcp_md5sig_key
*tcp_v4_reqsk_md5_lookup(struct sock
*sk
,
871 struct request_sock
*req
)
873 return tcp_v4_md5_do_lookup(sk
, inet_rsk(req
)->rmt_addr
);
876 /* This can be called on a newly created socket, from other files */
877 int tcp_v4_md5_do_add(struct sock
*sk
, __be32 addr
,
878 u8
*newkey
, u8 newkeylen
)
880 /* Add Key to the list */
881 struct tcp_md5sig_key
*key
;
882 struct tcp_sock
*tp
= tcp_sk(sk
);
883 struct tcp4_md5sig_key
*keys
;
885 key
= tcp_v4_md5_do_lookup(sk
, addr
);
887 /* Pre-existing entry - just update that one. */
890 key
->keylen
= newkeylen
;
892 struct tcp_md5sig_info
*md5sig
;
894 if (!tp
->md5sig_info
) {
895 tp
->md5sig_info
= kzalloc(sizeof(*tp
->md5sig_info
),
897 if (!tp
->md5sig_info
) {
901 sk_nocaps_add(sk
, NETIF_F_GSO_MASK
);
903 if (tcp_alloc_md5sig_pool(sk
) == NULL
) {
907 md5sig
= tp
->md5sig_info
;
909 if (md5sig
->alloced4
== md5sig
->entries4
) {
910 keys
= kmalloc((sizeof(*keys
) *
911 (md5sig
->entries4
+ 1)), GFP_ATOMIC
);
914 tcp_free_md5sig_pool();
918 if (md5sig
->entries4
)
919 memcpy(keys
, md5sig
->keys4
,
920 sizeof(*keys
) * md5sig
->entries4
);
922 /* Free old key list, and reference new one */
923 kfree(md5sig
->keys4
);
924 md5sig
->keys4
= keys
;
928 md5sig
->keys4
[md5sig
->entries4
- 1].addr
= addr
;
929 md5sig
->keys4
[md5sig
->entries4
- 1].base
.key
= newkey
;
930 md5sig
->keys4
[md5sig
->entries4
- 1].base
.keylen
= newkeylen
;
934 EXPORT_SYMBOL(tcp_v4_md5_do_add
);
936 static int tcp_v4_md5_add_func(struct sock
*sk
, struct sock
*addr_sk
,
937 u8
*newkey
, u8 newkeylen
)
939 return tcp_v4_md5_do_add(sk
, inet_sk(addr_sk
)->inet_daddr
,
943 int tcp_v4_md5_do_del(struct sock
*sk
, __be32 addr
)
945 struct tcp_sock
*tp
= tcp_sk(sk
);
948 for (i
= 0; i
< tp
->md5sig_info
->entries4
; i
++) {
949 if (tp
->md5sig_info
->keys4
[i
].addr
== addr
) {
951 kfree(tp
->md5sig_info
->keys4
[i
].base
.key
);
952 tp
->md5sig_info
->entries4
--;
954 if (tp
->md5sig_info
->entries4
== 0) {
955 kfree(tp
->md5sig_info
->keys4
);
956 tp
->md5sig_info
->keys4
= NULL
;
957 tp
->md5sig_info
->alloced4
= 0;
958 } else if (tp
->md5sig_info
->entries4
!= i
) {
959 /* Need to do some manipulation */
960 memmove(&tp
->md5sig_info
->keys4
[i
],
961 &tp
->md5sig_info
->keys4
[i
+1],
962 (tp
->md5sig_info
->entries4
- i
) *
963 sizeof(struct tcp4_md5sig_key
));
965 tcp_free_md5sig_pool();
971 EXPORT_SYMBOL(tcp_v4_md5_do_del
);
973 static void tcp_v4_clear_md5_list(struct sock
*sk
)
975 struct tcp_sock
*tp
= tcp_sk(sk
);
977 /* Free each key, then the set of keys,
978 * the crypto element, and then decrement our
979 * hold on the last resort crypto.
981 if (tp
->md5sig_info
->entries4
) {
983 for (i
= 0; i
< tp
->md5sig_info
->entries4
; i
++)
984 kfree(tp
->md5sig_info
->keys4
[i
].base
.key
);
985 tp
->md5sig_info
->entries4
= 0;
986 tcp_free_md5sig_pool();
988 if (tp
->md5sig_info
->keys4
) {
989 kfree(tp
->md5sig_info
->keys4
);
990 tp
->md5sig_info
->keys4
= NULL
;
991 tp
->md5sig_info
->alloced4
= 0;
995 static int tcp_v4_parse_md5_keys(struct sock
*sk
, char __user
*optval
,
998 struct tcp_md5sig cmd
;
999 struct sockaddr_in
*sin
= (struct sockaddr_in
*)&cmd
.tcpm_addr
;
1002 if (optlen
< sizeof(cmd
))
1005 if (copy_from_user(&cmd
, optval
, sizeof(cmd
)))
1008 if (sin
->sin_family
!= AF_INET
)
1011 if (!cmd
.tcpm_key
|| !cmd
.tcpm_keylen
) {
1012 if (!tcp_sk(sk
)->md5sig_info
)
1014 return tcp_v4_md5_do_del(sk
, sin
->sin_addr
.s_addr
);
1017 if (cmd
.tcpm_keylen
> TCP_MD5SIG_MAXKEYLEN
)
1020 if (!tcp_sk(sk
)->md5sig_info
) {
1021 struct tcp_sock
*tp
= tcp_sk(sk
);
1022 struct tcp_md5sig_info
*p
;
1024 p
= kzalloc(sizeof(*p
), sk
->sk_allocation
);
1028 tp
->md5sig_info
= p
;
1029 sk_nocaps_add(sk
, NETIF_F_GSO_MASK
);
1032 newkey
= kmemdup(cmd
.tcpm_key
, cmd
.tcpm_keylen
, sk
->sk_allocation
);
1035 return tcp_v4_md5_do_add(sk
, sin
->sin_addr
.s_addr
,
1036 newkey
, cmd
.tcpm_keylen
);
1039 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool
*hp
,
1040 __be32 daddr
, __be32 saddr
, int nbytes
)
1042 struct tcp4_pseudohdr
*bp
;
1043 struct scatterlist sg
;
1045 bp
= &hp
->md5_blk
.ip4
;
1048 * 1. the TCP pseudo-header (in the order: source IP address,
1049 * destination IP address, zero-padded protocol number, and
1055 bp
->protocol
= IPPROTO_TCP
;
1056 bp
->len
= cpu_to_be16(nbytes
);
1058 sg_init_one(&sg
, bp
, sizeof(*bp
));
1059 return crypto_hash_update(&hp
->md5_desc
, &sg
, sizeof(*bp
));
1062 static int tcp_v4_md5_hash_hdr(char *md5_hash
, struct tcp_md5sig_key
*key
,
1063 __be32 daddr
, __be32 saddr
, struct tcphdr
*th
)
1065 struct tcp_md5sig_pool
*hp
;
1066 struct hash_desc
*desc
;
1068 hp
= tcp_get_md5sig_pool();
1070 goto clear_hash_noput
;
1071 desc
= &hp
->md5_desc
;
1073 if (crypto_hash_init(desc
))
1075 if (tcp_v4_md5_hash_pseudoheader(hp
, daddr
, saddr
, th
->doff
<< 2))
1077 if (tcp_md5_hash_header(hp
, th
))
1079 if (tcp_md5_hash_key(hp
, key
))
1081 if (crypto_hash_final(desc
, md5_hash
))
1084 tcp_put_md5sig_pool();
1088 tcp_put_md5sig_pool();
1090 memset(md5_hash
, 0, 16);
1094 int tcp_v4_md5_hash_skb(char *md5_hash
, struct tcp_md5sig_key
*key
,
1095 struct sock
*sk
, struct request_sock
*req
,
1096 struct sk_buff
*skb
)
1098 struct tcp_md5sig_pool
*hp
;
1099 struct hash_desc
*desc
;
1100 struct tcphdr
*th
= tcp_hdr(skb
);
1101 __be32 saddr
, daddr
;
1104 saddr
= inet_sk(sk
)->inet_saddr
;
1105 daddr
= inet_sk(sk
)->inet_daddr
;
1107 saddr
= inet_rsk(req
)->loc_addr
;
1108 daddr
= inet_rsk(req
)->rmt_addr
;
1110 const struct iphdr
*iph
= ip_hdr(skb
);
1115 hp
= tcp_get_md5sig_pool();
1117 goto clear_hash_noput
;
1118 desc
= &hp
->md5_desc
;
1120 if (crypto_hash_init(desc
))
1123 if (tcp_v4_md5_hash_pseudoheader(hp
, daddr
, saddr
, skb
->len
))
1125 if (tcp_md5_hash_header(hp
, th
))
1127 if (tcp_md5_hash_skb_data(hp
, skb
, th
->doff
<< 2))
1129 if (tcp_md5_hash_key(hp
, key
))
1131 if (crypto_hash_final(desc
, md5_hash
))
1134 tcp_put_md5sig_pool();
1138 tcp_put_md5sig_pool();
1140 memset(md5_hash
, 0, 16);
1143 EXPORT_SYMBOL(tcp_v4_md5_hash_skb
);
1145 static int tcp_v4_inbound_md5_hash(struct sock
*sk
, struct sk_buff
*skb
)
1148 * This gets called for each TCP segment that arrives
1149 * so we want to be efficient.
1150 * We have 3 drop cases:
1151 * o No MD5 hash and one expected.
1152 * o MD5 hash and we're not expecting one.
1153 * o MD5 hash and its wrong.
1155 __u8
*hash_location
= NULL
;
1156 struct tcp_md5sig_key
*hash_expected
;
1157 const struct iphdr
*iph
= ip_hdr(skb
);
1158 struct tcphdr
*th
= tcp_hdr(skb
);
1160 unsigned char newhash
[16];
1162 hash_expected
= tcp_v4_md5_do_lookup(sk
, iph
->saddr
);
1163 hash_location
= tcp_parse_md5sig_option(th
);
1165 /* We've parsed the options - do we have a hash? */
1166 if (!hash_expected
&& !hash_location
)
1169 if (hash_expected
&& !hash_location
) {
1170 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_TCPMD5NOTFOUND
);
1174 if (!hash_expected
&& hash_location
) {
1175 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_TCPMD5UNEXPECTED
);
1179 /* Okay, so this is hash_expected and hash_location -
1180 * so we need to calculate the checksum.
1182 genhash
= tcp_v4_md5_hash_skb(newhash
,
1186 if (genhash
|| memcmp(hash_location
, newhash
, 16) != 0) {
1187 if (net_ratelimit()) {
1188 printk(KERN_INFO
"MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1189 &iph
->saddr
, ntohs(th
->source
),
1190 &iph
->daddr
, ntohs(th
->dest
),
1191 genhash
? " tcp_v4_calc_md5_hash failed" : "");
1200 struct request_sock_ops tcp_request_sock_ops __read_mostly
= {
1202 .obj_size
= sizeof(struct tcp_request_sock
),
1203 .rtx_syn_ack
= tcp_v4_rtx_synack
,
1204 .send_ack
= tcp_v4_reqsk_send_ack
,
1205 .destructor
= tcp_v4_reqsk_destructor
,
1206 .send_reset
= tcp_v4_send_reset
,
1207 .syn_ack_timeout
= tcp_syn_ack_timeout
,
1210 #ifdef CONFIG_TCP_MD5SIG
1211 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops
= {
1212 .md5_lookup
= tcp_v4_reqsk_md5_lookup
,
1213 .calc_md5_hash
= tcp_v4_md5_hash_skb
,
1217 static struct timewait_sock_ops tcp_timewait_sock_ops
= {
1218 .twsk_obj_size
= sizeof(struct tcp_timewait_sock
),
1219 .twsk_unique
= tcp_twsk_unique
,
1220 .twsk_destructor
= tcp_twsk_destructor
,
1223 int tcp_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
1225 struct tcp_extend_values tmp_ext
;
1226 struct tcp_options_received tmp_opt
;
1228 struct request_sock
*req
;
1229 struct inet_request_sock
*ireq
;
1230 struct tcp_sock
*tp
= tcp_sk(sk
);
1231 struct dst_entry
*dst
= NULL
;
1232 __be32 saddr
= ip_hdr(skb
)->saddr
;
1233 __be32 daddr
= ip_hdr(skb
)->daddr
;
1234 __u32 isn
= TCP_SKB_CB(skb
)->when
;
1235 #ifdef CONFIG_SYN_COOKIES
1236 int want_cookie
= 0;
1238 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1241 /* Never answer SYNs sent to broadcast or multicast */
1242 if (skb_rtable(skb
)->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
))
1245 /* TW buckets are converted to open requests without
1246 * limitations, they conserve resources and peer is
1247 * evidently real one.
1249 if (inet_csk_reqsk_queue_is_full(sk
) && !isn
) {
1250 if (net_ratelimit())
1251 syn_flood_warning(skb
);
1252 #ifdef CONFIG_SYN_COOKIES
1253 if (sysctl_tcp_syncookies
) {
1260 /* Accept backlog is full. If we have already queued enough
1261 * of warm entries in syn queue, drop request. It is better than
1262 * clogging syn queue with openreqs with exponentially increasing
1265 if (sk_acceptq_is_full(sk
) && inet_csk_reqsk_queue_young(sk
) > 1)
1268 req
= inet_reqsk_alloc(&tcp_request_sock_ops
);
1272 #ifdef CONFIG_TCP_MD5SIG
1273 tcp_rsk(req
)->af_specific
= &tcp_request_sock_ipv4_ops
;
1276 tcp_clear_options(&tmp_opt
);
1277 tmp_opt
.mss_clamp
= TCP_MSS_DEFAULT
;
1278 tmp_opt
.user_mss
= tp
->rx_opt
.user_mss
;
1279 tcp_parse_options(skb
, &tmp_opt
, &hash_location
, 0);
1281 if (tmp_opt
.cookie_plus
> 0 &&
1282 tmp_opt
.saw_tstamp
&&
1283 !tp
->rx_opt
.cookie_out_never
&&
1284 (sysctl_tcp_cookie_size
> 0 ||
1285 (tp
->cookie_values
!= NULL
&&
1286 tp
->cookie_values
->cookie_desired
> 0))) {
1288 u32
*mess
= &tmp_ext
.cookie_bakery
[COOKIE_DIGEST_WORDS
];
1289 int l
= tmp_opt
.cookie_plus
- TCPOLEN_COOKIE_BASE
;
1291 if (tcp_cookie_generator(&tmp_ext
.cookie_bakery
[0]) != 0)
1292 goto drop_and_release
;
1294 /* Secret recipe starts with IP addresses */
1295 *mess
++ ^= (__force u32
)daddr
;
1296 *mess
++ ^= (__force u32
)saddr
;
1298 /* plus variable length Initiator Cookie */
1301 *c
++ ^= *hash_location
++;
1303 #ifdef CONFIG_SYN_COOKIES
1304 want_cookie
= 0; /* not our kind of cookie */
1306 tmp_ext
.cookie_out_never
= 0; /* false */
1307 tmp_ext
.cookie_plus
= tmp_opt
.cookie_plus
;
1308 } else if (!tp
->rx_opt
.cookie_in_always
) {
1309 /* redundant indications, but ensure initialization. */
1310 tmp_ext
.cookie_out_never
= 1; /* true */
1311 tmp_ext
.cookie_plus
= 0;
1313 goto drop_and_release
;
1315 tmp_ext
.cookie_in_always
= tp
->rx_opt
.cookie_in_always
;
1317 if (want_cookie
&& !tmp_opt
.saw_tstamp
)
1318 tcp_clear_options(&tmp_opt
);
1320 tmp_opt
.tstamp_ok
= tmp_opt
.saw_tstamp
;
1321 tcp_openreq_init(req
, &tmp_opt
, skb
);
1323 ireq
= inet_rsk(req
);
1324 ireq
->loc_addr
= daddr
;
1325 ireq
->rmt_addr
= saddr
;
1326 ireq
->no_srccheck
= inet_sk(sk
)->transparent
;
1327 ireq
->opt
= tcp_v4_save_options(sk
, skb
);
1329 if (security_inet_conn_request(sk
, skb
, req
))
1332 if (!want_cookie
|| tmp_opt
.tstamp_ok
)
1333 TCP_ECN_create_request(req
, tcp_hdr(skb
));
1336 isn
= cookie_v4_init_sequence(sk
, skb
, &req
->mss
);
1337 req
->cookie_ts
= tmp_opt
.tstamp_ok
;
1339 struct inet_peer
*peer
= NULL
;
1341 /* VJ's idea. We save last timestamp seen
1342 * from the destination in peer table, when entering
1343 * state TIME-WAIT, and check against it before
1344 * accepting new connection request.
1346 * If "isn" is not zero, this request hit alive
1347 * timewait bucket, so that all the necessary checks
1348 * are made in the function processing timewait state.
1350 if (tmp_opt
.saw_tstamp
&&
1351 tcp_death_row
.sysctl_tw_recycle
&&
1352 (dst
= inet_csk_route_req(sk
, req
)) != NULL
&&
1353 (peer
= rt_get_peer((struct rtable
*)dst
)) != NULL
&&
1354 peer
->v4daddr
== saddr
) {
1355 inet_peer_refcheck(peer
);
1356 if ((u32
)get_seconds() - peer
->tcp_ts_stamp
< TCP_PAWS_MSL
&&
1357 (s32
)(peer
->tcp_ts
- req
->ts_recent
) >
1359 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_PAWSPASSIVEREJECTED
);
1360 goto drop_and_release
;
1363 /* Kill the following clause, if you dislike this way. */
1364 else if (!sysctl_tcp_syncookies
&&
1365 (sysctl_max_syn_backlog
- inet_csk_reqsk_queue_len(sk
) <
1366 (sysctl_max_syn_backlog
>> 2)) &&
1367 (!peer
|| !peer
->tcp_ts_stamp
) &&
1368 (!dst
|| !dst_metric(dst
, RTAX_RTT
))) {
1369 /* Without syncookies last quarter of
1370 * backlog is filled with destinations,
1371 * proven to be alive.
1372 * It means that we continue to communicate
1373 * to destinations, already remembered
1374 * to the moment of synflood.
1376 LIMIT_NETDEBUG(KERN_DEBUG
"TCP: drop open request from %pI4/%u\n",
1377 &saddr
, ntohs(tcp_hdr(skb
)->source
));
1378 goto drop_and_release
;
1381 isn
= tcp_v4_init_sequence(skb
);
1383 tcp_rsk(req
)->snt_isn
= isn
;
1385 if (tcp_v4_send_synack(sk
, dst
, req
,
1386 (struct request_values
*)&tmp_ext
) ||
1390 inet_csk_reqsk_queue_hash_add(sk
, req
, TCP_TIMEOUT_INIT
);
1400 EXPORT_SYMBOL(tcp_v4_conn_request
);
1404 * The three way handshake has completed - we got a valid synack -
1405 * now create the new socket.
1407 struct sock
*tcp_v4_syn_recv_sock(struct sock
*sk
, struct sk_buff
*skb
,
1408 struct request_sock
*req
,
1409 struct dst_entry
*dst
)
1411 struct inet_request_sock
*ireq
;
1412 struct inet_sock
*newinet
;
1413 struct tcp_sock
*newtp
;
1415 #ifdef CONFIG_TCP_MD5SIG
1416 struct tcp_md5sig_key
*key
;
1419 if (sk_acceptq_is_full(sk
))
1422 if (!dst
&& (dst
= inet_csk_route_req(sk
, req
)) == NULL
)
1425 newsk
= tcp_create_openreq_child(sk
, req
, skb
);
1429 newsk
->sk_gso_type
= SKB_GSO_TCPV4
;
1430 sk_setup_caps(newsk
, dst
);
1432 newtp
= tcp_sk(newsk
);
1433 newinet
= inet_sk(newsk
);
1434 ireq
= inet_rsk(req
);
1435 newinet
->inet_daddr
= ireq
->rmt_addr
;
1436 newinet
->inet_rcv_saddr
= ireq
->loc_addr
;
1437 newinet
->inet_saddr
= ireq
->loc_addr
;
1438 newinet
->opt
= ireq
->opt
;
1440 newinet
->mc_index
= inet_iif(skb
);
1441 newinet
->mc_ttl
= ip_hdr(skb
)->ttl
;
1442 inet_csk(newsk
)->icsk_ext_hdr_len
= 0;
1444 inet_csk(newsk
)->icsk_ext_hdr_len
= newinet
->opt
->optlen
;
1445 newinet
->inet_id
= newtp
->write_seq
^ jiffies
;
1447 tcp_mtup_init(newsk
);
1448 tcp_sync_mss(newsk
, dst_mtu(dst
));
1449 newtp
->advmss
= dst_metric(dst
, RTAX_ADVMSS
);
1450 if (tcp_sk(sk
)->rx_opt
.user_mss
&&
1451 tcp_sk(sk
)->rx_opt
.user_mss
< newtp
->advmss
)
1452 newtp
->advmss
= tcp_sk(sk
)->rx_opt
.user_mss
;
1454 tcp_initialize_rcv_mss(newsk
);
1456 #ifdef CONFIG_TCP_MD5SIG
1457 /* Copy over the MD5 key from the original socket */
1458 key
= tcp_v4_md5_do_lookup(sk
, newinet
->inet_daddr
);
1461 * We're using one, so create a matching key
1462 * on the newsk structure. If we fail to get
1463 * memory, then we end up not copying the key
1466 char *newkey
= kmemdup(key
->key
, key
->keylen
, GFP_ATOMIC
);
1468 tcp_v4_md5_do_add(newsk
, newinet
->inet_daddr
,
1469 newkey
, key
->keylen
);
1470 sk_nocaps_add(newsk
, NETIF_F_GSO_MASK
);
1474 __inet_hash_nolisten(newsk
, NULL
);
1475 __inet_inherit_port(sk
, newsk
);
1480 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_LISTENOVERFLOWS
);
1482 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_LISTENDROPS
);
1486 EXPORT_SYMBOL(tcp_v4_syn_recv_sock
);
1488 static struct sock
*tcp_v4_hnd_req(struct sock
*sk
, struct sk_buff
*skb
)
1490 struct tcphdr
*th
= tcp_hdr(skb
);
1491 const struct iphdr
*iph
= ip_hdr(skb
);
1493 struct request_sock
**prev
;
1494 /* Find possible connection requests. */
1495 struct request_sock
*req
= inet_csk_search_req(sk
, &prev
, th
->source
,
1496 iph
->saddr
, iph
->daddr
);
1498 return tcp_check_req(sk
, skb
, req
, prev
);
1500 nsk
= inet_lookup_established(sock_net(sk
), &tcp_hashinfo
, iph
->saddr
,
1501 th
->source
, iph
->daddr
, th
->dest
, inet_iif(skb
));
1504 if (nsk
->sk_state
!= TCP_TIME_WAIT
) {
1508 inet_twsk_put(inet_twsk(nsk
));
1512 #ifdef CONFIG_SYN_COOKIES
1514 sk
= cookie_v4_check(sk
, skb
, &(IPCB(skb
)->opt
));
1519 static __sum16
tcp_v4_checksum_init(struct sk_buff
*skb
)
1521 const struct iphdr
*iph
= ip_hdr(skb
);
1523 if (skb
->ip_summed
== CHECKSUM_COMPLETE
) {
1524 if (!tcp_v4_check(skb
->len
, iph
->saddr
,
1525 iph
->daddr
, skb
->csum
)) {
1526 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
1531 skb
->csum
= csum_tcpudp_nofold(iph
->saddr
, iph
->daddr
,
1532 skb
->len
, IPPROTO_TCP
, 0);
1534 if (skb
->len
<= 76) {
1535 return __skb_checksum_complete(skb
);
1541 /* The socket must have it's spinlock held when we get
1544 * We have a potential double-lock case here, so even when
1545 * doing backlog processing we use the BH locking scheme.
1546 * This is because we cannot sleep with the original spinlock
1549 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1552 #ifdef CONFIG_TCP_MD5SIG
1554 * We really want to reject the packet as early as possible
1556 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1557 * o There is an MD5 option and we're not expecting one
1559 if (tcp_v4_inbound_md5_hash(sk
, skb
))
1563 if (sk
->sk_state
== TCP_ESTABLISHED
) { /* Fast path */
1564 sock_rps_save_rxhash(sk
, skb
->rxhash
);
1565 TCP_CHECK_TIMER(sk
);
1566 if (tcp_rcv_established(sk
, skb
, tcp_hdr(skb
), skb
->len
)) {
1570 TCP_CHECK_TIMER(sk
);
1574 if (skb
->len
< tcp_hdrlen(skb
) || tcp_checksum_complete(skb
))
1577 if (sk
->sk_state
== TCP_LISTEN
) {
1578 struct sock
*nsk
= tcp_v4_hnd_req(sk
, skb
);
1583 if (tcp_child_process(sk
, nsk
, skb
)) {
1590 sock_rps_save_rxhash(sk
, skb
->rxhash
);
1593 TCP_CHECK_TIMER(sk
);
1594 if (tcp_rcv_state_process(sk
, skb
, tcp_hdr(skb
), skb
->len
)) {
1598 TCP_CHECK_TIMER(sk
);
1602 tcp_v4_send_reset(rsk
, skb
);
1605 /* Be careful here. If this function gets more complicated and
1606 * gcc suffers from register pressure on the x86, sk (in %ebx)
1607 * might be destroyed here. This current version compiles correctly,
1608 * but you have been warned.
1613 TCP_INC_STATS_BH(sock_net(sk
), TCP_MIB_INERRS
);
1616 EXPORT_SYMBOL(tcp_v4_do_rcv
);
1622 int BCMFASTPATH_HOST
tcp_v4_rcv(struct sk_buff
*skb
)
1624 const struct iphdr
*iph
;
1628 struct net
*net
= dev_net(skb
->dev
);
1630 if (skb
->pkt_type
!= PACKET_HOST
)
1633 /* Count it even if it's bad */
1634 TCP_INC_STATS_BH(net
, TCP_MIB_INSEGS
);
1636 if (!pskb_may_pull(skb
, sizeof(struct tcphdr
)))
1641 if (th
->doff
< sizeof(struct tcphdr
) / 4)
1643 if (!pskb_may_pull(skb
, th
->doff
* 4))
1646 /* An explanation is required here, I think.
1647 * Packet length and doff are validated by header prediction,
1648 * provided case of th->doff==0 is eliminated.
1649 * So, we defer the checks. */
1650 if (!skb_csum_unnecessary(skb
) && tcp_v4_checksum_init(skb
))
1655 TCP_SKB_CB(skb
)->seq
= ntohl(th
->seq
);
1656 TCP_SKB_CB(skb
)->end_seq
= (TCP_SKB_CB(skb
)->seq
+ th
->syn
+ th
->fin
+
1657 skb
->len
- th
->doff
* 4);
1658 TCP_SKB_CB(skb
)->ack_seq
= ntohl(th
->ack_seq
);
1659 TCP_SKB_CB(skb
)->when
= 0;
1660 TCP_SKB_CB(skb
)->flags
= iph
->tos
;
1661 TCP_SKB_CB(skb
)->sacked
= 0;
1663 sk
= __inet_lookup_skb(&tcp_hashinfo
, skb
, th
->source
, th
->dest
);
1668 if (sk
->sk_state
== TCP_TIME_WAIT
)
1671 if (unlikely(iph
->ttl
< inet_sk(sk
)->min_ttl
)) {
1672 NET_INC_STATS_BH(net
, LINUX_MIB_TCPMINTTLDROP
);
1673 goto discard_and_relse
;
1676 if (!xfrm4_policy_check(sk
, XFRM_POLICY_IN
, skb
))
1677 goto discard_and_relse
;
1680 if (sk_filter(sk
, skb
))
1681 goto discard_and_relse
;
1685 bh_lock_sock_nested(sk
);
1687 if (!sock_owned_by_user(sk
)) {
1688 #ifdef CONFIG_NET_DMA
1689 struct tcp_sock
*tp
= tcp_sk(sk
);
1690 if (!tp
->ucopy
.dma_chan
&& tp
->ucopy
.pinned_list
)
1691 tp
->ucopy
.dma_chan
= dma_find_channel(DMA_MEMCPY
);
1692 if (tp
->ucopy
.dma_chan
)
1693 ret
= tcp_v4_do_rcv(sk
, skb
);
1697 if (!tcp_prequeue(sk
, skb
))
1698 ret
= tcp_v4_do_rcv(sk
, skb
);
1700 } else if (unlikely(sk_add_backlog(sk
, skb
))) {
1702 NET_INC_STATS_BH(net
, LINUX_MIB_TCPBACKLOGDROP
);
1703 goto discard_and_relse
;
1712 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
))
1715 if (skb
->len
< (th
->doff
<< 2) || tcp_checksum_complete(skb
)) {
1717 TCP_INC_STATS_BH(net
, TCP_MIB_INERRS
);
1719 tcp_v4_send_reset(NULL
, skb
);
1723 /* Discard frame. */
1732 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
)) {
1733 inet_twsk_put(inet_twsk(sk
));
1737 if (skb
->len
< (th
->doff
<< 2) || tcp_checksum_complete(skb
)) {
1738 TCP_INC_STATS_BH(net
, TCP_MIB_INERRS
);
1739 inet_twsk_put(inet_twsk(sk
));
1742 switch (tcp_timewait_state_process(inet_twsk(sk
), skb
, th
)) {
1744 struct sock
*sk2
= inet_lookup_listener(dev_net(skb
->dev
),
1746 iph
->daddr
, th
->dest
,
1749 inet_twsk_deschedule(inet_twsk(sk
), &tcp_death_row
);
1750 inet_twsk_put(inet_twsk(sk
));
1754 /* Fall through to ACK */
1757 tcp_v4_timewait_ack(sk
, skb
);
1761 case TCP_TW_SUCCESS
:;
1766 /* VJ's idea. Save last timestamp seen from this destination
1767 * and hold it at least for normal timewait interval to use for duplicate
1768 * segment detection in subsequent connections, before they enter synchronized
1772 int tcp_v4_remember_stamp(struct sock
*sk
)
1774 struct inet_sock
*inet
= inet_sk(sk
);
1775 struct tcp_sock
*tp
= tcp_sk(sk
);
1776 struct rtable
*rt
= (struct rtable
*)__sk_dst_get(sk
);
1777 struct inet_peer
*peer
= NULL
;
1780 if (!rt
|| rt
->rt_dst
!= inet
->inet_daddr
) {
1781 peer
= inet_getpeer(inet
->inet_daddr
, 1);
1785 rt_bind_peer(rt
, 1);
1790 if ((s32
)(peer
->tcp_ts
- tp
->rx_opt
.ts_recent
) <= 0 ||
1791 ((u32
)get_seconds() - peer
->tcp_ts_stamp
> TCP_PAWS_MSL
&&
1792 peer
->tcp_ts_stamp
<= (u32
)tp
->rx_opt
.ts_recent_stamp
)) {
1793 peer
->tcp_ts_stamp
= (u32
)tp
->rx_opt
.ts_recent_stamp
;
1794 peer
->tcp_ts
= tp
->rx_opt
.ts_recent
;
1803 EXPORT_SYMBOL(tcp_v4_remember_stamp
);
1805 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock
*tw
)
1807 struct inet_peer
*peer
= inet_getpeer(tw
->tw_daddr
, 1);
1810 const struct tcp_timewait_sock
*tcptw
= tcp_twsk((struct sock
*)tw
);
1812 if ((s32
)(peer
->tcp_ts
- tcptw
->tw_ts_recent
) <= 0 ||
1813 ((u32
)get_seconds() - peer
->tcp_ts_stamp
> TCP_PAWS_MSL
&&
1814 peer
->tcp_ts_stamp
<= (u32
)tcptw
->tw_ts_recent_stamp
)) {
1815 peer
->tcp_ts_stamp
= (u32
)tcptw
->tw_ts_recent_stamp
;
1816 peer
->tcp_ts
= tcptw
->tw_ts_recent
;
1825 const struct inet_connection_sock_af_ops ipv4_specific
= {
1826 .queue_xmit
= ip_queue_xmit
,
1827 .send_check
= tcp_v4_send_check
,
1828 .rebuild_header
= inet_sk_rebuild_header
,
1829 .conn_request
= tcp_v4_conn_request
,
1830 .syn_recv_sock
= tcp_v4_syn_recv_sock
,
1831 .remember_stamp
= tcp_v4_remember_stamp
,
1832 .net_header_len
= sizeof(struct iphdr
),
1833 .setsockopt
= ip_setsockopt
,
1834 .getsockopt
= ip_getsockopt
,
1835 .addr2sockaddr
= inet_csk_addr2sockaddr
,
1836 .sockaddr_len
= sizeof(struct sockaddr_in
),
1837 .bind_conflict
= inet_csk_bind_conflict
,
1838 #ifdef CONFIG_COMPAT
1839 .compat_setsockopt
= compat_ip_setsockopt
,
1840 .compat_getsockopt
= compat_ip_getsockopt
,
1843 EXPORT_SYMBOL(ipv4_specific
);
#ifdef CONFIG_TCP_MD5SIG
/* Per-socket MD5 signature (RFC 2385) operations for IPv4 TCP. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
1854 /* NOTE: A lot of things set to zero explicitly by call to
1855 * sk_alloc() so need not be done here.
1857 static int tcp_v4_init_sock(struct sock
*sk
)
1859 struct inet_connection_sock
*icsk
= inet_csk(sk
);
1860 struct tcp_sock
*tp
= tcp_sk(sk
);
1862 skb_queue_head_init(&tp
->out_of_order_queue
);
1863 tcp_init_xmit_timers(sk
);
1864 tcp_prequeue_init(tp
);
1866 icsk
->icsk_rto
= TCP_TIMEOUT_INIT
;
1867 tp
->mdev
= TCP_TIMEOUT_INIT
;
1869 /* So many TCP implementations out there (incorrectly) count the
1870 * initial SYN frame in their delayed-ACK and congestion control
1871 * algorithms that we must have the following bandaid to talk
1872 * efficiently to them. -DaveM
1876 /* See draft-stevens-tcpca-spec-01 for discussion of the
1877 * initialization of these values.
1879 tp
->snd_ssthresh
= TCP_INFINITE_SSTHRESH
;
1880 tp
->snd_cwnd_clamp
= ~0;
1881 tp
->mss_cache
= TCP_MSS_DEFAULT
;
1883 tp
->reordering
= sysctl_tcp_reordering
;
1884 icsk
->icsk_ca_ops
= &tcp_init_congestion_ops
;
1886 sk
->sk_state
= TCP_CLOSE
;
1888 sk
->sk_write_space
= sk_stream_write_space
;
1889 sock_set_flag(sk
, SOCK_USE_WRITE_QUEUE
);
1891 icsk
->icsk_af_ops
= &ipv4_specific
;
1892 icsk
->icsk_sync_mss
= tcp_sync_mss
;
1893 #ifdef CONFIG_TCP_MD5SIG
1894 tp
->af_specific
= &tcp_sock_ipv4_specific
;
1897 /* TCP Cookie Transactions */
1898 if (sysctl_tcp_cookie_size
> 0) {
1899 /* Default, cookies without s_data_payload. */
1901 kzalloc(sizeof(*tp
->cookie_values
),
1903 if (tp
->cookie_values
!= NULL
)
1904 kref_init(&tp
->cookie_values
->kref
);
1906 /* Presumed zeroed, in order of appearance:
1907 * cookie_in_always, cookie_out_never,
1908 * s_data_constant, s_data_in, s_data_out
1910 sk
->sk_sndbuf
= sysctl_tcp_wmem
[1];
1911 sk
->sk_rcvbuf
= sysctl_tcp_rmem
[1];
1914 percpu_counter_inc(&tcp_sockets_allocated
);
1920 void tcp_v4_destroy_sock(struct sock
*sk
)
1922 struct tcp_sock
*tp
= tcp_sk(sk
);
1924 tcp_clear_xmit_timers(sk
);
1926 tcp_cleanup_congestion_control(sk
);
1928 /* Cleanup up the write buffer. */
1929 tcp_write_queue_purge(sk
);
1931 /* Cleans up our, hopefully empty, out_of_order_queue. */
1932 __skb_queue_purge(&tp
->out_of_order_queue
);
1934 #ifdef CONFIG_TCP_MD5SIG
1935 /* Clean up the MD5 key list, if any */
1936 if (tp
->md5sig_info
) {
1937 tcp_v4_clear_md5_list(sk
);
1938 kfree(tp
->md5sig_info
);
1939 tp
->md5sig_info
= NULL
;
1943 #ifdef CONFIG_NET_DMA
1944 /* Cleans up our sk_async_wait_queue */
1945 __skb_queue_purge(&sk
->sk_async_wait_queue
);
1948 /* Clean prequeue, it must be empty really */
1949 __skb_queue_purge(&tp
->ucopy
.prequeue
);
1951 /* Clean up a referenced TCP bind bucket. */
1952 if (inet_csk(sk
)->icsk_bind_hash
)
1956 * If sendmsg cached page exists, toss it.
1958 if (sk
->sk_sndmsg_page
) {
1959 __free_page(sk
->sk_sndmsg_page
);
1960 sk
->sk_sndmsg_page
= NULL
;
1963 /* TCP Cookie Transactions */
1964 if (tp
->cookie_values
!= NULL
) {
1965 kref_put(&tp
->cookie_values
->kref
,
1966 tcp_cookie_values_release
);
1967 tp
->cookie_values
= NULL
;
1970 percpu_counter_dec(&tcp_sockets_allocated
);
1972 EXPORT_SYMBOL(tcp_v4_destroy_sock
);
1974 #ifdef CONFIG_PROC_FS
1975 /* Proc filesystem TCP sock list dumping. */
1977 static inline struct inet_timewait_sock
*tw_head(struct hlist_nulls_head
*head
)
1979 return hlist_nulls_empty(head
) ? NULL
:
1980 list_entry(head
->first
, struct inet_timewait_sock
, tw_node
);
1983 static inline struct inet_timewait_sock
*tw_next(struct inet_timewait_sock
*tw
)
1985 return !is_a_nulls(tw
->tw_node
.next
) ?
1986 hlist_nulls_entry(tw
->tw_node
.next
, typeof(*tw
), tw_node
) : NULL
;
1990 * Get next listener socket follow cur. If cur is NULL, get first socket
1991 * starting from bucket given in st->bucket; when st->bucket is zero the
1992 * very first socket in the hash table is returned.
1994 static void *listening_get_next(struct seq_file
*seq
, void *cur
)
1996 struct inet_connection_sock
*icsk
;
1997 struct hlist_nulls_node
*node
;
1998 struct sock
*sk
= cur
;
1999 struct inet_listen_hashbucket
*ilb
;
2000 struct tcp_iter_state
*st
= seq
->private;
2001 struct net
*net
= seq_file_net(seq
);
2004 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
2005 spin_lock_bh(&ilb
->lock
);
2006 sk
= sk_nulls_head(&ilb
->head
);
2010 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
2014 if (st
->state
== TCP_SEQ_STATE_OPENREQ
) {
2015 struct request_sock
*req
= cur
;
2017 icsk
= inet_csk(st
->syn_wait_sk
);
2021 if (req
->rsk_ops
->family
== st
->family
) {
2028 if (++st
->sbucket
>= icsk
->icsk_accept_queue
.listen_opt
->nr_table_entries
)
2031 req
= icsk
->icsk_accept_queue
.listen_opt
->syn_table
[st
->sbucket
];
2033 sk
= sk_next(st
->syn_wait_sk
);
2034 st
->state
= TCP_SEQ_STATE_LISTENING
;
2035 read_unlock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2037 icsk
= inet_csk(sk
);
2038 read_lock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2039 if (reqsk_queue_len(&icsk
->icsk_accept_queue
))
2041 read_unlock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2045 sk_nulls_for_each_from(sk
, node
) {
2046 if (sk
->sk_family
== st
->family
&& net_eq(sock_net(sk
), net
)) {
2050 icsk
= inet_csk(sk
);
2051 read_lock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2052 if (reqsk_queue_len(&icsk
->icsk_accept_queue
)) {
2054 st
->uid
= sock_i_uid(sk
);
2055 st
->syn_wait_sk
= sk
;
2056 st
->state
= TCP_SEQ_STATE_OPENREQ
;
2060 read_unlock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2062 spin_unlock_bh(&ilb
->lock
);
2064 if (++st
->bucket
< INET_LHTABLE_SIZE
) {
2065 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
2066 spin_lock_bh(&ilb
->lock
);
2067 sk
= sk_nulls_head(&ilb
->head
);
2075 static void *listening_get_idx(struct seq_file
*seq
, loff_t
*pos
)
2077 struct tcp_iter_state
*st
= seq
->private;
2082 rc
= listening_get_next(seq
, NULL
);
2084 while (rc
&& *pos
) {
2085 rc
= listening_get_next(seq
, rc
);
2091 static inline int empty_bucket(struct tcp_iter_state
*st
)
2093 return hlist_nulls_empty(&tcp_hashinfo
.ehash
[st
->bucket
].chain
) &&
2094 hlist_nulls_empty(&tcp_hashinfo
.ehash
[st
->bucket
].twchain
);
2098 * Get first established socket starting from bucket given in st->bucket.
2099 * If st->bucket is zero, the very first socket in the hash is returned.
2101 static void *established_get_first(struct seq_file
*seq
)
2103 struct tcp_iter_state
*st
= seq
->private;
2104 struct net
*net
= seq_file_net(seq
);
2108 for (; st
->bucket
<= tcp_hashinfo
.ehash_mask
; ++st
->bucket
) {
2110 struct hlist_nulls_node
*node
;
2111 struct inet_timewait_sock
*tw
;
2112 spinlock_t
*lock
= inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
);
2114 /* Lockless fast path for the common case of empty buckets */
2115 if (empty_bucket(st
))
2119 sk_nulls_for_each(sk
, node
, &tcp_hashinfo
.ehash
[st
->bucket
].chain
) {
2120 if (sk
->sk_family
!= st
->family
||
2121 !net_eq(sock_net(sk
), net
)) {
2127 st
->state
= TCP_SEQ_STATE_TIME_WAIT
;
2128 inet_twsk_for_each(tw
, node
,
2129 &tcp_hashinfo
.ehash
[st
->bucket
].twchain
) {
2130 if (tw
->tw_family
!= st
->family
||
2131 !net_eq(twsk_net(tw
), net
)) {
2137 spin_unlock_bh(lock
);
2138 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2144 static void *established_get_next(struct seq_file
*seq
, void *cur
)
2146 struct sock
*sk
= cur
;
2147 struct inet_timewait_sock
*tw
;
2148 struct hlist_nulls_node
*node
;
2149 struct tcp_iter_state
*st
= seq
->private;
2150 struct net
*net
= seq_file_net(seq
);
2155 if (st
->state
== TCP_SEQ_STATE_TIME_WAIT
) {
2159 while (tw
&& (tw
->tw_family
!= st
->family
|| !net_eq(twsk_net(tw
), net
))) {
2166 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
));
2167 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2169 /* Look for next non empty bucket */
2171 while (++st
->bucket
<= tcp_hashinfo
.ehash_mask
&&
2174 if (st
->bucket
> tcp_hashinfo
.ehash_mask
)
2177 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
));
2178 sk
= sk_nulls_head(&tcp_hashinfo
.ehash
[st
->bucket
].chain
);
2180 sk
= sk_nulls_next(sk
);
2182 sk_nulls_for_each_from(sk
, node
) {
2183 if (sk
->sk_family
== st
->family
&& net_eq(sock_net(sk
), net
))
2187 st
->state
= TCP_SEQ_STATE_TIME_WAIT
;
2188 tw
= tw_head(&tcp_hashinfo
.ehash
[st
->bucket
].twchain
);
2196 static void *established_get_idx(struct seq_file
*seq
, loff_t pos
)
2198 struct tcp_iter_state
*st
= seq
->private;
2202 rc
= established_get_first(seq
);
2205 rc
= established_get_next(seq
, rc
);
2211 static void *tcp_get_idx(struct seq_file
*seq
, loff_t pos
)
2214 struct tcp_iter_state
*st
= seq
->private;
2216 st
->state
= TCP_SEQ_STATE_LISTENING
;
2217 rc
= listening_get_idx(seq
, &pos
);
2220 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2221 rc
= established_get_idx(seq
, pos
);
2227 static void *tcp_seek_last_pos(struct seq_file
*seq
)
2229 struct tcp_iter_state
*st
= seq
->private;
2230 int offset
= st
->offset
;
2231 int orig_num
= st
->num
;
2234 switch (st
->state
) {
2235 case TCP_SEQ_STATE_OPENREQ
:
2236 case TCP_SEQ_STATE_LISTENING
:
2237 if (st
->bucket
>= INET_LHTABLE_SIZE
)
2239 st
->state
= TCP_SEQ_STATE_LISTENING
;
2240 rc
= listening_get_next(seq
, NULL
);
2241 while (offset
-- && rc
)
2242 rc
= listening_get_next(seq
, rc
);
2247 case TCP_SEQ_STATE_ESTABLISHED
:
2248 case TCP_SEQ_STATE_TIME_WAIT
:
2249 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2250 if (st
->bucket
> tcp_hashinfo
.ehash_mask
)
2252 rc
= established_get_first(seq
);
2253 while (offset
-- && rc
)
2254 rc
= established_get_next(seq
, rc
);
2262 static void *tcp_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2264 struct tcp_iter_state
*st
= seq
->private;
2267 if (*pos
&& *pos
== st
->last_pos
) {
2268 rc
= tcp_seek_last_pos(seq
);
2273 st
->state
= TCP_SEQ_STATE_LISTENING
;
2277 rc
= *pos
? tcp_get_idx(seq
, *pos
- 1) : SEQ_START_TOKEN
;
2280 st
->last_pos
= *pos
;
2284 static void *tcp_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2286 struct tcp_iter_state
*st
= seq
->private;
2289 if (v
== SEQ_START_TOKEN
) {
2290 rc
= tcp_get_idx(seq
, 0);
2294 switch (st
->state
) {
2295 case TCP_SEQ_STATE_OPENREQ
:
2296 case TCP_SEQ_STATE_LISTENING
:
2297 rc
= listening_get_next(seq
, v
);
2299 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2302 rc
= established_get_first(seq
);
2305 case TCP_SEQ_STATE_ESTABLISHED
:
2306 case TCP_SEQ_STATE_TIME_WAIT
:
2307 rc
= established_get_next(seq
, v
);
2312 st
->last_pos
= *pos
;
2316 static void tcp_seq_stop(struct seq_file
*seq
, void *v
)
2318 struct tcp_iter_state
*st
= seq
->private;
2320 switch (st
->state
) {
2321 case TCP_SEQ_STATE_OPENREQ
:
2323 struct inet_connection_sock
*icsk
= inet_csk(st
->syn_wait_sk
);
2324 read_unlock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2326 case TCP_SEQ_STATE_LISTENING
:
2327 if (v
!= SEQ_START_TOKEN
)
2328 spin_unlock_bh(&tcp_hashinfo
.listening_hash
[st
->bucket
].lock
);
2330 case TCP_SEQ_STATE_TIME_WAIT
:
2331 case TCP_SEQ_STATE_ESTABLISHED
:
2333 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
));
2338 static int tcp_seq_open(struct inode
*inode
, struct file
*file
)
2340 struct tcp_seq_afinfo
*afinfo
= PDE(inode
)->data
;
2341 struct tcp_iter_state
*s
;
2344 err
= seq_open_net(inode
, file
, &afinfo
->seq_ops
,
2345 sizeof(struct tcp_iter_state
));
2349 s
= ((struct seq_file
*)file
->private_data
)->private;
2350 s
->family
= afinfo
->family
;
2355 int tcp_proc_register(struct net
*net
, struct tcp_seq_afinfo
*afinfo
)
2358 struct proc_dir_entry
*p
;
2360 afinfo
->seq_fops
.open
= tcp_seq_open
;
2361 afinfo
->seq_fops
.read
= seq_read
;
2362 afinfo
->seq_fops
.llseek
= seq_lseek
;
2363 afinfo
->seq_fops
.release
= seq_release_net
;
2365 afinfo
->seq_ops
.start
= tcp_seq_start
;
2366 afinfo
->seq_ops
.next
= tcp_seq_next
;
2367 afinfo
->seq_ops
.stop
= tcp_seq_stop
;
2369 p
= proc_create_data(afinfo
->name
, S_IRUGO
, net
->proc_net
,
2370 &afinfo
->seq_fops
, afinfo
);
2375 EXPORT_SYMBOL(tcp_proc_register
);
2377 void tcp_proc_unregister(struct net
*net
, struct tcp_seq_afinfo
*afinfo
)
2379 proc_net_remove(net
, afinfo
->name
);
2381 EXPORT_SYMBOL(tcp_proc_unregister
);
2383 static void get_openreq4(struct sock
*sk
, struct request_sock
*req
,
2384 struct seq_file
*f
, int i
, int uid
, int *len
)
2386 const struct inet_request_sock
*ireq
= inet_rsk(req
);
2387 int ttd
= req
->expires
- jiffies
;
2389 seq_printf(f
, "%4d: %08X:%04X %08X:%04X"
2390 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2393 ntohs(inet_sk(sk
)->inet_sport
),
2395 ntohs(ireq
->rmt_port
),
2397 0, 0, /* could print option size, but that is af dependent. */
2398 1, /* timers active (only the expire timer) */
2399 jiffies_to_clock_t(ttd
),
2402 0, /* non standard timer */
2403 0, /* open_requests have no inode */
2404 atomic_read(&sk
->sk_refcnt
),
2409 static void get_tcp4_sock(struct sock
*sk
, struct seq_file
*f
, int i
, int *len
)
2412 unsigned long timer_expires
;
2413 struct tcp_sock
*tp
= tcp_sk(sk
);
2414 const struct inet_connection_sock
*icsk
= inet_csk(sk
);
2415 struct inet_sock
*inet
= inet_sk(sk
);
2416 __be32 dest
= inet
->inet_daddr
;
2417 __be32 src
= inet
->inet_rcv_saddr
;
2418 __u16 destp
= ntohs(inet
->inet_dport
);
2419 __u16 srcp
= ntohs(inet
->inet_sport
);
2422 if (icsk
->icsk_pending
== ICSK_TIME_RETRANS
) {
2424 timer_expires
= icsk
->icsk_timeout
;
2425 } else if (icsk
->icsk_pending
== ICSK_TIME_PROBE0
) {
2427 timer_expires
= icsk
->icsk_timeout
;
2428 } else if (timer_pending(&sk
->sk_timer
)) {
2430 timer_expires
= sk
->sk_timer
.expires
;
2433 timer_expires
= jiffies
;
2436 if (sk
->sk_state
== TCP_LISTEN
)
2437 rx_queue
= sk
->sk_ack_backlog
;
2440 * because we dont lock socket, we might find a transient negative value
2442 rx_queue
= max_t(int, tp
->rcv_nxt
- tp
->copied_seq
, 0);
2444 seq_printf(f
, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2445 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2446 i
, src
, srcp
, dest
, destp
, sk
->sk_state
,
2447 tp
->write_seq
- tp
->snd_una
,
2450 jiffies_to_clock_t(timer_expires
- jiffies
),
2451 icsk
->icsk_retransmits
,
2453 icsk
->icsk_probes_out
,
2455 atomic_read(&sk
->sk_refcnt
), sk
,
2456 jiffies_to_clock_t(icsk
->icsk_rto
),
2457 jiffies_to_clock_t(icsk
->icsk_ack
.ato
),
2458 (icsk
->icsk_ack
.quick
<< 1) | icsk
->icsk_ack
.pingpong
,
2460 tcp_in_initial_slowstart(tp
) ? -1 : tp
->snd_ssthresh
,
2464 static void get_timewait4_sock(struct inet_timewait_sock
*tw
,
2465 struct seq_file
*f
, int i
, int *len
)
2469 int ttd
= tw
->tw_ttd
- jiffies
;
2474 dest
= tw
->tw_daddr
;
2475 src
= tw
->tw_rcv_saddr
;
2476 destp
= ntohs(tw
->tw_dport
);
2477 srcp
= ntohs(tw
->tw_sport
);
2479 seq_printf(f
, "%4d: %08X:%04X %08X:%04X"
2480 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2481 i
, src
, srcp
, dest
, destp
, tw
->tw_substate
, 0, 0,
2482 3, jiffies_to_clock_t(ttd
), 0, 0, 0, 0,
2483 atomic_read(&tw
->tw_refcnt
), tw
, len
);
2488 static int tcp4_seq_show(struct seq_file
*seq
, void *v
)
2490 struct tcp_iter_state
*st
;
2493 if (v
== SEQ_START_TOKEN
) {
2494 seq_printf(seq
, "%-*s\n", TMPSZ
- 1,
2495 " sl local_address rem_address st tx_queue "
2496 "rx_queue tr tm->when retrnsmt uid timeout "
2502 switch (st
->state
) {
2503 case TCP_SEQ_STATE_LISTENING
:
2504 case TCP_SEQ_STATE_ESTABLISHED
:
2505 get_tcp4_sock(v
, seq
, st
->num
, &len
);
2507 case TCP_SEQ_STATE_OPENREQ
:
2508 get_openreq4(st
->syn_wait_sk
, v
, seq
, st
->num
, st
->uid
, &len
);
2510 case TCP_SEQ_STATE_TIME_WAIT
:
2511 get_timewait4_sock(v
, seq
, st
->num
, &len
);
2514 seq_printf(seq
, "%*s\n", TMPSZ
- 1 - len
, "");
2519 static struct tcp_seq_afinfo tcp4_seq_afinfo
= {
2523 .owner
= THIS_MODULE
,
2526 .show
= tcp4_seq_show
,
2530 static int __net_init
tcp4_proc_init_net(struct net
*net
)
2532 return tcp_proc_register(net
, &tcp4_seq_afinfo
);
2535 static void __net_exit
tcp4_proc_exit_net(struct net
*net
)
2537 tcp_proc_unregister(net
, &tcp4_seq_afinfo
);
2540 static struct pernet_operations tcp4_net_ops
= {
2541 .init
= tcp4_proc_init_net
,
2542 .exit
= tcp4_proc_exit_net
,
2545 int __init
tcp4_proc_init(void)
2547 return register_pernet_subsys(&tcp4_net_ops
);
2550 void tcp4_proc_exit(void)
2552 unregister_pernet_subsys(&tcp4_net_ops
);
2554 #endif /* CONFIG_PROC_FS */
2556 #ifdef CONFIG_INET_GRO
2557 extern atomic_t gro_timer_init
;
2559 struct sk_buff
**tcp4_gro_receive(struct sk_buff
**head
, struct sk_buff
*skb
)
2563 #ifdef CONFIG_INET_GRO
2564 if (atomic_read(&gro_timer_init
))
2565 return tcp_gro_receive(head
, skb
);
2567 /* We don't support hw-checksum. Skip this part to do real TCP merge */
2568 iph
= skb_gro_network_header(skb
);
2569 switch (skb
->ip_summed
) {
2570 case CHECKSUM_COMPLETE
:
2571 if (!tcp_v4_check(skb_gro_len(skb
), iph
->saddr
, iph
->daddr
,
2573 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
2579 NAPI_GRO_CB(skb
)->flush
= 1;
2583 return tcp_gro_receive(head
, skb
);
2584 #endif /* CONFIG_INET_GRO */
2586 EXPORT_SYMBOL(tcp4_gro_receive
);
2588 int BCMFASTPATH_HOST
tcp4_gro_complete(struct sk_buff
*skb
)
2590 struct iphdr
*iph
= ip_hdr(skb
);
2591 struct tcphdr
*th
= tcp_hdr(skb
);
2593 th
->check
= ~tcp_v4_check(skb
->len
- skb_transport_offset(skb
),
2594 iph
->saddr
, iph
->daddr
, 0);
2595 skb_shinfo(skb
)->gso_type
= SKB_GSO_TCPV4
;
2597 return tcp_gro_complete(skb
);
2599 EXPORT_SYMBOL(tcp4_gro_complete
);
2601 struct proto tcp_prot
= {
2603 .owner
= THIS_MODULE
,
2605 .connect
= tcp_v4_connect
,
2606 .disconnect
= tcp_disconnect
,
2607 .accept
= inet_csk_accept
,
2609 .init
= tcp_v4_init_sock
,
2610 .destroy
= tcp_v4_destroy_sock
,
2611 .shutdown
= tcp_shutdown
,
2612 .setsockopt
= tcp_setsockopt
,
2613 .getsockopt
= tcp_getsockopt
,
2614 .recvmsg
= tcp_recvmsg
,
2615 .sendmsg
= tcp_sendmsg
,
2616 .sendpage
= tcp_sendpage
,
2617 .backlog_rcv
= tcp_v4_do_rcv
,
2619 .unhash
= inet_unhash
,
2620 .get_port
= inet_csk_get_port
,
2621 .enter_memory_pressure
= tcp_enter_memory_pressure
,
2622 .sockets_allocated
= &tcp_sockets_allocated
,
2623 .orphan_count
= &tcp_orphan_count
,
2624 .memory_allocated
= &tcp_memory_allocated
,
2625 .memory_pressure
= &tcp_memory_pressure
,
2626 .sysctl_mem
= sysctl_tcp_mem
,
2627 .sysctl_wmem
= sysctl_tcp_wmem
,
2628 .sysctl_rmem
= sysctl_tcp_rmem
,
2629 .max_header
= MAX_TCP_HEADER
,
2630 .obj_size
= sizeof(struct tcp_sock
),
2631 .slab_flags
= SLAB_DESTROY_BY_RCU
,
2632 .twsk_prot
= &tcp_timewait_sock_ops
,
2633 .rsk_prot
= &tcp_request_sock_ops
,
2634 .h
.hashinfo
= &tcp_hashinfo
,
2635 .no_autobind
= true,
2636 #ifdef CONFIG_COMPAT
2637 .compat_setsockopt
= compat_tcp_setsockopt
,
2638 .compat_getsockopt
= compat_tcp_getsockopt
,
2641 EXPORT_SYMBOL(tcp_prot
);
2644 static int __net_init
tcp_sk_init(struct net
*net
)
2646 return inet_ctl_sock_create(&net
->ipv4
.tcp_sock
,
2647 PF_INET
, SOCK_RAW
, IPPROTO_TCP
, net
);
2650 static void __net_exit
tcp_sk_exit(struct net
*net
)
2652 inet_ctl_sock_destroy(net
->ipv4
.tcp_sock
);
2655 static void __net_exit
tcp_sk_exit_batch(struct list_head
*net_exit_list
)
2657 inet_twsk_purge(&tcp_hashinfo
, &tcp_death_row
, AF_INET
);
2660 static struct pernet_operations __net_initdata tcp_sk_ops
= {
2661 .init
= tcp_sk_init
,
2662 .exit
= tcp_sk_exit
,
2663 .exit_batch
= tcp_sk_exit_batch
,
2666 void __init
tcp_v4_init(void)
2668 inet_hashinfo_init(&tcp_hashinfo
);
2669 if (register_pernet_subsys(&tcp_sk_ops
))
2670 panic("Failed to create the TCP control socket.\n");