/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.182 1999/07/05 01:34:07 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller	:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen	:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an ACK bit.
 *		Andi Kleen	:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Fix new listen.
 *		Andi Kleen	:	Fix accept error reporting.
 */
#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/ipsec.h>

#include <asm/segment.h>

#include <linux/inet.h>
#include <linux/stddef.h>
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;
extern int sysctl_tcp_syncookies;
extern int sysctl_ip_dynaddr;
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;
/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
struct inode tcp_inode;
struct socket *tcp_socket = &tcp_inode.u.socket_i;

static void tcp_v4_send_reset(struct sk_buff *skb);

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
/* This is for sockets with full identity only.  Sockets here will always
 * be without wildcards and will have the following invariant:
 *
 *          TCP_ESTABLISHED <= sk->state < TCP_CLOSE
 *
 * First half of the table is for sockets not in TIME_WAIT, second half
 * is for TIME_WAIT sockets only.
 */
struct sock **tcp_ehash;

/* Ok, let's try this, I give up, we do need a local binding
 * TCP hash as well as the others for fast bind/connect.
 */
struct tcp_bind_bucket **tcp_bhash;

/* All sockets in TCP_LISTEN state will be in here.  This is the only table
 * where wildcard'd TCP sockets can exist.  Hash function here is just local
 * port number.
 */
struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];

/* Register cache. */
struct sock *tcp_regs[TCP_NUM_REGS];

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to a
 * wider range.
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = (1024 - 1);
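/* Note (editorial, inferred from the code below): tcp_port_rover is seeded one
 * below sysctl_local_port_range[0], so the first automatically selected local
 * port is 1024.  tcp_v4_get_port() stores the rover back after every search,
 * so successive automatic binds continue from the last allocated port and
 * spread over [low, high] instead of always starting at the bottom.
 */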
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size >> 1) - 1);
}

static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	__u32 laddr = sk->rcv_saddr;
	__u16 lport = sk->num;
	__u32 faddr = sk->daddr;
	__u16 fport = sk->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}
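/* Note (editorial): the XOR fold above is masked with ((tcp_ehash_size >> 1) - 1),
 * i.e. it always yields an index into the first (non-TIME_WAIT) half of tcp_ehash.
 * TIME_WAIT entries for the same tuple live at hash + (tcp_ehash_size >> 1),
 * which is exactly where __tcp_v4_lookup() below goes looking for them.
 */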
/* Allocate and initialize a new TCP local port bind bucket.
 * The sockhash lock must be held as a writer here.
 */
struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
{
	struct tcp_bind_bucket *tb;

	tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
	if (tb != NULL) {
		struct tcp_bind_bucket **head =
			&tcp_bhash[tcp_bhashfn(snum)];

		if ((tb->next = *head) != NULL)
			tb->next->pprev = &tb->next;
	}
	return tb;
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
/* Ensure that the bound bucket for the port exists.
 * Return 0 on success.
 */
static __inline__ int tcp_bucket_check(unsigned short snum)
{
	struct tcp_bind_bucket *tb;
	int ret = 0;

	SOCKHASH_LOCK_WRITE();
	tb = tcp_bhash[tcp_bhashfn(snum)];
	for ( ; (tb && (tb->port != snum)); tb = tb->next)
		;
	if (tb == NULL) {
		if ((tb = tcp_bucket_create(snum)) == NULL)
			ret = 1;
	}
	SOCKHASH_UNLOCK_WRITE();

	return ret;
}
#endif
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;

	if ((child->bind_next = tb->owners) != NULL)
		tb->owners->bind_pprev = &child->bind_next;
	tb->owners = child;
	child->bind_pprev = &tb->owners;
	child->prev = (struct sock *) tb;
}

__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	SOCKHASH_LOCK_WRITE();
	__tcp_inherit_port(sk, child);
	SOCKHASH_UNLOCK_WRITE();
}
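/* Note (editorial): a child socket created from a LISTEN parent is chained onto
 * the parent's existing tcp_bind_bucket here rather than getting a new bucket,
 * so the local port stays reserved for as long as any owner (the listener or
 * any of its accepted children) is still alive.
 */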
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_bucket *tb;

	SOCKHASH_LOCK_WRITE();
	if (snum == 0) {
		int rover = tcp_port_rover;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;

		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			tb = tcp_bhash[tcp_bhashfn(rover)];
			for ( ; tb; tb = tb->next)
				if (tb->port == rover)
					goto next;
			break;
		next:
			;
		} while (--remaining > 0);
		tcp_port_rover = rover;

		/* Exhausted local port range during search? */
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use. */
		snum = rover;
		tb = NULL;
	} else {
		for (tb = tcp_bhash[tcp_bhashfn(snum)];
		     tb != NULL;
		     tb = tb->next)
			if (tb->port == snum)
				break;
	}
	if (tb != NULL && tb->owners != NULL) {
		if (tb->fastreuse != 0 && sk->reuse != 0) {
			goto success;
		} else {
			struct sock *sk2 = tb->owners;
			int sk_reuse = sk->reuse;

			for ( ; sk2 != NULL; sk2 = sk2->bind_next) {
				if (sk->bound_dev_if == sk2->bound_dev_if) {
					if (!sk_reuse ||
					    !sk2->reuse ||
					    sk2->state == TCP_LISTEN) {
						if (!sk2->rcv_saddr ||
						    !sk->rcv_saddr ||
						    (sk2->rcv_saddr == sk->rcv_saddr))
							break;
					}
				}
			}
			/* If we found a conflict, fail. */
			if (sk2 != NULL)
				goto fail;
		}
	}
	if (tb == NULL &&
	    (tb = tcp_bucket_create(snum)) == NULL)
		goto fail;
	if (tb->owners == NULL) {
		if (sk->reuse && sk->state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
		tb->fastreuse = 0;
success:
	sk->num = snum;
	if ((sk->bind_next = tb->owners) != NULL)
		tb->owners->bind_pprev = &sk->bind_next;
	tb->owners = sk;
	sk->bind_pprev = &tb->owners;
	sk->prev = (struct sock *) tb;

	SOCKHASH_UNLOCK_WRITE();
	return 0;

fail:
	SOCKHASH_UNLOCK_WRITE();
	return 1;
}
/* Get rid of any references to a local port held by the
 * given sock.
 */
__inline__ void __tcp_put_port(struct sock *sk)
{
	struct tcp_bind_bucket *tb;

	tb = (struct tcp_bind_bucket *) sk->prev;
	if (sk->bind_next)
		sk->bind_next->bind_pprev = sk->bind_pprev;
	*(sk->bind_pprev) = sk->bind_next;
	if (tb->owners == NULL) {
		if (tb->next)
			tb->next->pprev = tb->pprev;
		*(tb->pprev) = tb->next;
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}

void tcp_put_port(struct sock *sk)
{
	SOCKHASH_LOCK_WRITE();
	__tcp_put_port(sk);
	SOCKHASH_UNLOCK_WRITE();
}
static __inline__ void __tcp_v4_hash(struct sock *sk)
{
	struct sock **skp;

	if (sk->state == TCP_LISTEN)
		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
	else
		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];

	if ((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;
	*skp = sk;
	sk->pprev = skp;
	if (sk->prot->highestinuse < sk->prot->inuse)
		sk->prot->highestinuse = sk->prot->inuse;
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->state != TCP_CLOSE) {
		SOCKHASH_LOCK_WRITE();
		__tcp_v4_hash(sk);
		SOCKHASH_UNLOCK_WRITE();
	}
}

static void tcp_v4_unhash(struct sock *sk)
{
	SOCKHASH_LOCK_WRITE();
	if (sk->pprev) {
		if (sk->next)
			sk->next->pprev = sk->pprev;
		*sk->pprev = sk->next;
		sk->pprev = NULL;
	}
	SOCKHASH_UNLOCK_WRITE();
}
/* Don't inline this cruft.  Here are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
{
	struct sock *sk;
	struct sock *result = NULL;
	int score, hiscore = 0;

	for (sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
		if (sk->num == hnum) {
			__u32 rcv_saddr = sk->rcv_saddr;

			score = 1;
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score++;
			}
			if (sk->bound_dev_if) {
				if (sk->bound_dev_if != dif)
					continue;
				score++;
			}
			if (score == 3)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result = sk;
			}
		}
	}
	return result;
}
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * The sockhash lock must be held as a reader here.
 */
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 dport, int dif)
{
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u16 hnum = ntohs(dport);
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	int hash;

	/* Check TCP register quick cache first. */
	sk = TCP_RHASH(sport);
	if (sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
		goto hit;

	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	hash = tcp_hashfn(daddr, hnum, saddr, sport);
	for (sk = tcp_ehash[hash]; sk; sk = sk->next) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
			if (sk->state == TCP_ESTABLISHED)
				TCP_RHASH(sport) = sk;
			goto hit; /* You sunk my battleship! */
		}
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	for (sk = tcp_ehash[hash+(tcp_ehash_size >> 1)]; sk; sk = sk->next)
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;

	sk = tcp_v4_lookup_listener(daddr, hnum, dif);
hit:
	return sk;
}

__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
{
	struct sock *sk;

	SOCKHASH_LOCK_READ();
	sk = __tcp_v4_lookup(saddr, sport, daddr, dport, dif);
	SOCKHASH_UNLOCK_READ();
	return sk;
}
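/* Note (editorial): lookup order used above is (1) the per-port register cache
 * TCP_RHASH, (2) the established half of tcp_ehash, (3) the TIME_WAIT half at
 * hash + (tcp_ehash_size >> 1), and finally (4) the wildcard-capable listening
 * hash via tcp_v4_lookup_listener().
 */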
#ifdef CONFIG_IP_TRANSPARENT_PROXY
/* Cleaned up a little and adapted to new bind bucket scheme.
 * Oddly, this should increase performance here for
 * transparent proxy, as tests within the inner loop have
 * been eliminated. -DaveM
 */
static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
					unsigned short rnum, unsigned long laddr,
					struct device *dev, unsigned short pnum,
					int dif)
{
	struct sock *s, *result = NULL;
	int badness = -1;
	u32 paddr = 0;
	unsigned short hnum = ntohs(num);
	unsigned short hpnum = ntohs(pnum);
	int firstpass = 1;

	if (dev && dev->ip_ptr) {
		struct in_device *idev = dev->ip_ptr;

		if (idev->ifa_list)
			paddr = idev->ifa_list->ifa_local;
	}

	/* We must obtain the sockhash lock here, we are always
	 * in BH context.
	 */
	SOCKHASH_LOCK_READ_BH();
	{
		struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)];

		for ( ; (tb && tb->port != hnum); tb = tb->next)
			;
		s = tb ? tb->owners : NULL;
	}
pass2:
	for (; s; s = s->bind_next) {
		int score = 0;

		if ((s->num != hpnum || s->rcv_saddr != paddr) &&
		    (s->num != hnum || s->rcv_saddr != laddr))
			continue;
		score++;
		if (s->daddr) {
			if (s->daddr != raddr)
				continue;
			score++;
		}
		if (s->bound_dev_if) {
			if (s->bound_dev_if != dif)
				continue;
			score++;
		}
		if (score == 4 && s->num == hnum) {
			result = s;
			break;
		} else if (score > badness && (s->num == hpnum || s->rcv_saddr)) {
			result = s;
			badness = score;
		}
	}
	if (firstpass--) {
		struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hpnum)];

		for ( ; (tb && tb->port != hpnum); tb = tb->next)
			;
		if (tb) {
			s = tb->owners;
			goto pass2;
		}
	}
	SOCKHASH_UNLOCK_READ_BH();
	return result;
}
#endif /* CONFIG_IP_TRANSPARENT_PROXY */
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(sk->saddr, sk->daddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
/* Check that a TCP address is unique, don't allow multiple
 * connects to/from the same address.  Actually we can optimize
 * quite a bit, since the socket about to connect is still
 * in TCP_CLOSE, a tcp_bind_bucket for the local port he will
 * use will exist, with a NULL owners list.  So check for that.
 * The good_socknum and verify_bind scheme we use makes this
 * work.
 */
static int tcp_v4_unique_address(struct sock *sk)
{
	struct tcp_bind_bucket *tb;
	unsigned short snum = sk->num;
	int retval = 1;

	/* Freeze the hash while we snoop around. */
	SOCKHASH_LOCK_READ();
	tb = tcp_bhash[tcp_bhashfn(snum)];
	for (; tb; tb = tb->next) {
		if (tb->port == snum && tb->owners != NULL) {
			/* Almost certainly the re-use port case, search the real hashes
			 * so it actually scales.
			 */
			sk = __tcp_v4_lookup(sk->daddr, sk->dport,
					     sk->rcv_saddr, snum, sk->bound_dev_if);
			SOCKHASH_UNLOCK_READ();

			if ((sk != NULL) && (sk->state != TCP_LISTEN))
				retval = 0;
			return retval;
		}
	}
	SOCKHASH_UNLOCK_READ();
	return retval;
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
	struct sk_buff *buff;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);

	/* Don't allow a double connect. */
	if (sk->daddr)
		return -EINVAL;

	if (addr_len < sizeof(struct sockaddr_in))
		return(-EINVAL);

	if (usin->sin_family != AF_INET) {
		static int complained;
		if (usin->sin_family)
			return(-EAFNOSUPPORT);
		if (!complained++)
			printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
	}

	nexthop = daddr = usin->sin_addr.s_addr;
	if (sk->opt && sk->opt->srr) {
		if (daddr == 0)
			return -EINVAL;
		nexthop = sk->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, sk->saddr,
			       RT_TOS(sk->ip_tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	dst_release(xchg(&sk->dst_cache, rt));

	buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
			    0, GFP_KERNEL);
	if (buff == NULL)
		return -ENOBUFS;

	/* Socket has no identity, so lock_sock() is useless.  Also
	 * since state==TCP_CLOSE (checked above) the socket cannot
	 * possibly be in the hashes.  TCP hash locking is only
	 * needed while checking quickly for a unique address.
	 * However, the socket does need to be (and is) locked
	 * in tcp_connect().
	 * Perhaps this addresses all of ANK's concerns. 8-)  -DaveM
	 */
	sk->dport = usin->sin_port;
	sk->daddr = rt->rt_dst;
	if (sk->opt && sk->opt->srr)
		sk->daddr = daddr;
	if (!sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	if (!tcp_v4_unique_address(sk)) {
		kfree_skb(buff);
		return -EADDRNOTAVAIL;
	}

	tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
						   sk->sport, usin->sin_port);

	tp->ext_header_len = 0;
	if (sk->opt)
		tp->ext_header_len = sk->opt->optlen;

	/* Reset mss clamp */
	tp->mss_clamp = ~0;

	if (!ip_dont_fragment(sk, &rt->u.dst) &&
	    rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
		/* Clamp mss at maximum of 536 and user_mss.
		   Probably, user ordered to override tiny segment size
		   in gigabit network. */
		tp->mss_clamp = max(tp->user_mss, 536);
	}

	tcp_connect(sk, buff, rt->u.dst.pmtu);
	return 0;
}
static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
{
	int retval = -EINVAL;

	/* Do sanity checking for sendmsg/sendto/send. */
	if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
		goto out;

	if (msg->msg_name) {
		struct sockaddr_in *addr = (struct sockaddr_in *)msg->msg_name;

		if (msg->msg_namelen < sizeof(*addr))
			goto out;
		if (addr->sin_family && addr->sin_family != AF_INET)
			goto out;
		if (sk->state == TCP_CLOSE)
			goto out;
		if (addr->sin_port != sk->dport)
			goto out;
		if (addr->sin_addr.s_addr != sk->daddr)
			goto out;
	}

	retval = tcp_do_sendmsg(sk, msg);
out:
	return retval;
}
/*
 * Do a linear search in the socket open_request list.
 * This should be replaced with a global hash table.
 */
static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
					      struct iphdr *iph,
					      struct tcphdr *th,
					      struct open_request **prevp)
{
	struct open_request *req, *prev;
	__u16 rport = th->source;

	/* assumption: the socket is not in use.
	 * as we checked the user count on tcp_rcv and we're
	 * running from a soft interrupt.
	 */
	prev = (struct open_request *) (&tp->syn_wait_queue);
	for (req = prev->dl_next; req; req = req->dl_next) {
		if (req->af.v4_req.rmt_addr == iph->saddr &&
		    req->af.v4_req.loc_addr == iph->daddr &&
		    req->rmt_port == rport
#ifdef CONFIG_IP_TRANSPARENT_PROXY
		    && req->lcl_port == th->dest
#endif
		    ) {
			*prevp = prev;
			return req;
		}
		prev = req;
	}
	return NULL;
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always <576bytes so they should go through
	 * unfragmented).
	 */
	if (sk->state == TCP_LISTEN)
		return;

	if (sk->lock.users != 0)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route.  We just assume that no packet_too_big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if (sk->dst_cache == NULL)
		return;

	ip_rt_update_pmtu(sk->dst_cache, mtu);
	if (sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > sk->dst_cache->pmtu) {
		tcp_sync_mss(sk, sk->dst_cache->pmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * sk->err and sk->err_soft should be atomic_t.
 */
void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
{
	struct iphdr *iph = (struct iphdr *)dp;
	struct tcphdr *th;
	struct tcp_opt *tp;
	struct sock *sk;
	__u32 seq;
	int err;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
#if ICMP_MIN_LENGTH < 14
	int no_flags = 0;
#else
#define no_flags 0
#endif

	if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
		icmp_statistics.IcmpInErrors++;
		return;
	}
#if ICMP_MIN_LENGTH < 14
	if (len < (iph->ihl << 2) + 14)
		no_flags = 1;
#endif

	th = (struct tcphdr *)(dp + (iph->ihl << 2));

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
	if (sk == NULL || sk->state == TCP_TIME_WAIT) {
		icmp_statistics.IcmpInErrors++;
		return;
	}

	tp = &sk->tp_pinfo.af_tcp;
	seq = ntohl(th->seq);
	if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
		net_statistics.OutOfWindowIcmps++;
		return;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
#ifndef OLD_SOURCE_QUENCH /* This is deprecated */
		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
		tp->snd_cwnd = tp->snd_ssthresh;
		tp->snd_cwnd_cnt = 0;
		tp->high_seq = tp->snd_nxt;
#endif
		return;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			return;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
			return;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		return;
	}

	switch (sk->state) {
		struct open_request *req, *prev;
	case TCP_LISTEN:
		/* The final ACK of the handshake should be already
		 * handled in the new socket context, not here.
		 * Strictly speaking - an ICMP error for the final
		 * ACK should set the opening flag, but that is too
		 * complicated right now.
		 */
		if (!no_flags && !th->syn && !th->ack)
			return;

		/* Prevent race conditions with accept() -
		 * ICMP is unreliable.
		 */
		if (sk->lock.users != 0) {
			net_statistics.LockDroppedIcmps++;
			/* If too many ICMPs get dropped on busy
			 * servers this needs to be solved differently.
			 */
			return;
		}

		req = tcp_v4_search_req(tp, iph, th, &prev);
		if (!req)
			return;
		if (seq != req->snt_isn) {
			net_statistics.OutOfWindowIcmps++;
			return;
		}
		if (req->sk) {
			/*
			 * Already in ESTABLISHED and a big socket is created,
			 * set error code there.
			 * The error will _not_ be reported in the accept(),
			 * but only with the next operation on the socket after
			 * accept.
			 */
			sk = req->sk;
		} else {
			/*
			 * Still in SYN_RECV, just remove it silently.
			 * There is no good way to pass the error to the newly
			 * created socket, and POSIX does not want network
			 * errors returned from accept().
			 */
			tcp_synq_unlink(tp, req, prev);
			req->class->destructor(req);
			tcp_openreq_free(req);
			return;
		}
		break;
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:	/* Cannot happen */
		if (!no_flags && !th->syn)
			return;
		tcp_statistics.TcpAttemptFails++;
		sk->err = err;
		sk->error_report(sk);
		return;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */
	if (sk->ip_recverr) {
		/* This code isn't serialized with the socket code */
		/* ANK (980927) ... which is harmless now,
		   sk->err's may be safely lost.
		 */
		sk->err = err;
		sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
	} else {	/* Only an error on timeout */
		sk->err_soft = err;
	}
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	th->check = 0;
	th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
				 csum_partial((char *)th, th->doff<<2, skb->csum));
}
/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 *	   So that we build reply only basing on parameters
 *	   arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL) {
#ifdef CONFIG_IP_TRANSPARENT_PROXY
		if (((struct rtable *)skb->dst)->rt_type == RTN_UNICAST)
			icmp_send(skb, ICMP_DEST_UNREACH,
				  ICMP_PORT_UNREACH, 0);
#endif
		return;
	}

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest = th->source;
	rth.source = th->dest;
	rth.doff = sizeof(struct tcphdr)/4;
	rth.rst = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = th->syn ? htonl(ntohl(th->seq)+1) : th->seq;
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr),
				      IPPROTO_TCP,
				      0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	tcp_statistics.TcpOutSegs++;
	tcp_statistics.TcpOutRsts++;
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
/*
 * Seems, I never wrote nothing more stupid.
 * I hope Gods will forgive me, but I cannot forgive myself 8)
 */
static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;
	struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
	struct sock *sk = NULL;
	int i;

	SOCKHASH_LOCK_READ();
	for (i = 0; i < TCP_LHTABLE_SIZE; i++) {
		for (sk = tcp_listening_hash[i]; sk; sk = sk->next) {
			struct open_request *dummy;
			if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, iph,
					      th, &dummy) &&
			    (!sk->bound_dev_if ||
			     sk->bound_dev_if == skb->dev->ifindex))
				goto out;
		}
	}
out:
	SOCKHASH_UNLOCK_READ();
	return sk;
}
/*
 * Check whether a received TCP packet might be for one of our
 * connections.
 */
int tcp_chkaddr(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;
	struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
	struct sock *sk;

	sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr,
			   th->dest, skb->dev->ifindex);

	if (!sk)
		return tcp_v4_search_proxy_openreq(skb) != NULL;

	if (sk->state == TCP_LISTEN) {
		struct open_request *dummy;
		if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, skb->nh.iph,
				      th, &dummy) &&
		    (!sk->bound_dev_if ||
		     sk->bound_dev_if == skb->dev->ifindex))
			return 1;
	}

	/* 0 means accept all LOCAL addresses here, not all the world... */

	if (sk->rcv_saddr == 0)
		return 0;

	return 1;
}
#endif /* CONFIG_IP_TRANSPARENT_PROXY */
/*
 * Send a SYN-ACK after having received an ACK.
 * This still operates on a open_request only, not on a big
 * socket.
 */
static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
{
	struct rtable *rt;
	struct ip_options *opt;
	struct sk_buff *skb;
	int mss;

	/* First, grab a route. */
	opt = req->af.v4_req.opt;
	if (ip_route_output(&rt, ((opt && opt->srr) ?
				  opt->faddr :
				  req->af.v4_req.rmt_addr),
			    req->af.v4_req.loc_addr,
			    RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
			    sk->bound_dev_if)) {
		ip_statistics.IpOutNoRoutes++;
		return;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		ip_statistics.IpOutNoRoutes++;
		return;
	}

	mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);

	skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
	if (skb) {
		struct tcphdr *th = skb->h.th;

#ifdef CONFIG_IP_TRANSPARENT_PROXY
		th->source = req->lcl_port; /* LVE */
#endif

		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len, skb->csum));

		ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
				      req->af.v4_req.rmt_addr, req->af.v4_req.opt);
	}
	ip_rt_put(rt);
}
/*
 * IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
	if (!req->sk && req->af.v4_req.opt)
		kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
}
static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (jiffies - warntime > HZ*60) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *
tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree_s(dopt, opt_size);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
 * It would be better to replace it with a global counter for all sockets
 * but then some measure against one socket starving all other sockets
 * would be needed.
 */
int sysctl_max_syn_backlog = 128;

struct or_calltable or_ipv4 = {
	tcp_v4_send_synack,
	tcp_v4_or_free,
	tcp_v4_send_reset
};

#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
#define BACKLOGMAX(sk) sysctl_max_syn_backlog
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
{
	struct tcp_opt tp;
	struct open_request *req;
	struct tcphdr *th = skb->h.th;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* If the socket is dead, don't accept the connection. */
	if (sk->dead)
		goto dead;

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST|RTCF_MULTICAST))
		goto drop;

	/* XXX: Check against a global syn pool counter. */
	if (BACKLOG(sk) > BACKLOGMAX(sk)) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			syn_flood_warning(skb);
			want_cookie = 1;
		} else
#endif
			goto drop;
	} else {
		if (isn == 0)
			isn = tcp_v4_init_sequence(sk, skb);
		BACKLOG(sk)++;
	}

	req = tcp_openreq_alloc();
	if (req == NULL)
		goto dropbacklog;

	req->rcv_wnd = 0;		/* So that tcp_send_synack() knows! */

	req->rcv_isn = TCP_SKB_CB(skb)->seq;
	tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;

	tp.mss_clamp = 65535;
	tcp_parse_options(NULL, th, &tp, want_cookie);
	if (tp.mss_clamp == 65535)
		tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
		tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
	req->mss = tp.mss_clamp;

	if (tp.saw_tstamp)
		req->ts_recent = tp.rcv_tsval;
	req->tstamp_ok = tp.tstamp_ok;
	req->sack_ok = tp.sack_ok;
	req->snd_wscale = tp.snd_wscale;
	req->wscale_ok = tp.wscale_ok;
	req->rmt_port = th->source;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	req->lcl_port = th->dest; /* LVE */
#endif
	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;

	/* Note that we ignore the isn passed from the TIME_WAIT
	 * state here. That's the price we pay for cookies.
	 */
	if (want_cookie)
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);

	req->snt_isn = isn;

	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);

	req->class = &or_ipv4;
	req->retrans = 0;
	req->sk = NULL;

	tcp_v4_send_synack(sk, req);

	if (want_cookie) {
		if (req->af.v4_req.opt)
			kfree(req->af.v4_req.opt);
		tcp_v4_or_free(req);
		tcp_openreq_free(req);
	} else {
		req->expires = jiffies + TCP_TIMEOUT_INIT;
		tcp_inc_slow_timer(TCP_SLT_SYNACK);
		tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
	}

	return 0;

dead:
	SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n", sk);
	tcp_statistics.TcpAttemptFails++;
	return -ENOTCONN; /* send reset */

dropbacklog:
	if (!want_cookie)
		BACKLOG(sk)--;
drop:
	tcp_statistics.TcpAttemptFails++;
	return 0;
}
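/* Note (editorial): when the SYN backlog is full and CONFIG_SYN_COOKIES is
 * enabled, the want_cookie path above does not keep any state: the connection
 * parameters are encoded into the cookie ISN and the open_request is freed
 * immediately after the SYN-ACK is sent, so the eventual ACK must be validated
 * purely from the cookie (see cookie_v4_check() in tcp_v4_hnd_req()).
 */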
/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * This function wants to be moved to a common for IPv[46] file. --ANK
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
{
	struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);

	if (newsk != NULL) {
		struct tcp_opt *newtp;
#ifdef CONFIG_FILTER
		struct sk_filter *filter;
#endif

		memcpy(newsk, sk, sizeof(*newsk));
		newsk->state = TCP_SYN_RECV;

		/* Clone the TCP header template */
		newsk->dport = req->rmt_port;

		sock_lock_init(newsk);

		atomic_set(&newsk->rmem_alloc, 0);
		skb_queue_head_init(&newsk->receive_queue);
		atomic_set(&newsk->wmem_alloc, 0);
		skb_queue_head_init(&newsk->write_queue);
		atomic_set(&newsk->omem_alloc, 0);

		newsk->backlog.head = newsk->backlog.tail = NULL;
		skb_queue_head_init(&newsk->error_queue);
		newsk->write_space = tcp_write_space;
#ifdef CONFIG_FILTER
		if ((filter = newsk->filter) != NULL)
			sk_filter_charge(newsk, filter);
#endif

		/* Now setup tcp_opt */
		newtp = &(newsk->tp_pinfo.af_tcp);
		newtp->pred_flags = 0;
		newtp->rcv_nxt = req->rcv_isn + 1;
		newtp->snd_nxt = req->snt_isn + 1;
		newtp->snd_una = req->snt_isn + 1;
		newtp->snd_wl1 = req->rcv_isn;
		newtp->snd_wl2 = req->snt_isn;

		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		newtp->snd_wnd = ntohs(skb->h.th->window);

		newtp->max_window = newtp->snd_wnd;
		newtp->retransmits = 0;
		newtp->last_ack_sent = req->rcv_isn + 1;
		newtp->mdev = TCP_TIMEOUT_INIT;

		/* So many TCP implementations out there (incorrectly) count the
		 * initial SYN frame in their delayed-ACK and congestion control
		 * algorithms that we must have the following bandaid to talk
		 * efficiently to them.  -DaveM
		 */
		newtp->snd_cwnd = 2;

		newtp->rto = TCP_TIMEOUT_INIT;
		newtp->packets_out = 0;
		newtp->fackets_out = 0;
		newtp->retrans_out = 0;
		newtp->high_seq = 0;
		newtp->snd_ssthresh = 0x7fffffff;
		newtp->snd_cwnd_cnt = 0;
		newtp->dup_acks = 0;
		newtp->delayed_acks = 0;
		init_timer(&newtp->retransmit_timer);
		newtp->retransmit_timer.function = &tcp_retransmit_timer;
		newtp->retransmit_timer.data = (unsigned long) newsk;
		init_timer(&newtp->delack_timer);
		newtp->delack_timer.function = &tcp_delack_timer;
		newtp->delack_timer.data = (unsigned long) newsk;
		skb_queue_head_init(&newtp->out_of_order_queue);
		newtp->send_head = newtp->retrans_head = NULL;
		newtp->rcv_wup = req->rcv_isn + 1;
		newtp->write_seq = req->snt_isn + 1;
		newtp->copied_seq = req->rcv_isn + 1;

		newtp->saw_tstamp = 0;
		newtp->mss_clamp = req->mss;

		init_timer(&newtp->probe_timer);
		newtp->probe_timer.function = &tcp_probe_timer;
		newtp->probe_timer.data = (unsigned long) newsk;
		newtp->probes_out = 0;
		newtp->syn_seq = req->rcv_isn;
		newtp->fin_seq = req->rcv_isn;
		newtp->urg_data = 0;
		tcp_synq_init(newtp);
		newtp->syn_backlog = 0;
		if (skb->len >= 536)
			newtp->last_seg_size = skb->len;

		/* Back to base struct sock members. */
		newsk->ack_backlog = 0;
		newsk->max_ack_backlog = SOMAXCONN;
		newsk->priority = 0;

		/* IP layer stuff */
		init_timer(&newsk->timer);
		newsk->timer.function = &net_timer;
		newsk->timer.data = (unsigned long) newsk;
		newsk->socket = NULL;

		newtp->tstamp_ok = req->tstamp_ok;
		if ((newtp->sack_ok = req->sack_ok) != 0)
			newtp->num_sacks = 0;
		newtp->window_clamp = req->window_clamp;
		newtp->rcv_wnd = req->rcv_wnd;
		newtp->wscale_ok = req->wscale_ok;
		if (newtp->wscale_ok) {
			newtp->snd_wscale = req->snd_wscale;
			newtp->rcv_wscale = req->rcv_wscale;
		} else {
			newtp->snd_wscale = newtp->rcv_wscale = 0;
			newtp->window_clamp = min(newtp->window_clamp, 65535);
		}
		if (newtp->tstamp_ok) {
			newtp->ts_recent = req->ts_recent;
			newtp->ts_recent_stamp = tcp_time_stamp;
			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			newtp->tcp_header_len = sizeof(struct tcphdr);
		}
	}
	return newsk;
}
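/* Note (editorial): the new sock starts life as a byte copy of the listener
 * (the memcpy above); every field that must not be shared - queues, timers,
 * sequence numbers, window and congestion state - is then reinitialised from
 * the open_request before the socket is exposed by tcp_v4_syn_recv_sock().
 */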
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
{
	struct ip_options *opt = req->af.v4_req.opt;
	struct tcp_opt *newtp;
	struct sock *newsk;

	if (sk->ack_backlog > sk->max_ack_backlog)
		goto exit; /* head drop */
	if (dst == NULL) {
		struct rtable *rt;

		if (ip_route_output(&rt,
				    opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
				    req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0))
			return NULL;
		dst = &rt->u.dst;
	}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	/* The new socket created for transparent proxy may fall
	 * into a non-existent bind bucket because sk->num != newsk->num.
	 * Ensure existence of the bucket now. The placement of the check
	 * later will require to destroy just created newsk in the case of fail.
	 * 1998/04/22 Andrey V. Savochkin <saw@msu.ru>
	 */
	if (tcp_bucket_check(ntohs(skb->h.th->dest)))
		goto exit;
#endif

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	sk->tp_pinfo.af_tcp.syn_backlog--;
	sk->ack_backlog++;

	newsk->dst_cache = dst;

	newtp = &(newsk->tp_pinfo.af_tcp);
	newsk->daddr = req->af.v4_req.rmt_addr;
	newsk->saddr = req->af.v4_req.loc_addr;
	newsk->rcv_saddr = req->af.v4_req.loc_addr;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	newsk->num = ntohs(skb->h.th->dest);
	newsk->sport = req->lcl_port;
#endif
	newsk->opt = req->af.v4_req.opt;
	newtp->ext_header_len = 0;
	if (newsk->opt)
		newtp->ext_header_len = newsk->opt->optlen;

	tcp_sync_mss(newsk, dst->pmtu);
	newtp->rcv_mss = newtp->mss_clamp;

	/* It would be better to use newtp->mss_clamp here */
	if (newsk->rcvbuf < (3 * newtp->pmtu_cookie))
		newsk->rcvbuf = min((3 * newtp->pmtu_cookie), sysctl_rmem_max);
	if (newsk->sndbuf < (3 * newtp->pmtu_cookie))
		newsk->sndbuf = min((3 * newtp->pmtu_cookie), sysctl_wmem_max);

	SOCKHASH_LOCK_WRITE();
	__tcp_v4_hash(newsk);
	__tcp_inherit_port(sk, newsk);
	SOCKHASH_UNLOCK_WRITE();

	sk->data_ready(sk, 0); /* Deliver SIGIO */

	return newsk;

exit:
	dst_release(dst);
	return NULL;
}
static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct open_request *req, *prev;

	req = tcp_v4_search_req(tp, skb->nh.iph, skb->h.th, &prev);
	if (!req)
		return;
	/* Sequence number check required by RFC793 */
	if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
	    after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
		return;
	tcp_synq_unlink(tp, req, prev);
	(req->sk ? sk->ack_backlog : tp->syn_backlog)--;
	req->class->destructor(req);
	tcp_openreq_free(req);

	net_statistics.EmbryonicRsts++;
}
/* Check for embryonic sockets (open_requests).  We check packets with
 * only the SYN bit set against the open_request queue too: This
 * increases connection latency a bit, but is required to detect
 * retransmitted SYNs.
 */
static inline struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	u32 flg = ((u32 *)th)[3];

	/* Check for RST */
	if (flg & __constant_htonl(0x00040000)) {
		tcp_v4_rst_req(sk, skb);
		return NULL;
	}

	/* Check for SYN|ACK */
	flg &= __constant_htonl(0x00120000);
	if (flg) {
		struct open_request *req, *dummy;
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

		/* Find possible connection requests. */
		req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy);
		if (req)
			sk = tcp_check_req(sk, skb, req);
#ifdef CONFIG_SYN_COOKIES
		else
			sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	}
	return sk;
}
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	int need_unlock = 0;
#ifdef CONFIG_FILTER
	struct sk_filter *filter = sk->filter;
	if (filter && sk_filter(skb, filter))
		goto discard;
#endif /* CONFIG_FILTER */

	/*
	 * This doesn't check if the socket has enough room for the packet.
	 * Either process the packet _without_ queueing it and then free it,
	 * or do the check later.
	 */
	skb_set_owner_r(skb, sk);

	if (sk->state == TCP_ESTABLISHED) { /* Fast path */
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		goto out_maybe_unlock;
	}

	if (sk->state == TCP_LISTEN) {
		struct sock *nsk;

		nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		/*
		 * Queue it on the new socket if the new socket is active,
		 * otherwise we just shortcircuit this and continue with
		 * the new socket.
		 */
		if (nsk != sk) {
			bh_lock_sock(nsk);
			if (nsk->lock.users != 0) {
				sk_add_backlog(nsk, skb);
				bh_unlock_sock(nsk);
				return 0;
			}
			need_unlock = 1;
			sk = nsk;
		}
	}

	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	goto out_maybe_unlock;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

out_maybe_unlock:
	if (need_unlock)
		bh_unlock_sock(sk);
	return 0;
}
int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Pull up the IP header. */
	__skb_pull(skb, skb->h.raw - skb->data);

	/* Count it even if it's bad */
	tcp_statistics.TcpInSegs++;

	if (len < sizeof(struct tcphdr))
		goto bad_packet;

	th = skb->h.th;

	/* Try to use the device checksum if provided. */
	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		skb->csum = csum_partial((char *)th, len, 0);
	case CHECKSUM_HW:
		if (tcp_v4_check(th, len, skb->nh.iph->saddr, skb->nh.iph->daddr, skb->csum)) {
			NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
					"from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
					"len=%d/%d\n",
					NIPQUAD(skb->nh.iph->saddr),
					ntohs(th->source),
					NIPQUAD(skb->nh.iph->daddr),
					ntohs(th->dest),
					len,
					ntohs(skb->nh.iph->tot_len)));
	bad_packet:
			tcp_statistics.TcpInErrs++;
			goto discard_it;
		}
	default:
		/* CHECKSUM_UNNECESSARY */
		break;
	}

#ifdef CONFIG_IP_TRANSPARENT_PROXY
	if (IPCB(skb)->redirport)
		sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source,
					 skb->nh.iph->daddr, skb->dev,
					 IPCB(skb)->redirport, skb->dev->ifindex);
	else {
#endif
		SOCKHASH_LOCK_READ_BH();
		sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
				     skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
		SOCKHASH_UNLOCK_READ_BH();
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	}
	if (!sk)
		sk = tcp_v4_search_proxy_openreq(skb);
#endif
	if (!sk)
		goto no_tcp_socket;
	if (!ipsec_sk_policy(sk, skb))
		goto discard_it;

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    len - th->doff*4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);

	if (sk->state == TCP_TIME_WAIT)
		goto do_time_wait;

	bh_lock_sock(sk);
	ret = 0;
	if (!sk->lock.users)
		ret = tcp_v4_do_rcv(sk, skb);
	else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	return ret;

no_tcp_socket:
	tcp_v4_send_reset(skb);

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

do_time_wait:
	if (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
				       skb, th, skb->len))
		goto no_tcp_socket;
	goto discard_it;
}
static void __tcp_v4_rehash(struct sock *sk)
{
	struct sock **skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];

	SOCKHASH_LOCK_WRITE();
	if (sk->pprev) {
		if (sk->next)
			sk->next->pprev = sk->pprev;
		*sk->pprev = sk->next;
		sk->pprev = NULL;
	}
	if ((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;
	*skp = sk;
	sk->pprev = skp;
	SOCKHASH_UNLOCK_WRITE();
}
int tcp_v4_rebuild_header(struct sock *sk)
{
	struct rtable *rt = (struct rtable *)sk->dst_cache;
	__u32 new_saddr;
	int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;

	if (rt == NULL)
		return 0;

	/* Force route checking if want_rewrite.
	 * The idea is good, the implementation is disgusting.
	 * Well, if I made bind on this socket, you cannot randomly overwrite
	 * its source address. --ANK
	 */
	if (want_rewrite) {
		int tmp;
		struct rtable *new_rt;
		__u32 old_saddr = rt->rt_src;

		/* Query new route using another rt buffer */
		tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
				       RT_TOS(sk->ip_tos)|sk->localroute,
				       sk->bound_dev_if);

		if (tmp == 0) {
			/* Only useful if different source addrs */
			if (new_rt->rt_src != old_saddr) {
				dst_release(sk->dst_cache);
				sk->dst_cache = &new_rt->u.dst;
				rt = new_rt;
				goto do_rewrite;
			}
			dst_release(&new_rt->u.dst);
		}
	}
	if (rt->u.dst.obsolete) {
		int err;
		err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
		if (err) {
			sk->err_soft = -err;
			sk->error_report(sk);
			return -1;
		}
		dst_release(xchg(&sk->dst_cache, &rt->u.dst));
	}

	return 0;

do_rewrite:
	new_saddr = rt->rt_src;

	/* Ouch!, this should not happen. */
	if (!sk->saddr || !sk->rcv_saddr) {
		printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
		       "saddr=%08lX rcv_saddr=%08lX\n",
		       ntohl(sk->saddr),
		       ntohl(sk->rcv_saddr));
		return 0;
	}

	if (new_saddr != sk->saddr) {
		if (sysctl_ip_dynaddr > 1) {
			printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
			       "from %d.%d.%d.%d to %d.%d.%d.%d\n",
			       NIPQUAD(sk->saddr),
			       NIPQUAD(new_saddr));
		}

		sk->saddr = new_saddr;
		sk->rcv_saddr = new_saddr;

		/* XXX The only one ugly spot where we need to
		 * XXX really change the sockets identity after
		 * XXX it has entered the hashes. -DaveM
		 */
		__tcp_v4_rehash(sk);
	}

	return 0;
}
static struct sock *tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
{
	return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
}

static void v4_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= sk->daddr;
	sin->sin_port		= sk->dport;
}
struct tcp_func ipv4_specific = {
	ip_queue_xmit,
	tcp_v4_send_check,
	tcp_v4_rebuild_header,
	tcp_v4_conn_request,
	tcp_v4_syn_recv_sock,
	tcp_v4_get_sock,
	sizeof(struct iphdr),

	ip_setsockopt,
	ip_getsockopt,
	v4_addr2sockaddr,
	sizeof(struct sockaddr_in)
};
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);

	tp->rto = TCP_TIMEOUT_INIT;		/*TCP_WRITE_TIME*/
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_cwnd_cnt = 0;
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */

	sk->state = TCP_CLOSE;
	sk->max_ack_backlog = SOMAXCONN;

	sk->write_space = tcp_write_space;

	/* Init SYN queue. */
	tcp_synq_init(tp);

	sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;

	return 0;
}
static int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	tcp_clear_xmit_timers(sk);

	if (sk->keepopen)
		tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);

	/* Cleanup up the write buffer. */
	while ((skb = __skb_dequeue(&sk->write_queue)) != NULL)
		kfree_skb(skb);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	while ((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL)
		kfree_skb(skb);

	/* Clean up a referenced TCP bind bucket, this only happens if a
	 * port is allocated for a socket, but it never fully connects.
	 */
	if (sk->prev != NULL)
		tcp_put_port(sk);

	return 0;
}
/* Proc filesystem TCP sock list dumping. */
static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i)
{
	sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u",
		i,
		(long unsigned int)req->af.v4_req.loc_addr,
		ntohs(sk->sport),
		(long unsigned int)req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		(unsigned long)(req->expires - jiffies),
		req->retrans,
		sk->socket ? sk->socket->inode->i_uid : 0,
		0,    /* non standard timer */
		0     /* open_requests have no inode */
		);
}
static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int timer_active, timer_active1, timer_active2;
	unsigned long timer_expires;
	struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;

	dest = sp->daddr;
	src = sp->rcv_saddr;
	destp = ntohs(sp->dport);
	srcp = ntohs(sp->sport);
	timer_active1 = tp->retransmit_timer.prev != NULL;
	timer_active2 = sp->timer.prev != NULL;
	timer_active = 0;
	timer_expires = (unsigned) -1;
	if (timer_active1 && tp->retransmit_timer.expires < timer_expires) {
		timer_active = 1;
		timer_expires = tp->retransmit_timer.expires;
	}
	if (timer_active2 && sp->timer.expires < timer_expires) {
		timer_active = 2;
		timer_expires = sp->timer.expires;
	}
	if (timer_active == 0)
		timer_expires = jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld",
		i, src, srcp, dest, destp, sp->state,
		tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
		timer_active, timer_expires-jiffies,
		tp->retransmits,
		sp->socket ? sp->socket->inode->i_uid : 0,
		timer_active ? sp->timeout : 0,
		sp->socket ? sp->socket->inode->i_ino : 0);
}
static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	extern int tcp_tw_death_row_slot;
	unsigned int dest, src;
	__u16 destp, srcp;
	int slot_dist;

	dest = tw->daddr;
	src = tw->rcv_saddr;
	destp = ntohs(tw->dport);
	srcp = ntohs(tw->sport);

	slot_dist = tw->death_slot;
	if (slot_dist > tcp_tw_death_row_slot)
		slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot;
	else
		slot_dist = tcp_tw_death_row_slot - slot_dist;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08X %08X %5d %8d %d",
		i, src, srcp, dest, destp, TCP_TIME_WAIT, 0, 0,
		3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0);
}
int tcp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
{
	int len = 0, num = 0, i;
	off_t begin, pos = 0;
	char tmpbuf[129];

	if (offset < 128)
		len += sprintf(buffer, "%-127s\n",
			       " sl local_address rem_address st tx_queue "
			       "rx_queue tr tm->when retrnsmt uid timeout inode");

	pos = 128;
	SOCKHASH_LOCK_READ();

	/* First, walk listening socket table. */
	for (i = 0; i < TCP_LHTABLE_SIZE; i++) {
		struct sock *sk = tcp_listening_hash[i];

		for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
			struct open_request *req;
			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

			if (sk->family != PF_INET)
				continue;
			pos += 128;
			if (pos >= offset) {
				get_tcp_sock(sk, tmpbuf, num);
				len += sprintf(buffer+len, "%-127s\n", tmpbuf);
			}
			for (req = tp->syn_wait_queue; req; req = req->dl_next, num++) {
				if (req->sk)
					continue;
				pos += 128;
				if (pos < offset)
					continue;
				get_openreq(sk, req, tmpbuf, num);
				len += sprintf(buffer+len, "%-127s\n", tmpbuf);
			}
		}
	}

	/* Next, walk established hash chain. */
	for (i = 0; i < (tcp_ehash_size >> 1); i++) {
		struct sock *sk;

		for (sk = tcp_ehash[i]; sk; sk = sk->next, num++) {
			if (sk->family != PF_INET)
				continue;
			pos += 128;
			if (pos < offset)
				continue;
			get_tcp_sock(sk, tmpbuf, num);
			len += sprintf(buffer+len, "%-127s\n", tmpbuf);
		}
	}

	/* Finally, walk time wait buckets. */
	for (i = (tcp_ehash_size >> 1); i < tcp_ehash_size; i++) {
		struct tcp_tw_bucket *tw;
		for (tw = (struct tcp_tw_bucket *)tcp_ehash[i];
		     tw != NULL;
		     tw = (struct tcp_tw_bucket *)tw->next, num++) {
			if (tw->family != PF_INET)
				continue;
			pos += 128;
			if (pos < offset)
				continue;
			get_timewait_sock(tw, tmpbuf, num);
			len += sprintf(buffer+len, "%-127s\n", tmpbuf);
		}
	}

	SOCKHASH_UNLOCK_READ();

	begin = len - (pos - offset);
	*start = buffer + begin;
	len -= begin;
	if (len > length)
		len = length;
	if (len < 0)
		len = 0;
	return len;
}
struct proto tcp_prot = {
	tcp_close,			/* close */
	tcp_v4_connect,			/* connect */
	tcp_accept,			/* accept */
	NULL,				/* retransmit */
	tcp_write_wakeup,		/* write_wakeup */
	tcp_read_wakeup,		/* read_wakeup */
	tcp_poll,			/* poll */
	tcp_ioctl,			/* ioctl */
	tcp_v4_init_sock,		/* init */
	tcp_v4_destroy_sock,		/* destroy */
	tcp_shutdown,			/* shutdown */
	tcp_setsockopt,			/* setsockopt */
	tcp_getsockopt,			/* getsockopt */
	tcp_v4_sendmsg,			/* sendmsg */
	tcp_recvmsg,			/* recvmsg */
	NULL,				/* bind */
	tcp_v4_do_rcv,			/* backlog_rcv */
	tcp_v4_hash,			/* hash */
	tcp_v4_unhash,			/* unhash */
	tcp_v4_get_port,		/* get_port */
	128,				/* max_header */
	0,				/* retransmits */
	"TCP",				/* name */
	0,				/* inuse */
	0				/* highestinuse */
};
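/* Note (editorial): the initializer above uses positional (not designated)
 * initialization, so the entries must stay in exactly the order the members
 * are declared in struct proto; the per-field comments only document that
 * ordering, they do not enforce it.
 */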
__initfunc(void tcp_v4_init(struct net_proto_family *ops))
{
	int err;

	tcp_inode.i_mode = S_IFSOCK;
	tcp_inode.i_sock = 1;
	tcp_inode.i_uid = 0;
	tcp_inode.i_gid = 0;
	init_waitqueue_head(&tcp_inode.i_wait);
	init_waitqueue_head(&tcp_inode.u.socket_i.wait);

	tcp_socket->inode = &tcp_inode;
	tcp_socket->state = SS_UNCONNECTED;
	tcp_socket->type = SOCK_RAW;

	if ((err = ops->create(tcp_socket, IPPROTO_TCP)) < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->allocation = GFP_ATOMIC;
	tcp_socket->sk->ip_ttl = MAXTTL;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->prot->unhash(tcp_socket->sk);
}