2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.76 1997/12/07 04:44:19 freitag Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics (ifdefed by
43 * Juan Jose Ciarlante: ip_dynaddr bits
46 #include <linux/config.h>
47 #include <linux/types.h>
48 #include <linux/fcntl.h>
49 #include <linux/random.h>
50 #include <linux/ipsec.h>
51 #include <linux/inet.h>
57 #include <asm/segment.h>
59 extern int sysctl_tcp_sack
;
60 extern int sysctl_tcp_tsack
;
61 extern int sysctl_tcp_timestamps
;
62 extern int sysctl_tcp_window_scaling
;
63 extern int sysctl_tcp_syncookies
;
64 extern int sysctl_ip_dynaddr
;
66 /* Check TCP sequence numbers in ICMP packets. */
67 #define ICMP_PARANOIA 1
69 #define ICMP_MIN_LENGTH 4
71 #define ICMP_MIN_LENGTH 8
74 static void tcp_v4_send_reset(struct sk_buff
*skb
);
76 void tcp_v4_send_check(struct sock
*sk
, struct tcphdr
*th
, int len
,
79 /* This is for sockets with full identity only. Sockets here will always
80 * be without wildcards and will have the following invariant:
81 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
83 * First half of the table is for sockets not in TIME_WAIT, second half
84 * is for TIME_WAIT sockets only.
86 struct sock
*tcp_established_hash
[TCP_HTABLE_SIZE
];
88 /* All sockets in TCP_LISTEN state will be in here. This is the only table
89 * where wildcard'd TCP sockets can exist. Hash function here is just local
92 struct sock
*tcp_listening_hash
[TCP_LHTABLE_SIZE
];
94 /* Ok, let's try this, I give up, we do need a local binding
95 * TCP hash as well as the others for fast bind/connect.
97 struct sock
*tcp_bound_hash
[TCP_BHTABLE_SIZE
];
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
104 int sysctl_local_port_range
[2] = { 1024, 4999 };
106 static __inline__
int tcp_hashfn(__u32 laddr
, __u16 lport
,
107 __u32 faddr
, __u16 fport
)
109 return ((laddr
^ lport
) ^ (faddr
^ fport
)) & ((TCP_HTABLE_SIZE
/2) - 1);
112 static __inline__
int tcp_sk_hashfn(struct sock
*sk
)
114 __u32 laddr
= sk
->rcv_saddr
;
115 __u16 lport
= sk
->num
;
116 __u32 faddr
= sk
->daddr
;
117 __u16 fport
= sk
->dummy_th
.dest
;
119 return tcp_hashfn(laddr
, lport
, faddr
, fport
);
122 static int tcp_v4_verify_bind(struct sock
*sk
, unsigned short snum
)
125 int retval
= 0, sk_reuse
= sk
->reuse
;
128 sk2
= tcp_bound_hash
[tcp_bhashfn(snum
)];
129 for(; sk2
!= NULL
; sk2
= sk2
->bind_next
) {
130 if((sk2
->num
== snum
) && (sk2
!= sk
)) {
131 unsigned char state
= sk2
->state
;
132 int sk2_reuse
= sk2
->reuse
;
134 /* Two sockets can be bound to the same port if they're
135 * bound to different interfaces.
138 if(sk
->bound_dev_if
!= sk2
->bound_dev_if
)
141 if(!sk2
->rcv_saddr
|| !sk
->rcv_saddr
) {
144 (state
== TCP_LISTEN
)) {
148 } else if(sk2
->rcv_saddr
== sk
->rcv_saddr
) {
151 (state
== TCP_LISTEN
)) {
163 static __inline__
int tcp_lport_inuse(int num
)
165 struct sock
*sk
= tcp_bound_hash
[tcp_bhashfn(num
)];
167 for(; sk
!= NULL
; sk
= sk
->bind_next
) {
174 /* Find a "good" local port, this is family independent.
175 * There are several strategies working in unison here to
176 * get the best possible performance. The current socket
177 * load is kept track of, if it is zero there is a strong
178 * likelihood that there is a zero length chain we will
179 * find with a small amount of searching, else the load is
180 * what we shoot for when the chains all have at least
181 * one entry. The base helps us walk the chains in an
182 * order such that a good chain is found as quickly as possible. -DaveM
184 unsigned short tcp_good_socknum(void)
186 static int start
= 0;
187 static int binding_contour
= 0;
189 int size
= 32767; /* a big num. */
190 int retval
= 0, i
, end
, bc
;
193 if (start
> sysctl_local_port_range
[1] || start
< sysctl_local_port_range
[0])
194 start
= sysctl_local_port_range
[0];
195 i
= tcp_bhashfn(start
);
196 end
= i
+ TCP_BHTABLE_SIZE
;
197 bc
= binding_contour
;
199 struct sock
*sk
= tcp_bound_hash
[i
&(TCP_BHTABLE_SIZE
-1)];
201 /* find the smallest value no smaller than start
202 * that has this hash value.
204 retval
= tcp_bhashnext(start
-1,i
&(TCP_BHTABLE_SIZE
-1));
206 /* Check for decreasing load. */
212 do { sk
= sk
->bind_next
; } while (++j
< size
&& sk
);
214 best
= i
&(TCP_BHTABLE_SIZE
-1);
216 if (bc
&& size
<= bc
)
223 /* Socket load is increasing, adjust our load average. */
224 binding_contour
= size
;
226 if (size
< binding_contour
)
227 binding_contour
= size
;
229 retval
= tcp_bhashnext(start
-1,i
);
231 best
= retval
; /* mark the starting point to avoid infinite loops */
232 while(tcp_lport_inuse(retval
)) {
233 retval
= tcp_bhashnext(retval
,i
);
234 if (retval
> sysctl_local_port_range
[1]) /* Upper bound */
235 retval
= tcp_bhashnext(sysctl_local_port_range
[0],i
);
236 if (retval
== best
) {
237 /* This hash chain is full. No answer. */
244 start
= (retval
+ 1);
250 static void tcp_v4_hash(struct sock
*sk
)
256 if(state
!= TCP_CLOSE
|| !sk
->dead
) {
259 if(state
== TCP_LISTEN
)
260 skp
= &tcp_listening_hash
[tcp_sk_listen_hashfn(sk
)];
262 skp
= &tcp_established_hash
[tcp_sk_hashfn(sk
)];
264 if((sk
->next
= *skp
) != NULL
)
265 (*skp
)->pprev
= &sk
->next
;
273 static void tcp_v4_unhash(struct sock
*sk
)
278 sk
->next
->pprev
= sk
->pprev
;
279 *sk
->pprev
= sk
->next
;
281 tcp_sk_unbindify(sk
);
286 static void tcp_v4_rehash(struct sock
*sk
)
294 sk
->next
->pprev
= sk
->pprev
;
295 *sk
->pprev
= sk
->next
;
297 tcp_sk_unbindify(sk
);
299 if(state
!= TCP_CLOSE
|| !sk
->dead
) {
302 if(state
== TCP_LISTEN
) {
303 skp
= &tcp_listening_hash
[tcp_sk_listen_hashfn(sk
)];
305 int hash
= tcp_sk_hashfn(sk
);
306 if(state
== TCP_TIME_WAIT
)
307 hash
+= (TCP_HTABLE_SIZE
/2);
308 skp
= &tcp_established_hash
[hash
];
311 if((sk
->next
= *skp
) != NULL
)
312 (*skp
)->pprev
= &sk
->next
;
320 /* Don't inline this cruft. Here are some nice properties to
321 * exploit here. The BSD API does not allow a listening TCP
322 * to specify the remote port nor the remote address for the
323 * connection. So always assume those are both wildcarded
324 * during the search since they can never be otherwise.
326 static struct sock
*tcp_v4_lookup_listener(u32 daddr
, unsigned short hnum
, int dif
)
329 struct sock
*result
= NULL
;
333 for(sk
= tcp_listening_hash
[tcp_lhashfn(hnum
)]; sk
; sk
= sk
->next
) {
334 if(sk
->num
== hnum
) {
335 __u32 rcv_saddr
= sk
->rcv_saddr
;
339 if (rcv_saddr
!= daddr
)
343 if (sk
->bound_dev_if
) {
344 if (sk
->bound_dev_if
!= dif
)
350 if (score
> hiscore
) {
359 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
360 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
362 static inline struct sock
*__tcp_v4_lookup(struct tcphdr
*th
,
363 u32 saddr
, u16 sport
, u32 daddr
, u16 dport
, int dif
)
365 unsigned short hnum
= ntohs(dport
);
367 int hash
= tcp_hashfn(daddr
, hnum
, saddr
, sport
);
369 /* Optimize here for direct hit, only listening connections can
370 * have wildcards anyways. It is assumed that this code only
371 * gets called from within NET_BH.
373 for(sk
= tcp_established_hash
[hash
]; sk
; sk
= sk
->next
)
374 if(sk
->daddr
== saddr
&& /* remote address */
375 sk
->dummy_th
.dest
== sport
&& /* remote port */
376 sk
->num
== hnum
&& /* local port */
377 sk
->rcv_saddr
== daddr
&& /* local address */
378 (!sk
->bound_dev_if
|| sk
->bound_dev_if
== dif
))
379 goto hit
; /* You sunk my battleship! */
381 /* Must check for a TIME_WAIT'er before going to listener hash. */
382 for(sk
= tcp_established_hash
[hash
+(TCP_HTABLE_SIZE
/2)]; sk
; sk
= sk
->next
)
383 if(sk
->daddr
== saddr
&& /* remote address */
384 sk
->dummy_th
.dest
== sport
&& /* remote port */
385 sk
->num
== hnum
&& /* local port */
386 sk
->rcv_saddr
== daddr
&& /* local address */
387 (!sk
->bound_dev_if
|| sk
->bound_dev_if
== dif
))
390 sk
= tcp_v4_lookup_listener(daddr
, hnum
, dif
);
395 __inline__
struct sock
*tcp_v4_lookup(u32 saddr
, u16 sport
, u32 daddr
, u16 dport
, int dif
)
397 return __tcp_v4_lookup(0, saddr
, sport
, daddr
, dport
, dif
);
400 #ifdef CONFIG_IP_TRANSPARENT_PROXY
401 #define secondlist(hpnum, sk, fpass) \
402 ({ struct sock *s1; if(!(sk) && (fpass)--) \
403 s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \
409 #define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
410 secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass))
412 #define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
413 secondlist((hpnum),(sk)->bind_next,(fpass))
415 static struct sock
*tcp_v4_proxy_lookup(unsigned short num
, unsigned long raddr
,
416 unsigned short rnum
, unsigned long laddr
,
417 struct device
*dev
, unsigned short pnum
,
420 struct sock
*s
, *result
= NULL
;
423 unsigned short hnum
= ntohs(num
);
424 unsigned short hpnum
= ntohs(pnum
);
427 if(dev
&& dev
->ip_ptr
) {
428 struct in_device
*idev
= dev
->ip_ptr
;
431 paddr
= idev
->ifa_list
->ifa_local
;
434 /* This code must run only from NET_BH. */
435 for(s
= tcp_v4_proxy_loop_init(hnum
, hpnum
, s
, firstpass
);
437 s
= tcp_v4_proxy_loop_next(hnum
, hpnum
, s
, firstpass
)) {
438 if(s
->num
== hnum
|| s
->num
== hpnum
) {
440 if(s
->dead
&& (s
->state
== TCP_CLOSE
))
443 if((s
->num
!= hpnum
|| s
->rcv_saddr
!= paddr
) &&
444 (s
->num
!= hnum
|| s
->rcv_saddr
!= laddr
))
449 if(s
->daddr
!= raddr
)
453 if(s
->dummy_th
.dest
) {
454 if(s
->dummy_th
.dest
!= rnum
)
458 if(s
->bound_dev_if
) {
459 if(s
->bound_dev_if
!= dif
)
463 if(score
== 4 && s
->num
== hnum
) {
466 } else if(score
> badness
&& (s
->num
== hpnum
|| s
->rcv_saddr
)) {
476 #undef tcp_v4_proxy_loop_init
477 #undef tcp_v4_proxy_loop_next
481 static inline __u32
tcp_v4_init_sequence(struct sock
*sk
, struct sk_buff
*skb
)
483 return secure_tcp_sequence_number(sk
->saddr
, sk
->daddr
,
493 * Check that a TCP address is unique, don't allow multiple
494 * connects to/from the same address
497 static int tcp_unique_address(u32 saddr
, u16 snum
, u32 daddr
, u16 dnum
)
499 int retval
= 1, hashent
= tcp_hashfn(saddr
, snum
, daddr
, dnum
);
502 /* Make sure we are allowed to connect here.
503 * But freeze the hash while we snoop around.
506 sk
= tcp_established_hash
[hashent
];
507 for (; sk
!= NULL
; sk
= sk
->next
) {
508 if(sk
->daddr
== daddr
&& /* remote address */
509 sk
->dummy_th
.dest
== dnum
&& /* remote port */
510 sk
->num
== snum
&& /* local port */
511 sk
->saddr
== saddr
) { /* local address */
517 /* Must check TIME_WAIT'ers too. */
518 sk
= tcp_established_hash
[hashent
+ (TCP_HTABLE_SIZE
/2)];
519 for (; sk
!= NULL
; sk
= sk
->next
) {
520 if(sk
->daddr
== daddr
&& /* remote address */
521 sk
->dummy_th
.dest
== dnum
&& /* remote port */
522 sk
->num
== snum
&& /* local port */
523 sk
->saddr
== saddr
) { /* local address */
535 * This will initiate an outgoing connection.
538 int tcp_v4_connect(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
540 struct sk_buff
*buff
;
544 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
545 struct sockaddr_in
*usin
= (struct sockaddr_in
*) uaddr
;
547 if (sk
->state
!= TCP_CLOSE
)
550 /* Don't allow a double connect. */
554 if (addr_len
< sizeof(struct sockaddr_in
))
557 if (usin
->sin_family
!= AF_INET
) {
558 static int complained
;
559 if (usin
->sin_family
)
560 return(-EAFNOSUPPORT
);
562 printk(KERN_DEBUG
"%s forgot to set AF_INET in " __FUNCTION__
"\n", current
->comm
);
566 dst_release(sk
->dst_cache
);
567 sk
->dst_cache
= NULL
;
570 tmp
= ip_route_connect(&rt
, usin
->sin_addr
.s_addr
, sk
->saddr
,
571 RT_TOS(sk
->ip_tos
)|(sk
->localroute
|| 0), sk
->bound_dev_if
);
575 if (rt
->rt_flags
&(RTCF_MULTICAST
|RTCF_BROADCAST
)) {
580 if (!tcp_unique_address(rt
->rt_src
, sk
->num
, rt
->rt_dst
,
583 return -EADDRNOTAVAIL
;
588 /* Do this early, so there is less state to unwind on failure. */
589 buff
= sock_wmalloc(sk
, MAX_SYN_SIZE
, 0, GFP_KERNEL
);
596 sk
->dst_cache
= &rt
->u
.dst
;
597 sk
->daddr
= rt
->rt_dst
;
599 sk
->saddr
= rt
->rt_src
;
600 sk
->rcv_saddr
= sk
->saddr
;
602 if (sk
->priority
== 0)
603 sk
->priority
= rt
->u
.dst
.priority
;
605 sk
->dummy_th
.dest
= usin
->sin_port
;
607 sk
->write_seq
= secure_tcp_sequence_number(sk
->saddr
, sk
->daddr
,
613 tp
->snd_wl2
= sk
->write_seq
;
614 tp
->snd_una
= sk
->write_seq
;
620 /* Put in the IP header and routing stuff. */
621 tmp
= ip_build_header(buff
, sk
);
623 /* Caller has done ip_rt_put(rt) and set sk->dst_cache
624 * to NULL. We must unwind the half built TCP socket
625 * state so that this failure does not create a "stillborn"
626 * sock (ie. future re-tries of connect() would fail).
629 sk
->saddr
= sk
->rcv_saddr
= 0;
630 kfree_skb(buff
, FREE_WRITE
);
632 return(-ENETUNREACH
);
635 /* No failure conditions can result past this point. */
637 th
= (struct tcphdr
*) skb_put(buff
,sizeof(struct tcphdr
));
640 memcpy(th
,(void *)&(sk
->dummy_th
), sizeof(*th
));
641 buff
->seq
= sk
->write_seq
++;
642 th
->seq
= htonl(buff
->seq
);
643 tp
->snd_nxt
= sk
->write_seq
;
644 buff
->end_seq
= sk
->write_seq
;
648 sk
->mtu
= rt
->u
.dst
.pmtu
;
649 if ((sk
->ip_pmtudisc
== IP_PMTUDISC_DONT
||
650 (sk
->ip_pmtudisc
== IP_PMTUDISC_WANT
&&
651 rt
->rt_flags
&RTCF_NOPMTUDISC
)) &&
652 rt
->u
.dst
.pmtu
> 576)
656 sk
->mtu
= 64; /* Sanity limit */
659 sk
->mss
= sk
->user_mss
;
661 sk
->mss
= (sk
->mtu
- sizeof(struct iphdr
) -
662 sizeof(struct tcphdr
));
665 printk(KERN_DEBUG
"intial sk->mss below 1\n");
666 sk
->mss
= 1; /* Sanity limit */
669 tp
->window_clamp
= rt
->u
.dst
.window
;
670 tcp_select_initial_window(sock_rspace(sk
)/2,sk
->mss
,
673 sysctl_tcp_window_scaling
,
675 th
->window
= htons(tp
->rcv_wnd
);
677 tmp
= tcp_syn_build_options(buff
, sk
->mss
, sysctl_tcp_sack
,
678 sysctl_tcp_timestamps
,
679 sysctl_tcp_window_scaling
,tp
->rcv_wscale
);
681 th
->doff
= (sizeof(*th
)+ tmp
)>>2;
683 tcp_v4_send_check(sk
, th
, sizeof(struct tcphdr
) + tmp
, buff
);
685 tcp_set_state(sk
,TCP_SYN_SENT
);
687 /* Socket identity change complete, no longer
688 * in TCP_CLOSE, so rehash.
692 tp
->rto
= rt
->u
.dst
.rtt
;
694 tcp_init_xmit_timers(sk
);
696 /* Now works the right way instead of a hacked initial setting. */
699 skb_queue_tail(&sk
->write_queue
, buff
);
702 buff
->when
= jiffies
;
704 ip_queue_xmit(skb_clone(buff
, GFP_KERNEL
));
706 /* Timer for repeating the SYN until an answer. */
707 tcp_reset_xmit_timer(sk
, TIME_RETRANS
, tp
->rto
);
708 tcp_statistics
.TcpActiveOpens
++;
709 tcp_statistics
.TcpOutSegs
++;
715 static int tcp_v4_sendmsg(struct sock
*sk
, struct msghdr
*msg
, int len
)
717 int retval
= -EINVAL
;
719 /* Do sanity checking for sendmsg/sendto/send. */
720 if (msg
->msg_flags
& ~(MSG_OOB
|MSG_DONTROUTE
|MSG_DONTWAIT
))
723 struct sockaddr_in
*addr
=(struct sockaddr_in
*)msg
->msg_name
;
725 if (msg
->msg_namelen
< sizeof(*addr
))
727 if (addr
->sin_family
&& addr
->sin_family
!= AF_INET
)
730 if(sk
->state
== TCP_CLOSE
)
733 if (addr
->sin_port
!= sk
->dummy_th
.dest
)
735 if (addr
->sin_addr
.s_addr
!= sk
->daddr
)
740 retval
= tcp_do_sendmsg(sk
, msg
->msg_iovlen
, msg
->msg_iov
,
751 * Do a linear search in the socket open_request list.
752 * This should be replaced with a global hash table.
754 static struct open_request
*tcp_v4_search_req(struct tcp_opt
*tp
,
757 struct open_request
**prevp
)
759 struct open_request
*req
, *prev
;
760 __u16 rport
= th
->source
;
762 /* assumption: the socket is not in use.
763 * as we checked the user count on tcp_rcv and we're
764 * running from a soft interrupt.
766 prev
= (struct open_request
*) (&tp
->syn_wait_queue
);
767 for (req
= prev
->dl_next
; req
; req
= req
->dl_next
) {
768 if (req
->af
.v4_req
.rmt_addr
== iph
->saddr
&&
769 req
->af
.v4_req
.loc_addr
== iph
->daddr
&&
770 req
->rmt_port
== rport
) {
781 * This routine does path mtu discovery as defined in RFC1191.
783 static inline void do_pmtu_discovery(struct sock
*sk
, struct iphdr
*ip
)
786 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
788 /* We're not interested in TCP_LISTEN and open_requests (SYN-ACKs
789 * sent out by Linux are always <576 bytes so they should go through
792 if (sk
->state
== TCP_LISTEN
)
795 /* We don't check in the destentry if pmtu discovery is forbidden
796 * on this route. We just assume that no packet-too-big packets
797 * are sent back when pmtu discovery is not active.
798 * There is a small race when the user changes this flag in the
799 * route, but I think that's acceptable.
801 if (sk
->ip_pmtudisc
!= IP_PMTUDISC_DONT
&& sk
->dst_cache
) {
802 new_mtu
= sk
->dst_cache
->pmtu
-
803 (ip
->ihl
<<2) - tp
->tcp_header_len
;
804 if (new_mtu
< sk
->mss
&& new_mtu
> 0) {
806 /* Resend the TCP packet because it's
807 * clear that the old packet has been
808 * dropped. This is the new "fast" path mtu
811 if (!sk
->sock_readers
)
812 tcp_simple_retransmit(sk
);
818 * This routine is called by the ICMP module when it gets some
819 * sort of error condition. If err < 0 then the socket should
820 * be closed and the error returned to the user. If err > 0
821 * it's just the icmp type << 8 | icmp code. After adjustment
822 * header points to the first 8 bytes of the tcp header. We need
823 * to find the appropriate port.
826 void tcp_v4_err(struct sk_buff
*skb
, unsigned char *dp
, int len
)
828 struct iphdr
*iph
= (struct iphdr
*)dp
;
831 int type
= skb
->h
.icmph
->type
;
832 int code
= skb
->h
.icmph
->code
;
839 if (len
< (iph
->ihl
<< 2) + ICMP_MIN_LENGTH
) {
840 icmp_statistics
.IcmpInErrors
++;
844 th
= (struct tcphdr
*)(dp
+(iph
->ihl
<<2));
846 sk
= tcp_v4_lookup(iph
->daddr
, th
->dest
, iph
->saddr
, th
->source
, skb
->dev
->ifindex
);
848 icmp_statistics
.IcmpInErrors
++;
852 tp
= &sk
->tp_pinfo
.af_tcp
;
854 seq
= ntohl(th
->seq
);
855 if (sk
->state
!= TCP_LISTEN
&&
856 !between(seq
, tp
->snd_una
, max(tp
->snd_una
+32768,tp
->snd_nxt
))) {
858 printk(KERN_DEBUG
"icmp packet outside the tcp window:"
860 (int)sk
->state
, seq
, tp
->snd_una
, tp
->snd_nxt
);
866 case ICMP_SOURCE_QUENCH
:
867 tp
->snd_ssthresh
= max(tp
->snd_cwnd
>> 1, 2);
868 tp
->snd_cwnd
= tp
->snd_ssthresh
;
869 tp
->high_seq
= tp
->snd_nxt
;
871 case ICMP_PARAMETERPROB
:
873 sk
->error_report(sk
);
875 case ICMP_DEST_UNREACH
:
876 if (code
== ICMP_FRAG_NEEDED
) { /* PMTU discovery (RFC1191) */
877 do_pmtu_discovery(sk
, iph
);
883 /* If we've already connected we will keep trying
884 * until we time out, or the user gives up.
886 if (code
> NR_ICMP_UNREACH
)
891 struct open_request
*req
, *prev
;
893 /* Prevent race conditions with accept() -
894 * ICMP is unreliable.
896 if (sk
->sock_readers
) {
897 /* XXX: add a counter here to profile this.
898 * If too many ICMPs get dropped on busy
899 * servers this needs to be solved differently.
904 if (!th
->syn
&& !th
->ack
)
906 req
= tcp_v4_search_req(tp
, iph
, th
, &prev
);
910 if (seq
!= req
->snt_isn
) {
912 printk(KERN_DEBUG
"icmp packet for openreq "
913 "with wrong seq number:%d:%d\n",
918 if (req
->sk
) { /* not yet accept()ed */
919 sk
= req
->sk
; /* report error in accept */
921 tcp_synq_unlink(tp
, req
, prev
);
922 req
->class->destructor(req
);
923 tcp_openreq_free(req
);
932 if(icmp_err_convert
[code
].fatal
|| opening
) {
933 sk
->err
= icmp_err_convert
[code
].errno
;
935 tcp_statistics
.TcpAttemptFails
++;
936 if (sk
->state
!= TCP_LISTEN
)
937 tcp_set_state(sk
,TCP_CLOSE
);
938 sk
->error_report(sk
); /* Wake people up to see the error (see connect in sock.c) */
940 } else /* Only an error on timeout */
941 sk
->err_soft
= icmp_err_convert
[code
].errno
;
944 /* This routine computes an IPv4 TCP checksum. */
945 void tcp_v4_send_check(struct sock
*sk
, struct tcphdr
*th
, int len
,
949 th
->check
= tcp_v4_check(th
, len
, sk
->saddr
, sk
->daddr
,
950 csum_partial((char *)th
, th
->doff
<<2, skb
->csum
));
954 * This routine will send an RST to the other tcp.
956 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
958 * Answer: if a packet caused RST, it is not for a socket
959 * existing in our system, if it is matched to a socket,
960 * it is just duplicate segment or bug in other side's TCP.
961 * So that we build reply only basing on parameters
962 * arrived with segment.
963 * Exception: precedence violation. We do not implement it in any case.
966 static void tcp_v4_send_reset(struct sk_buff
*skb
)
968 struct tcphdr
*th
= skb
->h
.th
;
969 struct sk_buff
*skb1
;
975 skb1
= ip_reply(skb
, sizeof(struct tcphdr
));
979 skb1
->h
.th
= th1
= (struct tcphdr
*)skb_put(skb1
, sizeof(struct tcphdr
));
980 memset(th1
, 0, sizeof(*th1
));
982 /* Swap the send and the receive. */
983 th1
->dest
= th
->source
;
984 th1
->source
= th
->dest
;
985 th1
->doff
= sizeof(*th1
)/4;
989 th1
->seq
= th
->ack_seq
;
993 th1
->ack_seq
= th
->seq
;
995 th1
->ack_seq
= htonl(ntohl(th
->seq
)+1);
998 skb1
->csum
= csum_partial((u8
*) th1
, sizeof(*th1
), 0);
999 th1
->check
= tcp_v4_check(th1
, sizeof(*th1
), skb1
->nh
.iph
->saddr
,
1000 skb1
->nh
.iph
->daddr
, skb1
->csum
);
1001 /* FIXME: should this carry an options packet? */
1002 ip_queue_xmit(skb1
);
1003 tcp_statistics
.TcpOutSegs
++;
1004 tcp_statistics
.TcpOutRsts
++;
1007 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1009 * Check whether a received TCP packet might be for one of our
1013 int tcp_chkaddr(struct sk_buff
*skb
)
1015 struct iphdr
*iph
= skb
->nh
.iph
;
1016 struct tcphdr
*th
= (struct tcphdr
*)(skb
->nh
.raw
+ iph
->ihl
*4);
1019 sk
= tcp_v4_lookup(iph
->saddr
, th
->source
, iph
->daddr
, th
->dest
, skb
->dev
->ifindex
);
1024 /* 0 means accept all LOCAL addresses here, not all the world... */
1026 if (sk
->rcv_saddr
== 0)
1033 static void tcp_v4_send_synack(struct sock
*sk
, struct open_request
*req
)
1035 struct sk_buff
* skb
;
1040 skb
= sock_wmalloc(sk
, MAX_SYN_SIZE
, 1, GFP_ATOMIC
);
1044 if(ip_build_pkt(skb
, sk
, req
->af
.v4_req
.loc_addr
,
1045 req
->af
.v4_req
.rmt_addr
, req
->af
.v4_req
.opt
) < 0) {
1046 kfree_skb(skb
, FREE_WRITE
);
1050 mss
= (skb
->dst
->pmtu
- sizeof(struct iphdr
) - sizeof(struct tcphdr
));
1052 mss
= min(mss
, sk
->user_mss
);
1053 skb
->h
.th
= th
= (struct tcphdr
*) skb_put(skb
, sizeof(struct tcphdr
));
1055 /* Don't offer more than they did.
1056 * This way we don't have to memorize who said what.
1057 * FIXME: maybe this should be changed for better performance
1060 req
->mss
= min(mss
, req
->mss
);
1063 printk(KERN_DEBUG
"initial req->mss below 1\n");
1067 /* Yuck, make this header setup more efficient... -DaveM */
1068 memset(th
, 0, sizeof(struct tcphdr
));
1071 th
->source
= sk
->dummy_th
.source
;
1072 th
->dest
= req
->rmt_port
;
1073 skb
->seq
= req
->snt_isn
;
1074 skb
->end_seq
= skb
->seq
+ 1;
1075 th
->seq
= htonl(skb
->seq
);
1076 th
->ack_seq
= htonl(req
->rcv_isn
+ 1);
1077 if (req
->rcv_wnd
== 0) { /* ignored for retransmitted syns */
1079 /* Set this up on the first call only */
1080 req
->window_clamp
= skb
->dst
->window
;
1081 tcp_select_initial_window(sock_rspace(sk
)/2,req
->mss
,
1086 req
->rcv_wscale
= rcv_wscale
;
1088 th
->window
= htons(req
->rcv_wnd
);
1090 /* XXX Partial csum of 4 byte quantity is itself! -DaveM
1091 * Yes, but it's a bit harder to special case now. It's
1092 * now computed inside the tcp_v4_send_check() to clean up
1093 * updating the options fields in the mainline send code.
1094 * If someone thinks this is really bad let me know and
1095 * I'll try to do it a different way. -- erics
1098 tmp
= tcp_syn_build_options(skb
, req
->mss
, req
->sack_ok
, req
->tstamp_ok
,
1099 req
->wscale_ok
,req
->rcv_wscale
);
1101 th
->doff
= (sizeof(*th
) + tmp
)>>2;
1102 th
->check
= tcp_v4_check(th
, sizeof(*th
) + tmp
,
1103 req
->af
.v4_req
.loc_addr
, req
->af
.v4_req
.rmt_addr
,
1104 csum_partial((char *)th
, sizeof(*th
)+tmp
, skb
->csum
));
1107 tcp_statistics
.TcpOutSegs
++;
1110 static void tcp_v4_or_free(struct open_request
*req
)
1112 if(!req
->sk
&& req
->af
.v4_req
.opt
)
1113 kfree_s(req
->af
.v4_req
.opt
,
1114 sizeof(struct ip_options
) + req
->af
.v4_req
.opt
->optlen
);
1117 static inline void syn_flood_warning(struct sk_buff
*skb
)
1119 static unsigned long warntime
;
1121 if (jiffies
- warntime
> HZ
*60) {
1124 "possible SYN flooding on port %d. Sending cookies.\n",
1125 ntohs(skb
->h
.th
->dest
));
1129 int sysctl_max_syn_backlog
= 1024;
1130 int sysctl_tcp_syn_taildrop
= 1;
1132 struct or_calltable or_ipv4
= {
1139 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1140 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
1142 #define BACKLOG(sk) ((sk)->ack_backlog)
1143 #define BACKLOGMAX(sk) ((sk)->max_ack_backlog)
1146 int tcp_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
, void *ptr
,
1149 struct ip_options
*opt
= (struct ip_options
*) ptr
;
1151 struct open_request
*req
;
1152 struct tcphdr
*th
= skb
->h
.th
;
1153 __u32 saddr
= skb
->nh
.iph
->saddr
;
1154 __u32 daddr
= skb
->nh
.iph
->daddr
;
1155 #ifdef CONFIG_SYN_COOKIES
1156 int want_cookie
= 0;
1158 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1161 /* If the socket is dead, don't accept the connection. */
1165 /* XXX: Check against a global syn pool counter. */
1166 if (BACKLOG(sk
) > BACKLOGMAX(sk
)) {
1167 #ifdef CONFIG_SYN_COOKIES
1168 if (sysctl_tcp_syncookies
) {
1169 syn_flood_warning(skb
);
1173 if (sysctl_tcp_syn_taildrop
) {
1174 struct open_request
*req
;
1176 req
= tcp_synq_unlink_tail(&sk
->tp_pinfo
.af_tcp
);
1177 tcp_openreq_free(req
);
1178 tcp_statistics
.TcpAttemptFails
++;
1184 isn
= tcp_v4_init_sequence(sk
, skb
);
1188 req
= tcp_openreq_alloc();
1190 if (!want_cookie
) BACKLOG(sk
)--;
1194 req
->rcv_wnd
= 0; /* So that tcp_send_synack() knows! */
1196 req
->rcv_isn
= skb
->seq
;
1197 tp
.tstamp_ok
= tp
.sack_ok
= tp
.wscale_ok
= tp
.snd_wscale
= 0;
1199 tcp_parse_options(th
,&tp
,want_cookie
);
1201 req
->ts_recent
= tp
.rcv_tsval
;
1202 req
->mss
= tp
.in_mss
;
1203 req
->tstamp_ok
= tp
.tstamp_ok
;
1204 req
->sack_ok
= tp
.sack_ok
;
1205 req
->snd_wscale
= tp
.snd_wscale
;
1206 req
->wscale_ok
= tp
.wscale_ok
;
1207 req
->rmt_port
= th
->source
;
1208 req
->af
.v4_req
.loc_addr
= daddr
;
1209 req
->af
.v4_req
.rmt_addr
= saddr
;
1211 /* Note that we ignore the isn passed from the TIME_WAIT
1212 * state here. That's the price we pay for cookies.
1215 isn
= cookie_v4_init_sequence(sk
, skb
, &req
->mss
);
1220 req
->af
.v4_req
.opt
= NULL
;
1222 if (opt
&& opt
->optlen
) {
1223 int opt_size
= sizeof(struct ip_options
) + opt
->optlen
;
1225 req
->af
.v4_req
.opt
= kmalloc(opt_size
, GFP_ATOMIC
);
1226 if (req
->af
.v4_req
.opt
) {
1227 if (ip_options_echo(req
->af
.v4_req
.opt
, skb
)) {
1228 kfree_s(req
->af
.v4_req
.opt
, opt_size
);
1229 req
->af
.v4_req
.opt
= NULL
;
1233 req
->class = &or_ipv4
;
1237 tcp_v4_send_synack(sk
, req
);
1240 if (req
->af
.v4_req
.opt
)
1241 kfree(req
->af
.v4_req
.opt
);
1242 tcp_openreq_free(req
);
1244 req
->expires
= jiffies
+ TCP_TIMEOUT_INIT
;
1245 tcp_inc_slow_timer(TCP_SLT_SYNACK
);
1246 tcp_synq_queue(&sk
->tp_pinfo
.af_tcp
, req
);
1249 sk
->data_ready(sk
, 0);
1254 SOCK_DEBUG(sk
, "Reset on %p: Connect on dead socket.\n",sk
);
1255 tcp_statistics
.TcpAttemptFails
++;
1258 tcp_statistics
.TcpAttemptFails
++;
1262 struct sock
* tcp_v4_syn_recv_sock(struct sock
*sk
, struct sk_buff
*skb
,
1263 struct open_request
*req
,
1264 struct dst_entry
*dst
)
1266 struct tcp_opt
*newtp
;
1271 if (sk
->ack_backlog
> sk
->max_ack_backlog
)
1272 goto exit
; /* head drop */
1274 newsk
= sk_alloc(AF_INET
, GFP_ATOMIC
);
1280 memcpy(newsk
, sk
, sizeof(*newsk
));
1282 /* Or else we die! -DaveM */
1283 newsk
->sklist_next
= NULL
;
1285 newsk
->opt
= req
->af
.v4_req
.opt
;
1287 skb_queue_head_init(&newsk
->write_queue
);
1288 skb_queue_head_init(&newsk
->receive_queue
);
1289 skb_queue_head_init(&newsk
->out_of_order_queue
);
1290 skb_queue_head_init(&newsk
->error_queue
);
1293 newtp
= &(newsk
->tp_pinfo
.af_tcp
);
1294 newtp
->send_head
= NULL
;
1295 newtp
->retrans_head
= NULL
;
1299 skb_queue_head_init(&newsk
->back_log
);
1301 newsk
->prot
->init(newsk
);
1303 newtp
->snd_cwnd_cnt
= 0;
1308 atomic_set(&newsk
->wmem_alloc
, 0);
1309 atomic_set(&newsk
->rmem_alloc
, 0);
1310 newsk
->localroute
= sk
->localroute
;
1313 newsk
->shutdown
= 0;
1314 newsk
->ack_backlog
= 0;
1316 newtp
->fin_seq
= req
->rcv_isn
;
1317 newsk
->syn_seq
= req
->rcv_isn
;
1318 newsk
->state
= TCP_SYN_RECV
;
1321 newsk
->write_seq
= req
->snt_isn
;
1323 newtp
->snd_wnd
= ntohs(skb
->h
.th
->window
);
1324 newtp
->max_window
= newtp
->snd_wnd
;
1325 newtp
->snd_wl1
= req
->rcv_isn
;
1326 newtp
->snd_wl2
= newsk
->write_seq
;
1327 newtp
->snd_una
= newsk
->write_seq
++;
1328 newtp
->snd_nxt
= newsk
->write_seq
;
1330 newsk
->urg_data
= 0;
1331 newtp
->packets_out
= 0;
1332 newtp
->retransmits
= 0;
1335 init_timer(&newsk
->timer
);
1336 newsk
->timer
.data
= (unsigned long) newsk
;
1337 newsk
->timer
.function
= &net_timer
;
1339 tcp_init_xmit_timers(newsk
);
1341 newsk
->dummy_th
.source
= sk
->dummy_th
.source
;
1342 newsk
->dummy_th
.dest
= req
->rmt_port
;
1343 newsk
->sock_readers
=0;
1345 newtp
->last_ack_sent
= newtp
->rcv_nxt
= req
->rcv_isn
+ 1;
1346 newtp
->rcv_wup
= req
->rcv_isn
+ 1;
1347 newsk
->copied_seq
= req
->rcv_isn
+ 1;
1349 newsk
->socket
= NULL
;
1351 newsk
->daddr
= req
->af
.v4_req
.rmt_addr
;
1352 newsk
->saddr
= req
->af
.v4_req
.loc_addr
;
1353 newsk
->rcv_saddr
= req
->af
.v4_req
.loc_addr
;
1355 /* options / mss / route_cache */
1359 if (ip_route_output(&rt
,
1360 newsk
->opt
&& newsk
->opt
->srr
?
1361 newsk
->opt
->faddr
: newsk
->daddr
,
1362 newsk
->saddr
, newsk
->ip_tos
, 0)) {
1368 newsk
->dst_cache
= dst
;
1370 snd_mss
= dst
->pmtu
;
1372 /* FIXME: is mtu really the same as snd_mss? */
1373 newsk
->mtu
= snd_mss
;
1374 /* FIXME: where does mtu get used after this? */
1376 if (newsk
->mtu
< 64)
1379 newtp
->sack_ok
= req
->sack_ok
;
1380 newtp
->tstamp_ok
= req
->tstamp_ok
;
1381 newtp
->window_clamp
= req
->window_clamp
;
1382 newtp
->rcv_wnd
= req
->rcv_wnd
;
1383 newtp
->wscale_ok
= req
->wscale_ok
;
1384 if (newtp
->wscale_ok
) {
1385 newtp
->snd_wscale
= req
->snd_wscale
;
1386 newtp
->rcv_wscale
= req
->rcv_wscale
;
1388 newtp
->snd_wscale
= newtp
->rcv_wscale
= 0;
1389 newtp
->window_clamp
= min(newtp
->window_clamp
,65535);
1391 if (newtp
->tstamp_ok
) {
1392 newtp
->ts_recent
= req
->ts_recent
;
1393 newtp
->ts_recent_stamp
= jiffies
;
1394 newtp
->tcp_header_len
= sizeof(struct tcphdr
) + 12; /* FIXME: define constant! */
1395 newsk
->dummy_th
.doff
+= 3;
1397 newtp
->tcp_header_len
= sizeof(struct tcphdr
);
1400 snd_mss
-= sizeof(struct iphdr
) + sizeof(struct tcphdr
);
1402 snd_mss
= min(snd_mss
, sk
->user_mss
);
1404 /* Make sure our mtu is adjusted for headers. */
1405 newsk
->mss
= min(req
->mss
, snd_mss
) + sizeof(struct tcphdr
) - newtp
->tcp_header_len
;
1408 add_to_prot_sklist(newsk
);
/*
 * tcp_v4_rst_req - handle a RST segment that matches a pending
 * connection request (an embryonic SYN_RECV "socket") on a listener:
 * the matching open_request is unlinked from the SYN queue and freed.
 * NOTE(review): this copy of the file has lines elided; the original
 * presumably bails out when no matching request is found — confirm
 * against the full source before relying on req being non-NULL here.
 */
1417 static void tcp_v4_rst_req(struct sock
*sk
, struct sk_buff
*skb
)
1419 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
1420 struct open_request
*req
, *prev
;
/* Look the segment up in the listener's SYN queue by its IP/TCP
 * addresses and ports; prev is needed for the unlink below. */
1422 req
= tcp_v4_search_req(tp
,skb
->nh
.iph
, skb
->h
.th
, &prev
);
1425 /* Sequence number check required by RFC 793: only honour the RST
 * if its sequence number lies within [snt_isn, snt_isn + 1]. */
1426 if (before(skb
->seq
, req
->snt_isn
) || after(skb
->seq
, req
->snt_isn
+1))
/* Tear down the half-open connection: unlink from the SYN queue,
 * run the per-class destructor, then free the request itself. */
1428 tcp_synq_unlink(tp
, req
, prev
);
1429 req
->class->destructor(req
);
1430 tcp_openreq_free(req
);
1433 /* Check for embryonic sockets (open_requests).  We check packets with
1434 * only the SYN bit set against the open_request queue too: this
1435 * increases connection latency a bit, but is required to detect
1436 * retransmitted SYNs.
/*
 * tcp_v4_hnd_req - demultiplex a segment arriving on a LISTEN socket
 * against the queue of pending connection requests (open_requests),
 * returning the socket that should process it (possibly a newly
 * created child socket from tcp_check_req / cookie_v4_check).
 * NOTE(review): several lines are elided in this copy, including the
 * exact return paths — confirm against the full source.
 */
1438 static inline struct sock
*tcp_v4_hnd_req(struct sock
*sk
,struct sk_buff
*skb
)
1440 struct tcphdr
*th
= skb
->h
.th
;
/* Word 3 of the TCP header holds data-offset/flags/window; it is
 * tested below against flag masks in network byte order. */
1441 u32 flg
= ((u32
*)th
)[3];
/* RST bit set: may be aimed at an embryonic connection. */
1444 if (flg
& __constant_htonl(0x00040000)) {
1445 tcp_v4_rst_req(sk
, skb
);
1449 /* Check for SYN|ACK: mask matches if SYN and/or ACK is set. */
1450 if (flg
& __constant_htonl(0x00120000)) {
1451 struct open_request
*req
, *dummy
;
1452 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1454 /* Find possible connection requests. */
1455 req
= tcp_v4_search_req(tp
, skb
->nh
.iph
, th
, &dummy
);
/* May complete the 3-way handshake and return the child socket. */
1457 sk
= tcp_check_req(sk
, skb
, req
);
1459 #ifdef CONFIG_SYN_COOKIES
/* No queued request matched: try to validate a SYN-cookie ACK. */
1461 sk
= cookie_v4_check(sk
, skb
, &(IPCB(skb
)->opt
));
/*
 * tcp_v4_do_rcv - main receive dispatch for an IPv4 TCP socket.
 * Runs the established-state fast path, listener demultiplexing via
 * tcp_v4_hnd_req(), or the generic state machine; on failure paths it
 * sends a RST and frees the skb.  NOTE(review): several lines are
 * elided in this copy (including the reset/discard labels).
 */
1468 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1470 #ifdef CONFIG_FILTER
/* Apply the attached socket filter; toss the packet if it rejects. */
1473 if (sk_filter(skb
, sk
->filter_data
, sk
->filter
))
1474 return -EPERM
; /* Toss packet */
1476 #endif /* CONFIG_FILTER */
/* Charge the buffer to this socket's receive memory accounting. */
1478 skb_set_owner_r(skb
, sk
);
1481 * socket locking is here for SMP purposes as backlog rcv
1482 * is currently called with bh processing disabled.
/* Fast path: fully established connections bypass the state machine. */
1486 if (sk
->state
== TCP_ESTABLISHED
) { /* Fast path */
1487 if (tcp_rcv_established(sk
, skb
, skb
->h
.th
, skb
->len
))
/* Listening socket: match against pending open_requests first. */
1494 if (sk
->state
== TCP_LISTEN
) {
1497 nsk
= tcp_v4_hnd_req(sk
, skb
);
/* Slow path: run the full TCP state machine over the segment. */
1505 if (tcp_rcv_state_process(sk
, skb
, skb
->h
.th
,
1506 &(IPCB(skb
)->opt
), skb
->len
))
/* Non-zero return above means the segment must be answered with RST. */
1512 tcp_v4_send_reset(skb
);
1514 kfree_skb(skb
, FREE_READ
);
1515 /* Be careful here. If this function gets more complicated and
1516 * gcc suffers from register pressure on the x86, sk (in %ebx)
1517 * might be destroyed here. This current version compiles correctly,
1518 * but you have been warned.
/*
 * tcp_v4_rcv - entry point for IPv4 TCP segments handed up by the IP
 * layer.  Verifies the checksum, looks up the owning socket, primes
 * skb->seq/end_seq/ack_seq, and either processes the segment directly
 * via tcp_v4_do_rcv() or queues it on the socket backlog.
 * NOTE(review): switch-case labels, gotos, and error paths are elided
 * in this copy — confirm control flow against the full source.
 */
1528 int tcp_v4_rcv(struct sk_buff
*skb
, unsigned short len
)
/* Only segments addressed to this host are processed. */
1533 if (skb
->pkt_type
!=PACKET_HOST
)
1538 /* Pull up the IP header. */
1539 __skb_pull(skb
, skb
->h
.raw
- skb
->data
);
1541 /* Count it even if it's bad. */
1542 tcp_statistics
.TcpInSegs
++;
1544 /* Try to use the device checksum if provided. */
1545 switch (skb
->ip_summed
) {
/* No hardware checksum: compute the partial sum in software. */
1547 skb
->csum
= csum_partial((char *)th
, len
, 0);
/* Fold the pseudo-header in; a non-zero result means corruption. */
1549 if (tcp_v4_check(th
,len
,skb
->nh
.iph
->saddr
,skb
->nh
.iph
->daddr
,skb
->csum
)) {
1550 printk(KERN_DEBUG
"TCPv4 bad checksum from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, len=%d/%d/%d\n",
1551 NIPQUAD(skb
->nh
.iph
->saddr
), ntohs(th
->source
), NIPQUAD(skb
->nh
.iph
->daddr
),
1552 ntohs(th
->dest
), len
, skb
->len
, ntohs(skb
->nh
.iph
->tot_len
));
1553 tcp_statistics
.TcpInErrs
++;
1557 /* CHECKSUM_UNNECESSARY */
1560 #ifdef CONFIG_IP_TRANSPARENT_PROXY
/* Transparent proxy: redirected port requires the proxy lookup. */
1561 if (IPCB(skb
)->redirport
)
1562 sk
= tcp_v4_proxy_lookup(th
->dest
, skb
->nh
.iph
->saddr
, th
->source
,
1563 skb
->nh
.iph
->daddr
, skb
->dev
,
1564 IPCB(skb
)->redirport
, skb
->dev
->ifindex
);
/* Normal case: hash lookup on the (saddr, sport, daddr, dport) tuple. */
1567 sk
= __tcp_v4_lookup(th
, skb
->nh
.iph
->saddr
, th
->source
,
1568 skb
->nh
.iph
->daddr
, th
->dest
, skb
->dev
->ifindex
);
1571 if(!ipsec_sk_policy(sk
,skb
))
/* Pre-compute host-order sequence numbers; SYN and FIN each occupy
 * one unit of sequence space, hence the +syn +fin terms. */
1574 skb
->seq
= ntohl(th
->seq
);
1575 skb
->end_seq
= skb
->seq
+ th
->syn
+ th
->fin
+ len
- th
->doff
*4;
1576 skb
->ack_seq
= ntohl(th
->ack_seq
);
/* Socket not in use by a reader: process the segment right away. */
1580 if (!sk
->sock_readers
)
1581 return tcp_v4_do_rcv(sk
, skb
);
/* Otherwise defer it to the socket backlog queue. */
1583 __skb_queue_tail(&sk
->back_log
, skb
);
/* No socket found for the segment: answer with a RST. */
1587 tcp_v4_send_reset(skb
);
1590 /* Discard frame. */
1591 kfree_skb(skb
, FREE_READ
);
/*
 * Build the IP header for an outgoing TCP segment by delegating to the
 * generic IPv4 header builder.  Propagates ip_build_header()'s return
 * value unchanged (NOTE(review): exact success/error contract is
 * defined by ip_build_header — confirm there).
 */
int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb)
{
	return ip_build_header(skb, sk);
}
/*
 * tcp_v4_rebuild_header - refresh the route (and possibly the source
 * address) attached to an outgoing skb.  When ip_dynaddr is enabled
 * and the connection is still in SYN_SENT, the source address may be
 * rewritten in both the socket and the queued IP header to follow a
 * dynamic-address change.  NOTE(review): many lines (declarations,
 * error paths, closing braces) are elided in this copy.
 */
1600 int tcp_v4_rebuild_header(struct sock
*sk
, struct sk_buff
*skb
)
/* Source-address rewrite is only attempted while connecting. */
1606 int want_rewrite
= sysctl_ip_dynaddr
&& sk
->state
== TCP_SYN_SENT
;
1610 rt
= (struct rtable
*)skb
->dst
;
1612 /* Force route checking if want_rewrite. */
1615 __u32 old_saddr
= rt
->rt_src
;
1617 /* Query a fresh route to the same destination. */
1618 tmp
= ip_route_connect(&rt
, rt
->rt_dst
, 0,
1619 RT_TOS(sk
->ip_tos
)|(sk
->localroute
||0),
1622 /* Only useful if different source addrs. */
1623 if (tmp
== 0 || rt
->rt_src
!= old_saddr
) {
/* Swap the skb over to the freshly obtained route. */
1624 dst_release(skb
->dst
);
1625 skb
->dst
= &rt
->u
.dst
;
/* Same source address: drop the redundant new route reference. */
1628 dst_release(&rt
->u
.dst
);
/* Cached route has gone stale: re-resolve it with the same key. */
1631 if (rt
->u
.dst
.obsolete
) {
1633 err
= ip_route_output(&rt
, rt
->rt_dst
, rt
->rt_src
, rt
->key
.tos
, rt
->key
.oif
);
/* Routing failed: report the error back to the owning socket. */
1636 sk
->error_report(skb
->sk
);
1639 dst_release(skb
->dst
);
1640 skb
->dst
= &rt
->u
.dst
;
1643 /* Discard the surplus MAC header. */
1644 skb_pull(skb
, skb
->nh
.raw
-skb
->data
);
1648 size
= skb
->tail
- skb
->h
.raw
;
1651 __u32 new_saddr
= rt
->rt_src
;
1654 * Ouch! This should not happen.
1656 if (!sk
->saddr
|| !sk
->rcv_saddr
) {
1657 printk(KERN_WARNING
"tcp_v4_rebuild_header(): not valid sock addrs: saddr=%08lX rcv_saddr=%08lX\n",
1659 ntohl(sk
->rcv_saddr
));
1664 * Maybe we are in an skb chain loop and the socket address has
1665 * not yet been 'damaged'.
/* Propagate the new source address into the socket itself. */
1668 if (new_saddr
!= sk
->saddr
) {
1669 if (sysctl_ip_dynaddr
> 1) {
1670 printk(KERN_INFO
"tcp_v4_rebuild_header(): shifting sk->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1672 NIPQUAD(new_saddr
));
1675 sk
->saddr
= new_saddr
;
1676 sk
->rcv_saddr
= new_saddr
;
1677 /* sk->prot->rehash(sk); */
/* And patch the already-built IP header on this skb. */
1681 if (new_saddr
!= iph
->saddr
) {
1682 if (sysctl_ip_dynaddr
> 1) {
1683 printk(KERN_INFO
"tcp_v4_rebuild_header(): shifting iph->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1684 NIPQUAD(iph
->saddr
),
1685 NIPQUAD(new_saddr
));
1688 iph
->saddr
= new_saddr
;
1697 static struct sock
* tcp_v4_get_sock(struct sk_buff
*skb
, struct tcphdr
*th
)
1699 return tcp_v4_lookup(skb
->nh
.iph
->saddr
, th
->source
,
1700 skb
->nh
.iph
->daddr
, th
->dest
, skb
->dev
->ifindex
);
1703 static void v4_addr2sockaddr(struct sock
*sk
, struct sockaddr
* uaddr
)
1705 struct sockaddr_in
*sin
= (struct sockaddr_in
*) uaddr
;
1707 sin
->sin_family
= AF_INET
;
1708 sin
->sin_addr
.s_addr
= sk
->daddr
;
1709 sin
->sin_port
= sk
->dummy_th
.dest
;
/*
 * AF-independent operations table (struct tcp_func) for IPv4 TCP,
 * installed on each socket by tcp_v4_init_sock().
 * NOTE(review): several initializer entries and the closing brace are
 * elided in this copy — confirm slot order against struct tcp_func.
 */
1712 struct tcp_func ipv4_specific
= {
1713 tcp_v4_build_header
,
1716 tcp_v4_rebuild_header
,
1717 tcp_v4_conn_request
,
1718 tcp_v4_syn_recv_sock
,
/* sockaddr size reported for this address family */
1723 sizeof(struct sockaddr_in
)
/*
 * tcp_v4_init_sock - per-socket initialisation for IPv4 TCP: queues,
 * timers, initial RTO/congestion state, and the af_specific operations
 * table.  NOTE(review): many initialisations are elided in this copy.
 */
1726 static int tcp_v4_init_sock(struct sock
*sk
)
1728 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1730 skb_queue_head_init(&sk
->out_of_order_queue
);
1731 tcp_init_xmit_timers(sk
);
/* Conservative initial retransmission timeout and deviation. */
1734 tp
->rto
= TCP_TIMEOUT_INIT
; /*TCP_WRITE_TIME*/
1735 tp
->mdev
= TCP_TIMEOUT_INIT
;
/* Initial inter-arrival time estimate, stored <<3 (fixed point). */
1738 tp
->iat
= (HZ
/5) << 3;
1740 /* FIXME: tie this to sk->rcvbuf? (May be unnecessary) */
1741 /* tp->rcv_wnd = 8192; */
1749 tp
->syn_backlog
= 0;
1752 * See draft-stevens-tcpca-spec-01 for discussion of the
1753 * initialization of these values.
1756 tp
->snd_ssthresh
= 0x7fffffff; /* Infinity */
1759 sk
->state
= TCP_CLOSE
;
1761 sk
->max_ack_backlog
= SOMAXCONN
;
1766 /* Speed up by setting some standard state for the dummy_th. */
/* Header length in 32-bit words: a bare TCP header to start with. */
1768 sk
->dummy_th
.doff
=sizeof(struct tcphdr
)>>2;
1770 /* Init SYN queue. */
/* Hook up the IPv4-specific operations table. */
1773 sk
->tp_pinfo
.af_tcp
.af_specific
= &ipv4_specific
;
/*
 * tcp_v4_destroy_sock - tear down per-socket TCP state: stop the
 * transmit timers and drain the write and out-of-order queues,
 * freeing every pending skb.  NOTE(review): some lines (including the
 * return) are elided in this copy.
 */
1778 static int tcp_v4_destroy_sock(struct sock
*sk
)
1780 struct sk_buff
*skb
;
1782 tcp_clear_xmit_timers(sk
);
/* Drop this socket's share of the global keepalive slow timer. */
1785 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE
);
1787 /* Clean up the write buffer. */
1788 while((skb
= skb_dequeue(&sk
->write_queue
)) != NULL
)
1789 kfree_skb(skb
, FREE_WRITE
);
1791 /* Cleans up our, hopefully empty, out_of_order_queue. */
1792 while((skb
= skb_dequeue(&sk
->out_of_order_queue
)) != NULL
)
1793 kfree_skb(skb
, FREE_READ
);
1798 struct proto tcp_prot
= {
1799 (struct sock
*)&tcp_prot
, /* sklist_next */
1800 (struct sock
*)&tcp_prot
, /* sklist_prev */
1801 tcp_close
, /* close */
1802 tcp_v4_connect
, /* connect */
1803 tcp_accept
, /* accept */
1804 NULL
, /* retransmit */
1805 tcp_write_wakeup
, /* write_wakeup */
1806 tcp_read_wakeup
, /* read_wakeup */
1807 tcp_poll
, /* poll */
1808 tcp_ioctl
, /* ioctl */
1809 tcp_v4_init_sock
, /* init */
1810 tcp_v4_destroy_sock
, /* destroy */
1811 tcp_shutdown
, /* shutdown */
1812 tcp_setsockopt
, /* setsockopt */
1813 tcp_getsockopt
, /* getsockopt */
1814 tcp_v4_sendmsg
, /* sendmsg */
1815 tcp_recvmsg
, /* recvmsg */
1817 tcp_v4_do_rcv
, /* backlog_rcv */
1818 tcp_v4_hash
, /* hash */
1819 tcp_v4_unhash
, /* unhash */
1820 tcp_v4_rehash
, /* rehash */
1821 tcp_good_socknum
, /* good_socknum */
1822 tcp_v4_verify_bind
, /* verify_bind */
1823 128, /* max_header */
1824 0, /* retransmits */
1827 0 /* highestinuse */