/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.222 2000/12/08 17:15:53 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 */
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/cache.h>
55 #include <linux/init.h>
60 #include <net/inet_common.h>
62 #include <linux/inet.h>
63 #include <linux/stddef.h>
64 #include <linux/ipsec.h>
66 extern int sysctl_ip_dynaddr
;
68 /* Check TCP sequence numbers in ICMP packets. */
69 #define ICMP_MIN_LENGTH 8
71 /* Socket used for sending RSTs */
72 static struct inode tcp_inode
;
73 static struct socket
*tcp_socket
=&tcp_inode
.u
.socket_i
;
/* Forward declaration: defined below, needed by early callers. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
79 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
81 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo
= {
86 __tcp_listening_hash
: { NULL
, },
87 __tcp_lhash_lock
: RW_LOCK_UNLOCKED
,
88 __tcp_lhash_users
: ATOMIC_INIT(0),
90 __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo
.__tcp_lhash_wait
),
91 __tcp_portalloc_lock
: SPIN_LOCK_UNLOCKED
/* This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * a wider range (e.g. 32768-61000).
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
/* Rover starts one below the range so the first allocation uses low. */
int tcp_port_rover = (1024 - 1);
102 static __inline__
int tcp_hashfn(__u32 laddr
, __u16 lport
,
103 __u32 faddr
, __u16 fport
)
105 int h
= ((laddr
^ lport
) ^ (faddr
^ fport
));
108 return h
& (tcp_ehash_size
- 1);
111 static __inline__
int tcp_sk_hashfn(struct sock
*sk
)
113 __u32 laddr
= sk
->rcv_saddr
;
114 __u16 lport
= sk
->num
;
115 __u32 faddr
= sk
->daddr
;
116 __u16 fport
= sk
->dport
;
118 return tcp_hashfn(laddr
, lport
, faddr
, fport
);
121 /* Allocate and initialize a new TCP local port bind bucket.
122 * The bindhash mutex for snum's hash chain must be held here.
124 struct tcp_bind_bucket
*tcp_bucket_create(struct tcp_bind_hashbucket
*head
,
127 struct tcp_bind_bucket
*tb
;
129 tb
= kmem_cache_alloc(tcp_bucket_cachep
, SLAB_ATOMIC
);
134 if((tb
->next
= head
->chain
) != NULL
)
135 tb
->next
->pprev
= &tb
->next
;
137 tb
->pprev
= &head
->chain
;
142 /* Caller must disable local BH processing. */
143 static __inline__
void __tcp_inherit_port(struct sock
*sk
, struct sock
*child
)
145 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(child
->num
)];
146 struct tcp_bind_bucket
*tb
;
148 spin_lock(&head
->lock
);
149 tb
= (struct tcp_bind_bucket
*)sk
->prev
;
150 if ((child
->bind_next
= tb
->owners
) != NULL
)
151 tb
->owners
->bind_pprev
= &child
->bind_next
;
153 child
->bind_pprev
= &tb
->owners
;
154 child
->prev
= (struct sock
*) tb
;
155 spin_unlock(&head
->lock
);
/* BH-safe wrapper around __tcp_inherit_port(). */
__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}
165 /* Obtain a reference to a local port for the given sock,
166 * if snum is zero it means select any available local port.
168 static int tcp_v4_get_port(struct sock
*sk
, unsigned short snum
)
170 struct tcp_bind_hashbucket
*head
;
171 struct tcp_bind_bucket
*tb
;
176 int low
= sysctl_local_port_range
[0];
177 int high
= sysctl_local_port_range
[1];
178 int remaining
= (high
- low
) + 1;
181 spin_lock(&tcp_portalloc_lock
);
182 rover
= tcp_port_rover
;
184 if ((rover
< low
) || (rover
> high
))
186 head
= &tcp_bhash
[tcp_bhashfn(rover
)];
187 spin_lock(&head
->lock
);
188 for (tb
= head
->chain
; tb
; tb
= tb
->next
)
189 if (tb
->port
== rover
)
193 spin_unlock(&head
->lock
);
194 } while (--remaining
> 0);
195 tcp_port_rover
= rover
;
196 spin_unlock(&tcp_portalloc_lock
);
198 /* Exhausted local port range during search? */
203 /* OK, here is the one we will use. HEAD is
204 * non-NULL and we hold it's mutex.
209 head
= &tcp_bhash
[tcp_bhashfn(snum
)];
210 spin_lock(&head
->lock
);
211 for (tb
= head
->chain
; tb
!= NULL
; tb
= tb
->next
)
212 if (tb
->port
== snum
)
215 if (tb
!= NULL
&& tb
->owners
!= NULL
) {
216 if (tb
->fastreuse
!= 0 && sk
->reuse
!= 0 && sk
->state
!= TCP_LISTEN
) {
219 struct sock
*sk2
= tb
->owners
;
220 int sk_reuse
= sk
->reuse
;
222 for( ; sk2
!= NULL
; sk2
= sk2
->bind_next
) {
224 sk
->bound_dev_if
== sk2
->bound_dev_if
) {
227 sk2
->state
== TCP_LISTEN
) {
228 if (!sk2
->rcv_saddr
||
230 (sk2
->rcv_saddr
== sk
->rcv_saddr
))
235 /* If we found a conflict, fail. */
243 (tb
= tcp_bucket_create(head
, snum
)) == NULL
)
245 if (tb
->owners
== NULL
) {
246 if (sk
->reuse
&& sk
->state
!= TCP_LISTEN
)
250 } else if (tb
->fastreuse
&&
251 ((sk
->reuse
== 0) || (sk
->state
== TCP_LISTEN
)))
255 if (sk
->prev
== NULL
) {
256 if ((sk
->bind_next
= tb
->owners
) != NULL
)
257 tb
->owners
->bind_pprev
= &sk
->bind_next
;
259 sk
->bind_pprev
= &tb
->owners
;
260 sk
->prev
= (struct sock
*) tb
;
262 BUG_TRAP(sk
->prev
== (struct sock
*) tb
);
267 spin_unlock(&head
->lock
);
273 /* Get rid of any references to a local port held by the
276 __inline__
void __tcp_put_port(struct sock
*sk
)
278 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(sk
->num
)];
279 struct tcp_bind_bucket
*tb
;
281 spin_lock(&head
->lock
);
282 tb
= (struct tcp_bind_bucket
*) sk
->prev
;
284 sk
->bind_next
->bind_pprev
= sk
->bind_pprev
;
285 *(sk
->bind_pprev
) = sk
->bind_next
;
288 if (tb
->owners
== NULL
) {
290 tb
->next
->pprev
= tb
->pprev
;
291 *(tb
->pprev
) = tb
->next
;
292 kmem_cache_free(tcp_bucket_cachep
, tb
);
294 spin_unlock(&head
->lock
);
/* BH-safe wrapper around __tcp_put_port(). */
void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}
304 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
305 * Look, when several writers sleep and reader wakes them up, all but one
306 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
307 * this, _but_ remember, it adds useless work on UP machines (wake up each
308 * exclusive lock release). It should be ifdefed really.
311 void tcp_listen_wlock(void)
313 write_lock(&tcp_lhash_lock
);
315 if (atomic_read(&tcp_lhash_users
)) {
316 DECLARE_WAITQUEUE(wait
, current
);
318 add_wait_queue_exclusive(&tcp_lhash_wait
, &wait
);
320 set_current_state(TASK_UNINTERRUPTIBLE
);
321 if (atomic_read(&tcp_lhash_users
) == 0)
323 write_unlock_bh(&tcp_lhash_lock
);
325 write_lock_bh(&tcp_lhash_lock
);
328 __set_current_state(TASK_RUNNING
);
329 remove_wait_queue(&tcp_lhash_wait
, &wait
);
333 static __inline__
void __tcp_v4_hash(struct sock
*sk
)
338 BUG_TRAP(sk
->pprev
==NULL
);
339 if(sk
->state
== TCP_LISTEN
) {
340 skp
= &tcp_listening_hash
[tcp_sk_listen_hashfn(sk
)];
341 lock
= &tcp_lhash_lock
;
344 skp
= &tcp_ehash
[(sk
->hashent
= tcp_sk_hashfn(sk
))].chain
;
345 lock
= &tcp_ehash
[sk
->hashent
].lock
;
348 if((sk
->next
= *skp
) != NULL
)
349 (*skp
)->pprev
= &sk
->next
;
352 sock_prot_inc_use(sk
->prot
);
354 if (sk
->state
== TCP_LISTEN
)
355 wake_up(&tcp_lhash_wait
);
358 static void tcp_v4_hash(struct sock
*sk
)
360 if (sk
->state
!= TCP_CLOSE
) {
367 void tcp_unhash(struct sock
*sk
)
371 if (sk
->state
== TCP_LISTEN
) {
374 lock
= &tcp_lhash_lock
;
376 struct tcp_ehash_bucket
*head
= &tcp_ehash
[sk
->hashent
];
378 write_lock_bh(&head
->lock
);
383 sk
->next
->pprev
= sk
->pprev
;
384 *sk
->pprev
= sk
->next
;
386 sock_prot_dec_use(sk
->prot
);
388 write_unlock_bh(lock
);
389 if (sk
->state
== TCP_LISTEN
)
390 wake_up(&tcp_lhash_wait
);
393 /* Don't inline this cruft. Here are some nice properties to
394 * exploit here. The BSD API does not allow a listening TCP
395 * to specify the remote port nor the remote address for the
396 * connection. So always assume those are both wildcarded
397 * during the search since they can never be otherwise.
399 static struct sock
*__tcp_v4_lookup_listener(struct sock
*sk
, u32 daddr
, unsigned short hnum
, int dif
)
401 struct sock
*result
= NULL
;
405 for(; sk
; sk
= sk
->next
) {
406 if(sk
->num
== hnum
) {
407 __u32 rcv_saddr
= sk
->rcv_saddr
;
411 if (rcv_saddr
!= daddr
)
415 if (sk
->bound_dev_if
) {
416 if (sk
->bound_dev_if
!= dif
)
422 if (score
> hiscore
) {
431 /* Optimize the common listener case. */
432 __inline__
struct sock
*tcp_v4_lookup_listener(u32 daddr
, unsigned short hnum
, int dif
)
436 read_lock(&tcp_lhash_lock
);
437 sk
= tcp_listening_hash
[tcp_lhashfn(hnum
)];
439 if (sk
->num
== hnum
&&
441 (!sk
->rcv_saddr
|| sk
->rcv_saddr
== daddr
) &&
444 sk
= __tcp_v4_lookup_listener(sk
, daddr
, hnum
, dif
);
450 read_unlock(&tcp_lhash_lock
);
454 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
455 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
457 * Local BH must be disabled here.
460 static inline struct sock
*__tcp_v4_lookup_established(u32 saddr
, u16 sport
,
461 u32 daddr
, u16 hnum
, int dif
)
463 struct tcp_ehash_bucket
*head
;
464 TCP_V4_ADDR_COOKIE(acookie
, saddr
, daddr
)
465 __u32 ports
= TCP_COMBINED_PORTS(sport
, hnum
);
469 /* Optimize here for direct hit, only listening connections can
470 * have wildcards anyways.
472 hash
= tcp_hashfn(daddr
, hnum
, saddr
, sport
);
473 head
= &tcp_ehash
[hash
];
474 read_lock(&head
->lock
);
475 for(sk
= head
->chain
; sk
; sk
= sk
->next
) {
476 if(TCP_IPV4_MATCH(sk
, acookie
, saddr
, daddr
, ports
, dif
))
477 goto hit
; /* You sunk my battleship! */
480 /* Must check for a TIME_WAIT'er before going to listener hash. */
481 for(sk
= (head
+ tcp_ehash_size
)->chain
; sk
; sk
= sk
->next
)
482 if(TCP_IPV4_MATCH(sk
, acookie
, saddr
, daddr
, ports
, dif
))
484 read_unlock(&head
->lock
);
490 read_unlock(&head
->lock
);
494 static inline struct sock
*__tcp_v4_lookup(u32 saddr
, u16 sport
,
495 u32 daddr
, u16 hnum
, int dif
)
499 sk
= __tcp_v4_lookup_established(saddr
, sport
, daddr
, hnum
, dif
);
504 return tcp_v4_lookup_listener(daddr
, hnum
, dif
);
507 __inline__
struct sock
*tcp_v4_lookup(u32 saddr
, u16 sport
, u32 daddr
, u16 dport
, int dif
)
512 sk
= __tcp_v4_lookup(saddr
, sport
, daddr
, ntohs(dport
), dif
);
518 static inline __u32
tcp_v4_init_sequence(struct sock
*sk
, struct sk_buff
*skb
)
520 return secure_tcp_sequence_number(skb
->nh
.iph
->daddr
,
526 static int tcp_v4_check_established(struct sock
*sk
)
528 u32 daddr
= sk
->rcv_saddr
;
529 u32 saddr
= sk
->daddr
;
530 int dif
= sk
->bound_dev_if
;
531 TCP_V4_ADDR_COOKIE(acookie
, saddr
, daddr
)
532 __u32 ports
= TCP_COMBINED_PORTS(sk
->dport
, sk
->num
);
533 int hash
= tcp_hashfn(daddr
, sk
->num
, saddr
, sk
->dport
);
534 struct tcp_ehash_bucket
*head
= &tcp_ehash
[hash
];
535 struct sock
*sk2
, **skp
;
536 struct tcp_tw_bucket
*tw
;
538 write_lock_bh(&head
->lock
);
540 /* Check TIME-WAIT sockets first. */
541 for(skp
= &(head
+ tcp_ehash_size
)->chain
; (sk2
=*skp
) != NULL
;
543 tw
= (struct tcp_tw_bucket
*)sk2
;
545 if(TCP_IPV4_MATCH(sk2
, acookie
, saddr
, daddr
, ports
, dif
)) {
546 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
548 /* With PAWS, it is safe from the viewpoint
549 of data integrity. Even without PAWS it
550 is safe provided sequence spaces do not
551 overlap i.e. at data rates <= 80Mbit/sec.
553 Actually, the idea is close to VJ's one,
554 only timestamp cache is held not per host,
555 but per port pair and TW bucket is used
558 If TW bucket has been already destroyed we
559 fall back to VJ's scheme and use initial
560 timestamp retrieved from peer table.
562 if (tw
->ts_recent_stamp
) {
563 if ((tp
->write_seq
= tw
->snd_nxt
+65535+2) == 0)
565 tp
->ts_recent
= tw
->ts_recent
;
566 tp
->ts_recent_stamp
= tw
->ts_recent_stamp
;
576 /* And established part... */
577 for(skp
= &head
->chain
; (sk2
=*skp
)!=NULL
; skp
= &sk2
->next
) {
578 if(TCP_IPV4_MATCH(sk2
, acookie
, saddr
, daddr
, ports
, dif
))
583 BUG_TRAP(sk
->pprev
==NULL
);
584 if ((sk
->next
= *skp
) != NULL
)
585 (*skp
)->pprev
= &sk
->next
;
590 sock_prot_inc_use(sk
->prot
);
591 write_unlock_bh(&head
->lock
);
594 /* Silly. Should hash-dance instead... */
596 tcp_tw_deschedule(tw
);
597 tcp_timewait_kill(tw
);
598 NET_INC_STATS_BH(TimeWaitRecycled
);
607 write_unlock_bh(&head
->lock
);
608 return -EADDRNOTAVAIL
;
611 /* Hash SYN-SENT socket to established hash table after
612 * checking that it is unique. Note, that without kernel lock
613 * we MUST make these two operations atomically.
615 * Optimization: if it is bound and tcp_bind_bucket has the only
616 * owner (us), we need not to scan established bucket.
619 int tcp_v4_hash_connecting(struct sock
*sk
)
621 unsigned short snum
= sk
->num
;
622 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(snum
)];
623 struct tcp_bind_bucket
*tb
= (struct tcp_bind_bucket
*)sk
->prev
;
625 spin_lock_bh(&head
->lock
);
626 if (tb
->owners
== sk
&& sk
->bind_next
== NULL
) {
628 spin_unlock_bh(&head
->lock
);
631 spin_unlock_bh(&head
->lock
);
633 /* No definite answer... Walk to established hash table */
634 return tcp_v4_check_established(sk
);
638 /* This will initiate an outgoing connection. */
639 int tcp_v4_connect(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
641 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
642 struct sockaddr_in
*usin
= (struct sockaddr_in
*) uaddr
;
643 struct sk_buff
*buff
;
649 if (addr_len
< sizeof(struct sockaddr_in
))
652 if (usin
->sin_family
!= AF_INET
)
653 return(-EAFNOSUPPORT
);
655 nexthop
= daddr
= usin
->sin_addr
.s_addr
;
656 if (sk
->protinfo
.af_inet
.opt
&& sk
->protinfo
.af_inet
.opt
->srr
) {
659 nexthop
= sk
->protinfo
.af_inet
.opt
->faddr
;
662 tmp
= ip_route_connect(&rt
, nexthop
, sk
->saddr
,
663 RT_TOS(sk
->protinfo
.af_inet
.tos
)|RTO_CONN
|sk
->localroute
, sk
->bound_dev_if
);
667 if (rt
->rt_flags
&(RTCF_MULTICAST
|RTCF_BROADCAST
)) {
672 __sk_dst_set(sk
, &rt
->u
.dst
);
674 if (!sk
->protinfo
.af_inet
.opt
|| !sk
->protinfo
.af_inet
.opt
->srr
)
678 buff
= alloc_skb(MAX_TCP_HEADER
+ 15, GFP_KERNEL
);
684 sk
->saddr
= rt
->rt_src
;
685 sk
->rcv_saddr
= sk
->saddr
;
687 if (tp
->ts_recent_stamp
&& sk
->daddr
!= daddr
) {
688 /* Reset inherited state */
690 tp
->ts_recent_stamp
= 0;
694 if (sysctl_tcp_tw_recycle
&&
695 !tp
->ts_recent_stamp
&&
696 rt
->rt_dst
== daddr
) {
697 struct inet_peer
*peer
= rt_get_peer(rt
);
699 /* VJ's idea. We save last timestamp seen from
700 * the destination in peer table, when entering state TIME-WAIT
701 * and initialize ts_recent from it, when trying new connection.
704 if (peer
&& peer
->tcp_ts_stamp
+ TCP_PAWS_MSL
>= xtime
.tv_sec
) {
705 tp
->ts_recent_stamp
= peer
->tcp_ts_stamp
;
706 tp
->ts_recent
= peer
->tcp_ts
;
710 sk
->dport
= usin
->sin_port
;
714 tp
->write_seq
= secure_tcp_sequence_number(sk
->saddr
, sk
->daddr
,
715 sk
->sport
, usin
->sin_port
);
717 tp
->ext_header_len
= 0;
718 if (sk
->protinfo
.af_inet
.opt
)
719 tp
->ext_header_len
= sk
->protinfo
.af_inet
.opt
->optlen
;
723 err
= tcp_connect(sk
, buff
);
733 static __inline__
int tcp_v4_iif(struct sk_buff
*skb
)
735 return ((struct rtable
*)skb
->dst
)->rt_iif
;
738 static __inline__
unsigned tcp_v4_synq_hash(u32 raddr
, u16 rport
)
740 unsigned h
= raddr
^ rport
;
743 return h
&(TCP_SYNQ_HSIZE
-1);
746 static struct open_request
*tcp_v4_search_req(struct tcp_opt
*tp
,
749 struct open_request
***prevp
)
751 struct tcp_listen_opt
*lopt
= tp
->listen_opt
;
752 struct open_request
*req
, **prev
;
753 __u16 rport
= th
->source
;
754 __u32 raddr
= iph
->saddr
;
756 for (prev
= &lopt
->syn_table
[tcp_v4_synq_hash(raddr
, rport
)];
757 (req
= *prev
) != NULL
;
758 prev
= &req
->dl_next
) {
759 if (req
->rmt_port
== rport
&&
760 req
->af
.v4_req
.rmt_addr
== raddr
&&
761 req
->af
.v4_req
.loc_addr
== iph
->daddr
&&
762 TCP_INET_FAMILY(req
->class->family
)) {
763 BUG_TRAP(req
->sk
== NULL
);
772 static void tcp_v4_synq_add(struct sock
*sk
, struct open_request
*req
)
774 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
775 struct tcp_listen_opt
*lopt
= tp
->listen_opt
;
776 unsigned h
= tcp_v4_synq_hash(req
->af
.v4_req
.rmt_addr
, req
->rmt_port
);
778 req
->expires
= jiffies
+ TCP_TIMEOUT_INIT
;
782 req
->dl_next
= lopt
->syn_table
[h
];
784 write_lock(&tp
->syn_wait_lock
);
785 lopt
->syn_table
[h
] = req
;
786 write_unlock(&tp
->syn_wait_lock
);
793 * This routine does path mtu discovery as defined in RFC1191.
795 static inline void do_pmtu_discovery(struct sock
*sk
, struct iphdr
*ip
, unsigned mtu
)
797 struct dst_entry
*dst
;
798 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
800 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
801 * send out by Linux are always <576bytes so they should go through
804 if (sk
->state
== TCP_LISTEN
)
807 /* We don't check in the destentry if pmtu discovery is forbidden
808 * on this route. We just assume that no packet_to_big packets
809 * are send back when pmtu discovery is not active.
810 * There is a small race when the user changes this flag in the
811 * route, but I think that's acceptable.
813 if ((dst
= __sk_dst_check(sk
, 0)) == NULL
)
816 ip_rt_update_pmtu(dst
, mtu
);
818 /* Something is about to be wrong... Remember soft error
819 * for the case, if this connection will not able to recover.
821 if (mtu
< dst
->pmtu
&& ip_dont_fragment(sk
, dst
))
822 sk
->err_soft
= EMSGSIZE
;
824 if (sk
->protinfo
.af_inet
.pmtudisc
!= IP_PMTUDISC_DONT
&&
825 tp
->pmtu_cookie
> dst
->pmtu
) {
826 tcp_sync_mss(sk
, dst
->pmtu
);
828 /* Resend the TCP packet because it's
829 * clear that the old packet has been
830 * dropped. This is the new "fast" path mtu
833 tcp_simple_retransmit(sk
);
834 } /* else let the usual retransmit timer handle it */
838 * This routine is called by the ICMP module when it gets some
839 * sort of error condition. If err < 0 then the socket should
840 * be closed and the error returned to the user. If err > 0
841 * it's just the icmp type << 8 | icmp code. After adjustment
842 * header points to the first 8 bytes of the tcp header. We need
843 * to find the appropriate port.
845 * The locking strategy used here is very "optimistic". When
846 * someone else accesses the socket the ICMP is just dropped
847 * and for some paths there is no check at all.
848 * A more general error queue to queue errors for later handling
849 * is probably better.
853 void tcp_v4_err(struct sk_buff
*skb
, unsigned char *dp
, int len
)
855 struct iphdr
*iph
= (struct iphdr
*)dp
;
858 int type
= skb
->h
.icmph
->type
;
859 int code
= skb
->h
.icmph
->code
;
860 #if ICMP_MIN_LENGTH < 14
869 if (len
< (iph
->ihl
<< 2) + ICMP_MIN_LENGTH
) {
870 ICMP_INC_STATS_BH(IcmpInErrors
);
873 #if ICMP_MIN_LENGTH < 14
874 if (len
< (iph
->ihl
<< 2) + 14)
878 th
= (struct tcphdr
*)(dp
+(iph
->ihl
<<2));
880 sk
= tcp_v4_lookup(iph
->daddr
, th
->dest
, iph
->saddr
, th
->source
, tcp_v4_iif(skb
));
882 ICMP_INC_STATS_BH(IcmpInErrors
);
885 if (sk
->state
== TCP_TIME_WAIT
) {
886 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
891 /* If too many ICMPs get dropped on busy
892 * servers this needs to be solved differently.
894 if (sk
->lock
.users
!= 0)
895 NET_INC_STATS_BH(LockDroppedIcmps
);
897 if (sk
->state
== TCP_CLOSE
)
900 tp
= &sk
->tp_pinfo
.af_tcp
;
901 seq
= ntohl(th
->seq
);
902 if (sk
->state
!= TCP_LISTEN
&& !between(seq
, tp
->snd_una
, tp
->snd_nxt
)) {
903 NET_INC_STATS(OutOfWindowIcmps
);
908 case ICMP_SOURCE_QUENCH
:
909 /* This is deprecated, but if someone generated it,
910 * we have no reasons to ignore it.
912 if (sk
->lock
.users
== 0)
915 case ICMP_PARAMETERPROB
:
918 case ICMP_DEST_UNREACH
:
919 if (code
> NR_ICMP_UNREACH
)
922 if (code
== ICMP_FRAG_NEEDED
) { /* PMTU discovery (RFC1191) */
923 if (sk
->lock
.users
== 0)
924 do_pmtu_discovery(sk
, iph
, ntohs(skb
->h
.icmph
->un
.frag
.mtu
));
928 err
= icmp_err_convert
[code
].errno
;
930 case ICMP_TIME_EXCEEDED
:
938 struct open_request
*req
, **prev
;
940 if (sk
->lock
.users
!= 0)
943 /* The final ACK of the handshake should be already
944 * handled in the new socket context, not here.
945 * Strictly speaking - an ICMP error for the final
946 * ACK should set the opening flag, but that is too
947 * complicated right now.
949 if (!no_flags
&& !th
->syn
&& !th
->ack
)
952 req
= tcp_v4_search_req(tp
, iph
, th
, &prev
);
956 /* ICMPs are not backlogged, hence we cannot get
957 an established socket here.
959 BUG_TRAP(req
->sk
== NULL
);
961 if (seq
!= req
->snt_isn
) {
962 NET_INC_STATS_BH(OutOfWindowIcmps
);
967 * Still in SYN_RECV, just remove it silently.
968 * There is no good way to pass the error to the newly
969 * created socket, and POSIX does not want network
970 * errors returned from accept().
972 tcp_synq_drop(sk
, req
, prev
);
976 case TCP_SYN_RECV
: /* Cannot happen.
977 It can f.e. if SYNs crossed.
979 if (!no_flags
&& !th
->syn
)
981 if (sk
->lock
.users
== 0) {
982 TCP_INC_STATS_BH(TcpAttemptFails
);
985 sk
->error_report(sk
);
994 /* If we've already connected we will keep trying
995 * until we time out, or the user gives up.
997 * rfc1122 4.2.3.9 allows to consider as hard errors
998 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
999 * but it is obsoleted by pmtu discovery).
1001 * Note, that in modern internet, where routing is unreliable
1002 * and in each dark corner broken firewalls sit, sending random
1003 * errors ordered by their masters even this two messages finally lose
1004 * their original sense (even Linux sends invalid PORT_UNREACHs)
1006 * Now we are in compliance with RFCs.
1010 if (sk
->lock
.users
== 0 && sk
->protinfo
.af_inet
.recverr
) {
1012 sk
->error_report(sk
);
1013 } else { /* Only an error on timeout */
1022 /* This routine computes an IPv4 TCP checksum. */
1023 void tcp_v4_send_check(struct sock
*sk
, struct tcphdr
*th
, int len
,
1024 struct sk_buff
*skb
)
1026 th
->check
= tcp_v4_check(th
, len
, sk
->saddr
, sk
->daddr
,
1027 csum_partial((char *)th
, th
->doff
<<2, skb
->csum
));
1031 * This routine will send an RST to the other tcp.
1033 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1035 * Answer: if a packet caused RST, it is not for a socket
1036 * existing in our system, if it is matched to a socket,
1037 * it is just duplicate segment or bug in other side's TCP.
1038 * So that we build reply only basing on parameters
1039 * arrived with segment.
1040 * Exception: precedence violation. We do not implement it in any case.
1043 static void tcp_v4_send_reset(struct sk_buff
*skb
)
1045 struct tcphdr
*th
= skb
->h
.th
;
1047 struct ip_reply_arg arg
;
1049 /* Never send a reset in response to a reset. */
1053 if (((struct rtable
*)skb
->dst
)->rt_type
!= RTN_LOCAL
)
1056 /* Swap the send and the receive. */
1057 memset(&rth
, 0, sizeof(struct tcphdr
));
1058 rth
.dest
= th
->source
;
1059 rth
.source
= th
->dest
;
1060 rth
.doff
= sizeof(struct tcphdr
)/4;
1064 rth
.seq
= th
->ack_seq
;
1067 rth
.ack_seq
= htonl(ntohl(th
->seq
) + th
->syn
+ th
->fin
1068 + skb
->len
- (th
->doff
<<2));
1071 memset(&arg
, 0, sizeof arg
);
1072 arg
.iov
[0].iov_base
= (unsigned char *)&rth
;
1073 arg
.iov
[0].iov_len
= sizeof rth
;
1074 arg
.csum
= csum_tcpudp_nofold(skb
->nh
.iph
->daddr
,
1075 skb
->nh
.iph
->saddr
, /*XXX*/
1076 sizeof(struct tcphdr
),
1080 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
1082 ip_send_reply(tcp_socket
->sk
, skb
, &arg
, sizeof rth
);
1084 TCP_INC_STATS_BH(TcpOutSegs
);
1085 TCP_INC_STATS_BH(TcpOutRsts
);
1088 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1089 outside socket context is ugly, certainly. What can I do?
1092 static void tcp_v4_send_ack(struct sk_buff
*skb
, u32 seq
, u32 ack
, u32 win
, u32 ts
)
1094 struct tcphdr
*th
= skb
->h
.th
;
1099 struct ip_reply_arg arg
;
1101 memset(&rep
.th
, 0, sizeof(struct tcphdr
));
1102 memset(&arg
, 0, sizeof arg
);
1104 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
1105 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
1108 rep
.tsopt
[0] = __constant_htonl((TCPOPT_NOP
<< 24) |
1109 (TCPOPT_NOP
<< 16) |
1110 (TCPOPT_TIMESTAMP
<< 8) |
1112 rep
.tsopt
[1] = htonl(tcp_time_stamp
);
1113 rep
.tsopt
[2] = htonl(ts
);
1114 arg
.iov
[0].iov_len
= sizeof(rep
);
1117 /* Swap the send and the receive. */
1118 rep
.th
.dest
= th
->source
;
1119 rep
.th
.source
= th
->dest
;
1120 rep
.th
.doff
= arg
.iov
[0].iov_len
/4;
1121 rep
.th
.seq
= htonl(seq
);
1122 rep
.th
.ack_seq
= htonl(ack
);
1124 rep
.th
.window
= htons(win
);
1126 arg
.csum
= csum_tcpudp_nofold(skb
->nh
.iph
->daddr
,
1127 skb
->nh
.iph
->saddr
, /*XXX*/
1131 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
1133 ip_send_reply(tcp_socket
->sk
, skb
, &arg
, arg
.iov
[0].iov_len
);
1135 TCP_INC_STATS_BH(TcpOutSegs
);
1138 static void tcp_v4_timewait_ack(struct sock
*sk
, struct sk_buff
*skb
)
1140 struct tcp_tw_bucket
*tw
= (struct tcp_tw_bucket
*)sk
;
1142 tcp_v4_send_ack(skb
, tw
->snd_nxt
, tw
->rcv_nxt
,
1143 tw
->rcv_wnd
>>tw
->rcv_wscale
, tw
->ts_recent
);
1148 static void tcp_v4_or_send_ack(struct sk_buff
*skb
, struct open_request
*req
)
1150 tcp_v4_send_ack(skb
, req
->snt_isn
+1, req
->rcv_isn
+1, req
->rcv_wnd
,
1154 static struct dst_entry
* tcp_v4_route_req(struct sock
*sk
, struct open_request
*req
)
1157 struct ip_options
*opt
;
1159 opt
= req
->af
.v4_req
.opt
;
1160 if(ip_route_output(&rt
, ((opt
&& opt
->srr
) ?
1162 req
->af
.v4_req
.rmt_addr
),
1163 req
->af
.v4_req
.loc_addr
,
1164 RT_TOS(sk
->protinfo
.af_inet
.tos
) | RTO_CONN
| sk
->localroute
,
1165 sk
->bound_dev_if
)) {
1166 IP_INC_STATS_BH(IpOutNoRoutes
);
1169 if (opt
&& opt
->is_strictroute
&& rt
->rt_dst
!= rt
->rt_gateway
) {
1171 IP_INC_STATS_BH(IpOutNoRoutes
);
1178 * Send a SYN-ACK after having received an ACK.
1179 * This still operates on a open_request only, not on a big
1182 static int tcp_v4_send_synack(struct sock
*sk
, struct open_request
*req
,
1183 struct dst_entry
*dst
)
1186 struct sk_buff
* skb
;
1188 /* First, grab a route. */
1190 (dst
= tcp_v4_route_req(sk
, req
)) == NULL
)
1193 skb
= tcp_make_synack(sk
, dst
, req
);
1196 struct tcphdr
*th
= skb
->h
.th
;
1198 th
->check
= tcp_v4_check(th
, skb
->len
,
1199 req
->af
.v4_req
.loc_addr
, req
->af
.v4_req
.rmt_addr
,
1200 csum_partial((char *)th
, skb
->len
, skb
->csum
));
1202 err
= ip_build_and_send_pkt(skb
, sk
, req
->af
.v4_req
.loc_addr
,
1203 req
->af
.v4_req
.rmt_addr
, req
->af
.v4_req
.opt
);
1204 if (err
== NET_XMIT_CN
)
1214 * IPv4 open_request destructor.
1216 static void tcp_v4_or_free(struct open_request
*req
)
1218 if (req
->af
.v4_req
.opt
)
1219 kfree(req
->af
.v4_req
.opt
);
1222 static inline void syn_flood_warning(struct sk_buff
*skb
)
1224 static unsigned long warntime
;
1226 if (jiffies
- warntime
> HZ
*60) {
1229 "possible SYN flooding on port %d. Sending cookies.\n",
1230 ntohs(skb
->h
.th
->dest
));
1235 * Save and compile IPv4 options into the open_request if needed.
1237 static inline struct ip_options
*
1238 tcp_v4_save_options(struct sock
*sk
, struct sk_buff
*skb
)
1240 struct ip_options
*opt
= &(IPCB(skb
)->opt
);
1241 struct ip_options
*dopt
= NULL
;
1243 if (opt
&& opt
->optlen
) {
1244 int opt_size
= optlength(opt
);
1245 dopt
= kmalloc(opt_size
, GFP_ATOMIC
);
1247 if (ip_options_echo(dopt
, skb
)) {
/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
 * It would be better to replace it with a global counter for all sockets
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default. Experiments with real servers show, that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Further increasing requires to change hash table size.
 */
int sysctl_max_syn_backlog = 256;
1271 struct or_calltable or_ipv4
= {
1279 int tcp_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
1282 struct open_request
*req
;
1283 __u32 saddr
= skb
->nh
.iph
->saddr
;
1284 __u32 daddr
= skb
->nh
.iph
->daddr
;
1285 __u32 isn
= TCP_SKB_CB(skb
)->when
;
1286 struct dst_entry
*dst
= NULL
;
1287 #ifdef CONFIG_SYN_COOKIES
1288 int want_cookie
= 0;
1290 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1293 /* Never answer to SYNs send to broadcast or multicast */
1294 if (((struct rtable
*)skb
->dst
)->rt_flags
&
1295 (RTCF_BROADCAST
|RTCF_MULTICAST
))
1298 /* TW buckets are converted to open requests without
1299 * limitations, they conserve resources and peer is
1300 * evidently real one.
1302 if (tcp_synq_is_full(sk
) && !isn
) {
1303 #ifdef CONFIG_SYN_COOKIES
1304 if (sysctl_tcp_syncookies
) {
1311 /* Accept backlog is full. If we have already queued enough
1312 * of warm entries in syn queue, drop request. It is better than
1313 * clogging syn queue with openreqs with exponentially increasing
1316 if (tcp_acceptq_is_full(sk
) && tcp_synq_young(sk
) > 1)
1319 req
= tcp_openreq_alloc();
1323 tcp_clear_options(&tp
);
1325 tp
.user_mss
= sk
->tp_pinfo
.af_tcp
.user_mss
;
1327 tcp_parse_options(skb
, &tp
, 0);
1330 tcp_clear_options(&tp
);
1334 if (tp
.saw_tstamp
&& tp
.rcv_tsval
== 0) {
1335 /* Some OSes (unknown ones, but I see them on web server, which
1336 * contains information interesting only for windows'
1337 * users) do not send their stamp in SYN. It is easy case.
1338 * We simply do not advertise TS support.
1343 tp
.tstamp_ok
= tp
.saw_tstamp
;
1345 tcp_openreq_init(req
, &tp
, skb
);
1347 req
->af
.v4_req
.loc_addr
= daddr
;
1348 req
->af
.v4_req
.rmt_addr
= saddr
;
1349 req
->af
.v4_req
.opt
= tcp_v4_save_options(sk
, skb
);
1350 req
->class = &or_ipv4
;
1352 TCP_ECN_create_request(req
, skb
->h
.th
);
1355 #ifdef CONFIG_SYN_COOKIES
1356 syn_flood_warning(skb
);
1358 isn
= cookie_v4_init_sequence(sk
, skb
, &req
->mss
);
1359 } else if (isn
== 0) {
1360 struct inet_peer
*peer
= NULL
;
1362 /* VJ's idea. We save last timestamp seen
1363 * from the destination in peer table, when entering
1364 * state TIME-WAIT, and check against it before
1365 * accepting new connection request.
1367 * If "isn" is not zero, this request hit alive
1368 * timewait bucket, so that all the necessary checks
1369 * are made in the function processing timewait state.
1371 if (tp
.saw_tstamp
&&
1372 sysctl_tcp_tw_recycle
&&
1373 (dst
= tcp_v4_route_req(sk
, req
)) != NULL
&&
1374 (peer
= rt_get_peer((struct rtable
*)dst
)) != NULL
&&
1375 peer
->v4daddr
== saddr
) {
1376 if (xtime
.tv_sec
< peer
->tcp_ts_stamp
+ TCP_PAWS_MSL
&&
1377 (s32
)(peer
->tcp_ts
- req
->ts_recent
) > TCP_PAWS_WINDOW
) {
1378 NET_INC_STATS_BH(PAWSPassiveRejected
);
1383 /* Kill the following clause, if you dislike this way. */
1384 else if (!sysctl_tcp_syncookies
&&
1385 (sysctl_max_syn_backlog
- tcp_synq_len(sk
)
1386 < (sysctl_max_syn_backlog
>>2)) &&
1387 (!peer
|| !peer
->tcp_ts_stamp
) &&
1388 (!dst
|| !dst
->rtt
)) {
1389 /* Without syncookies last quarter of
1390 * backlog is filled with destinations, proven to be alive.
1391 * It means that we continue to communicate
1392 * to destinations, already remembered
1393 * to the moment of synflood.
1395 NETDEBUG(if (net_ratelimit()) \
1396 printk(KERN_DEBUG
"TCP: drop open request from %u.%u.%u.%u/%u\n", \
1397 NIPQUAD(saddr
), ntohs(skb
->h
.th
->source
)));
1398 TCP_INC_STATS_BH(TcpAttemptFails
);
1403 isn
= tcp_v4_init_sequence(sk
, skb
);
1407 if (tcp_v4_send_synack(sk
, req
, dst
))
1411 tcp_openreq_free(req
);
1413 tcp_v4_synq_add(sk
, req
);
1418 tcp_openreq_free(req
);
1420 TCP_INC_STATS_BH(TcpAttemptFails
);
/*
 * tcp_v4_syn_recv_sock(): called when the 3WHS completes; clones the listening
 * socket's state into a new child socket bound to the addresses saved in the
 * open_request, hashes it, and inherits the listener's port.
 *
 * NOTE(review): the extraction dropped several original lines here (the
 * numbering jumps 1437->1440, 1444->1447, 1467->1471): the accept-queue-full /
 * route-failure bailouts and the exit labels are missing. Do not treat this
 * fragment as compilable.
 */
1426 * The three way handshake has completed - we got a valid synack -
1427 * now create the new socket.
1429 struct sock
* tcp_v4_syn_recv_sock(struct sock
*sk
, struct sk_buff
*skb
,
1430 struct open_request
*req
,
1431 struct dst_entry
*dst
)
1433 struct tcp_opt
*newtp
;
/* Refuse if the listener's accept queue is already full (overflow path below). */
1436 if (tcp_acceptq_is_full(sk
))
1440 (dst
= tcp_v4_route_req(sk
, req
)) == NULL
)
1443 newsk
= tcp_create_openreq_child(sk
, req
, skb
);
1447 newsk
->dst_cache
= dst
;
/* Copy the IPv4 endpoint identity saved at SYN time into the child socket. */
1449 newtp
= &(newsk
->tp_pinfo
.af_tcp
);
1450 newsk
->daddr
= req
->af
.v4_req
.rmt_addr
;
1451 newsk
->saddr
= req
->af
.v4_req
.loc_addr
;
1452 newsk
->rcv_saddr
= req
->af
.v4_req
.loc_addr
;
/* Ownership of the saved IP options transfers to the child; clear the req side
 * so freeing the open_request does not free them too. */
1453 newsk
->protinfo
.af_inet
.opt
= req
->af
.v4_req
.opt
;
1454 req
->af
.v4_req
.opt
= NULL
;
1455 newsk
->protinfo
.af_inet
.mc_index
= tcp_v4_iif(skb
);
1456 newsk
->protinfo
.af_inet
.mc_ttl
= skb
->nh
.iph
->ttl
;
/* IP options consume header space, which shrinks the usable MSS below. */
1457 newtp
->ext_header_len
= 0;
1458 if (newsk
->protinfo
.af_inet
.opt
)
1459 newtp
->ext_header_len
= newsk
->protinfo
.af_inet
.opt
->optlen
;
1461 tcp_sync_mss(newsk
, dst
->pmtu
);
1462 newtp
->advmss
= dst
->advmss
;
1463 tcp_initialize_rcv_mss(newsk
);
/* Insert the child into the established hash and share the listener's port. */
1465 __tcp_v4_hash(newsk
);
1466 __tcp_inherit_port(sk
, newsk
);
/* Error-path statistics (labels themselves lost in extraction). */
1471 NET_INC_STATS_BH(ListenOverflows
);
1473 NET_INC_STATS_BH(ListenDrops
);
/*
 * tcp_v4_hnd_req(): for a segment arriving on a listening socket, decide which
 * socket should handle it: a pending open_request, an already-established
 * child found in the ehash, or (with syncookies) a cookie-validated ACK.
 *
 * NOTE(review): extraction dropped lines (numbering jumps 1488->1490,
 * 1491->1497, 1501->1505): part of the established-lookup arguments and the
 * branch structure is missing here.
 */
1478 static struct sock
*tcp_v4_hnd_req(struct sock
*sk
,struct sk_buff
*skb
)
1480 struct open_request
*req
, **prev
;
1481 struct tcphdr
*th
= skb
->h
.th
;
1482 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1485 /* Find possible connection requests. */
1486 req
= tcp_v4_search_req(tp
, skb
->nh
.iph
, th
, &prev
);
1488 return tcp_check_req(sk
, skb
, req
, prev
);
/* No matching open_request: maybe a fully established child already exists. */
1490 nsk
= __tcp_v4_lookup_established(skb
->nh
.iph
->saddr
,
1497 if (nsk
->state
!= TCP_TIME_WAIT
) {
/* Lookup returned a TIME_WAIT bucket; drop the reference it holds. */
1501 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
1505 #ifdef CONFIG_SYN_COOKIES
/* Bare ACK (no SYN/RST) may carry a syncookie; validate it. */
1506 if (!th
->rst
&& !th
->syn
&& th
->ack
)
1507 sk
= cookie_v4_check(sk
, skb
, &(IPCB(skb
)->opt
));
/*
 * tcp_v4_checksum_init(): verify or prepare the TCP checksum for an incoming
 * skb. Hardware-computed sums are verified against the pseudo-header; short
 * packets (<= 76 bytes) are fully verified in software; otherwise the
 * pseudo-header partial sum is stored in skb->csum for later completion.
 *
 * NOTE(review): the return statements and closing braces were lost in
 * extraction (numbering jumps 1517->1520, 1525->1527, 1530->end).
 */
1512 static int tcp_v4_checksum_init(struct sk_buff
*skb
)
1514 if (skb
->ip_summed
== CHECKSUM_HW
) {
/* Hardware already summed the payload; fold in the pseudo-header to verify. */
1515 if (tcp_v4_check(skb
->h
.th
,skb
->len
,skb
->nh
.iph
->saddr
,
1516 skb
->nh
.iph
->daddr
,skb
->csum
)) {
1517 NETDEBUG(printk(KERN_DEBUG
"hw tcp v4 csum failed\n"));
1520 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
/* Small packet: cheaper to verify the whole checksum right now. */
1522 if (skb
->len
<= 76) {
1523 if (tcp_v4_check(skb
->h
.th
,skb
->len
,skb
->nh
.iph
->saddr
,
1525 csum_partial((char *)skb
->h
.th
, skb
->len
, 0)))
1527 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
/* Larger packet: precompute pseudo-header sum, defer full check to copy time. */
1529 skb
->csum
= ~tcp_v4_check(skb
->h
.th
,skb
->len
,skb
->nh
.iph
->saddr
,
1530 skb
->nh
.iph
->daddr
,0);
/*
 * tcp_v4_do_rcv(): per-socket receive dispatch, called with the socket
 * spinlock held. ESTABLISHED sockets take the fast path; LISTEN sockets go
 * through tcp_v4_hnd_req(); everything else goes to the generic state machine.
 *
 * NOTE(review): extraction dropped the `reset:`/`discard:`/`csum_err:` labels
 * and several gotos (numbering jumps 1559->1563, 1567->1572, 1581->1585).
 */
1537 /* The socket must have it's spinlock held when we get
1540 * We have a potential double-lock case here, so even when
1541 * doing backlog processing we use the BH locking scheme.
1542 * This is because we cannot sleep with the original spinlock
1545 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1547 #ifdef CONFIG_FILTER
/* Socket filter (BPF) may reject the segment before any TCP processing. */
1548 struct sk_filter
*filter
= sk
->filter
;
1549 if (filter
&& sk_filter(skb
, filter
))
1551 #endif /* CONFIG_FILTER */
1553 IP_INC_STATS_BH(IpInDelivers
);
1555 if (sk
->state
== TCP_ESTABLISHED
) { /* Fast path */
1556 TCP_CHECK_TIMER(sk
);
1557 if (tcp_rcv_established(sk
, skb
, skb
->h
.th
, skb
->len
))
1559 TCP_CHECK_TIMER(sk
);
/* Slow path: validate length/checksum before the state machine sees it. */
1563 if (skb
->len
< (skb
->h
.th
->doff
<<2) || tcp_checksum_complete(skb
))
1566 if (sk
->state
== TCP_LISTEN
) {
1567 struct sock
*nsk
= tcp_v4_hnd_req(sk
, skb
);
/* Segment belongs to a child socket; let it process under the child's lock. */
1572 if (tcp_child_process(sk
, nsk
, skb
))
1578 TCP_CHECK_TIMER(sk
);
1579 if (tcp_rcv_state_process(sk
, skb
, skb
->h
.th
, skb
->len
))
1581 TCP_CHECK_TIMER(sk
);
/* Error path: answer an invalid segment with a RST. */
1585 tcp_v4_send_reset(skb
);
1588 /* Be careful here. If this function gets more complicated and
1589 * gcc suffers from register pressure on the x86, sk (in %ebx)
1590 * might be destroyed here. This current version compiles correctly,
1591 * but you have been warned.
1596 TCP_INC_STATS_BH(TcpInErrs
);
/*
 * tcp_v4_rcv(): main entry for TCP segments handed up from IP. Validates the
 * header/checksum, fills the skb control block, looks up the owning socket,
 * and either processes directly, prequeues, or backlogs the segment. Tail
 * sections handle no-socket (send RST) and TIME_WAIT demultiplexing.
 *
 * NOTE(review): many lines were lost in extraction (e.g. 1641-1645 the
 * no_socket check, 1650-1655 bh_lock_sock, the TW switch case labels);
 * structure below is a partial skeleton of the original.
 */
1604 int tcp_v4_rcv(struct sk_buff
*skb
, unsigned short len
)
/* Only segments addressed to this host are processed. */
1610 if (skb
->pkt_type
!=PACKET_HOST
)
1615 /* Pull up the IP header. */
1616 __skb_pull(skb
, skb
->h
.raw
- skb
->data
);
1618 /* Count it even if it's bad */
1619 TCP_INC_STATS_BH(TcpInSegs
);
1621 /* An explanation is required here, I think.
1622 * Packet length and doff are validated by header prediction,
1623 * provided case of th->doff==0 is elimineted.
1624 * So, we defer the checks. */
1625 if (th
->doff
< sizeof(struct tcphdr
)/4 ||
1626 (skb
->ip_summed
!= CHECKSUM_UNNECESSARY
&&
1627 tcp_v4_checksum_init(skb
) < 0))
/* Cache sequence numbers and flags in the skb control block for later stages. */
1630 TCP_SKB_CB(skb
)->seq
= ntohl(th
->seq
);
1631 TCP_SKB_CB(skb
)->end_seq
= (TCP_SKB_CB(skb
)->seq
+ th
->syn
+ th
->fin
+
1633 TCP_SKB_CB(skb
)->ack_seq
= ntohl(th
->ack_seq
);
1634 TCP_SKB_CB(skb
)->when
= 0;
1635 TCP_SKB_CB(skb
)->flags
= skb
->nh
.iph
->tos
;
1636 TCP_SKB_CB(skb
)->sacked
= 0;
/* Demultiplex to the owning socket by 4-tuple plus incoming interface. */
1639 sk
= __tcp_v4_lookup(skb
->nh
.iph
->saddr
, th
->source
,
1640 skb
->nh
.iph
->daddr
, ntohs(th
->dest
), tcp_v4_iif(skb
));
1646 if(!ipsec_sk_policy(sk
,skb
))
1647 goto discard_and_relse
;
1649 if (sk
->state
== TCP_TIME_WAIT
)
/* Socket owned by us (no user lock): try prequeue, else process directly;
 * otherwise defer to the socket backlog. */
1656 if (!sk
->lock
.users
) {
1657 if (!tcp_prequeue(sk
, skb
))
1658 ret
= tcp_v4_do_rcv(sk
, skb
);
1660 sk_add_backlog(sk
, skb
);
/* No matching socket: verify the segment then answer with RST. */
1668 if (len
< (th
->doff
<<2) || tcp_checksum_complete(skb
)) {
1670 TCP_INC_STATS_BH(TcpInErrs
);
1672 tcp_v4_send_reset(skb
);
1676 /* Discard frame. */
/* TIME_WAIT handling: re-validate, then dispatch on the timewait verdict. */
1685 if (len
< (th
->doff
<<2) || tcp_checksum_complete(skb
)) {
1686 TCP_INC_STATS_BH(TcpInErrs
);
1687 goto discard_and_relse
;
1689 switch(tcp_timewait_state_process((struct tcp_tw_bucket
*)sk
,
1690 skb
, th
, skb
->len
)) {
/* TCP_TW_SYN: a new SYN may revive the pair on a listener for this port. */
1695 sk2
= tcp_v4_lookup_listener(skb
->nh
.iph
->daddr
, ntohs(th
->dest
), tcp_v4_iif(skb
));
1697 tcp_tw_deschedule((struct tcp_tw_bucket
*)sk
);
1698 tcp_timewait_kill((struct tcp_tw_bucket
*)sk
);
1699 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
1703 /* Fall through to ACK */
1706 tcp_v4_timewait_ack(sk
, skb
);
1710 case TCP_TW_SUCCESS
:
/*
 * __tcp_v4_rehash(): remove the socket from its hash and (in the original,
 * whose second half was lost in extraction) re-insert it, used after the
 * socket's address identity changed in tcp_v4_reselect_saddr().
 */
1715 /* With per-bucket locks this operation is not-atomic, so that
1716 * this version is not worse.
1718 static void __tcp_v4_rehash(struct sock
*sk
)
1720 sk
->prot
->unhash(sk
);
/* NOTE(review): the matching re-hash call is missing from this fragment. */
/*
 * tcp_v4_reselect_saddr(): re-route the connection and, if the preferred
 * source address changed (e.g. ip_dynaddr after an interface address change),
 * rewrite sk->saddr/rcv_saddr and re-hash the socket under the new identity.
 * Only safe in SYN_SENT (enforced by the caller, tcp_v4_rebuild_header).
 */
1724 static int tcp_v4_reselect_saddr(struct sock
*sk
)
1728 __u32 old_saddr
= sk
->saddr
;
1730 __u32 daddr
= sk
->daddr
;
/* With strict/loose source routing, route towards the first hop, not daddr. */
1732 if(sk
->protinfo
.af_inet
.opt
&& sk
->protinfo
.af_inet
.opt
->srr
)
1733 daddr
= sk
->protinfo
.af_inet
.opt
->faddr
;
1735 /* Query new route. */
1736 err
= ip_route_connect(&rt
, daddr
, 0,
1737 RT_TOS(sk
->protinfo
.af_inet
.tos
)|sk
->localroute
,
1742 __sk_dst_set(sk
, &rt
->u
.dst
);
1743 /* sk->route_caps = rt->u.dst.dev->features; */
1745 new_saddr
= rt
->rt_src
;
/* Nothing to do if routing still picks the same source address. */
1747 if (new_saddr
== old_saddr
)
1750 if (sysctl_ip_dynaddr
> 1) {
1751 printk(KERN_INFO
"tcp_v4_rebuild_header(): shifting sk->saddr "
1752 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1754 NIPQUAD(new_saddr
));
1757 sk
->saddr
= new_saddr
;
1758 sk
->rcv_saddr
= new_saddr
;
1760 /* XXX The only one ugly spot where we need to
1761 * XXX really change the sockets identity after
1762 * XXX it has entered the hashes. -DaveM
1764 * Besides that, it does not check for connection
1765 * uniqueness. Wait for troubles.
1767 __tcp_v4_rehash(sk
);
/*
 * tcp_v4_rebuild_header(): ensure the socket has a valid cached route before
 * transmit. If the cached dst was invalidated, re-resolve it; if routing
 * fails and ip_dynaddr allows, fall back to reselecting the source address.
 *
 * NOTE(review): lines lost in extraction (numbering jumps 1777->1783,
 * 1791->1795, 1803->end): the early-success return and error reporting.
 */
1771 int tcp_v4_rebuild_header(struct sock
*sk
)
1773 struct rtable
*rt
= (struct rtable
*)__sk_dst_check(sk
, 0);
1777 /* Route is OK, nothing to do. */
/* Source routing overrides the routing destination, as in reselect_saddr. */
1783 if(sk
->protinfo
.af_inet
.opt
&& sk
->protinfo
.af_inet
.opt
->srr
)
1784 daddr
= sk
->protinfo
.af_inet
.opt
->faddr
;
1786 err
= ip_route_output(&rt
, daddr
, sk
->saddr
,
1787 RT_TOS(sk
->protinfo
.af_inet
.tos
) | RTO_CONN
| sk
->localroute
,
1790 __sk_dst_set(sk
, &rt
->u
.dst
);
1791 /* sk->route_caps = rt->u.dst.dev->features; */
1795 /* Routing failed... */
1796 /* sk->route_caps = 0; */
/* Reselecting the source address is only permitted for dynaddr sockets still
 * in SYN_SENT whose local address is not explicitly bound. */
1798 if (!sysctl_ip_dynaddr
||
1799 sk
->state
!= TCP_SYN_SENT
||
1800 (sk
->userlocks
& SOCK_BINDADDR_LOCK
) ||
1801 (err
= tcp_v4_reselect_saddr(sk
)) != 0) {
1803 /* sk->error_report(sk); */
1808 static void v4_addr2sockaddr(struct sock
*sk
, struct sockaddr
* uaddr
)
1810 struct sockaddr_in
*sin
= (struct sockaddr_in
*) uaddr
;
1812 sin
->sin_family
= AF_INET
;
1813 sin
->sin_addr
.s_addr
= sk
->daddr
;
1814 sin
->sin_port
= sk
->dport
;
/*
 * tcp_v4_remember_stamp(): record the last timestamp seen from the peer in
 * the inet_peer cache so later connections from the same address can run the
 * PAWS check (see the TW-recycle logic in tcp_v4_conn_request).
 *
 * NOTE(review): the release/return paths were lost in extraction (numbering
 * jumps 1835->1840, 1844->end).
 */
1817 /* VJ's idea. Save last timestamp seen from this destination
1818 * and hold it at least for normal timewait interval to use for duplicate
1819 * segment detection in subsequent connections, before they enter synchronized
1823 int tcp_v4_remember_stamp(struct sock
*sk
)
1825 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
1826 struct rtable
*rt
= (struct rtable
*)__sk_dst_get(sk
);
1827 struct inet_peer
*peer
= NULL
;
/* Without a matching cached route, fetch the peer entry directly; otherwise
 * lazily bind one to the route. */
1830 if (rt
== NULL
|| rt
->rt_dst
!= sk
->daddr
) {
1831 peer
= inet_getpeer(sk
->daddr
, 1);
1834 if (rt
->peer
== NULL
)
1835 rt_bind_peer(rt
, 1);
/* Update only if our timestamp is newer, or the stored one has aged past
 * TCP_PAWS_MSL and is not ahead of what we saw. */
1840 if ((s32
)(peer
->tcp_ts
- tp
->ts_recent
) <= 0 ||
1841 (peer
->tcp_ts_stamp
+ TCP_PAWS_MSL
< xtime
.tv_sec
&&
1842 peer
->tcp_ts_stamp
<= tp
->ts_recent_stamp
)) {
1843 peer
->tcp_ts_stamp
= tp
->ts_recent_stamp
;
1844 peer
->tcp_ts
= tp
->ts_recent
;
/*
 * tcp_v4_tw_remember_stamp(): TIME_WAIT-bucket variant of
 * tcp_v4_remember_stamp(); saves the bucket's last-seen peer timestamp into
 * the inet_peer cache using the same freshness test.
 *
 * NOTE(review): the put/return tail was lost in extraction (1858->1861,
 * 1865->end).
 */
1854 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket
*tw
)
1856 struct inet_peer
*peer
= NULL
;
1858 peer
= inet_getpeer(tw
->daddr
, 1);
/* Same acceptance condition as tcp_v4_remember_stamp(), on tw fields. */
1861 if ((s32
)(peer
->tcp_ts
- tw
->ts_recent
) <= 0 ||
1862 (peer
->tcp_ts_stamp
+ TCP_PAWS_MSL
< xtime
.tv_sec
&&
1863 peer
->tcp_ts_stamp
<= tw
->ts_recent_stamp
)) {
1864 peer
->tcp_ts_stamp
= tw
->ts_recent_stamp
;
1865 peer
->tcp_ts
= tw
->ts_recent
;
/*
 * ipv4_specific: the AF_INET implementation of the tcp_func operations table
 * (header rebuild, connection request, SYN-recv child creation, hashing,
 * timestamp caching, header sizes). Installed per-socket in
 * tcp_v4_init_sock() below.
 *
 * NOTE(review): several members are missing from this fragment (numbering
 * jumps 1874->1877, 1882->1887).
 */
1874 struct tcp_func ipv4_specific
= {
1877 tcp_v4_rebuild_header
,
1878 tcp_v4_conn_request
,
1879 tcp_v4_syn_recv_sock
,
1880 tcp_v4_hash_connecting
,
1881 tcp_v4_remember_stamp
,
1882 sizeof(struct iphdr
),
1887 sizeof(struct sockaddr_in
)
/*
 * tcp_v4_init_sock(): per-socket initialization for a new IPv4 TCP socket:
 * queues, timers, RTO/congestion defaults, buffer sizes, and the
 * ipv4_specific ops table.
 */
1890 /* NOTE: A lot of things set to zero explicitly by call to
1891 * sk_alloc() so need not be done here.
1893 static int tcp_v4_init_sock(struct sock
*sk
)
1895 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1897 skb_queue_head_init(&tp
->out_of_order_queue
);
1898 tcp_init_xmit_timers(sk
);
1899 tcp_prequeue_init(tp
);
/* Conservative initial RTO and deviation until real RTT samples arrive. */
1901 tp
->rto
= TCP_TIMEOUT_INIT
;
1902 tp
->mdev
= TCP_TIMEOUT_INIT
;
1904 /* So many TCP implementations out there (incorrectly) count the
1905 * initial SYN frame in their delayed-ACK and congestion control
1906 * algorithms that we must have the following bandaid to talk
1907 * efficiently to them. -DaveM
1911 /* See draft-stevens-tcpca-spec-01 for discussion of the
1912 * initialization of these values.
1914 tp
->snd_ssthresh
= 0x7fffffff; /* Infinity */
1915 tp
->snd_cwnd_clamp
= ~0;
/* 536 = RFC 1122 default MSS, used until path MSS is learned. */
1916 tp
->mss_cache
= 536;
1918 tp
->reordering
= sysctl_tcp_reordering
;
1920 sk
->state
= TCP_CLOSE
;
1922 sk
->write_space
= tcp_write_space
;
/* Hook in the IPv4-specific function table defined above. */
1924 sk
->tp_pinfo
.af_tcp
.af_specific
= &ipv4_specific
;
/* Default (index [1]) of the sysctl min/default/max triples. */
1926 sk
->sndbuf
= sysctl_tcp_wmem
[1];
1927 sk
->rcvbuf
= sysctl_tcp_rmem
[1];
1929 atomic_inc(&tcp_sockets_allocated
);
/*
 * tcp_v4_destroy_sock(): teardown counterpart of tcp_v4_init_sock(): stop
 * timers, purge the write/out-of-order/prequeue queues, release the bind
 * bucket, and drop the global socket count.
 *
 * NOTE(review): the bind-bucket release call and the return were lost in
 * extraction (numbering jumps 1950->1953, 1953->end).
 */
1934 static int tcp_v4_destroy_sock(struct sock
*sk
)
1936 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1938 tcp_clear_xmit_timers(sk
);
1940 /* Cleanup up the write buffer. */
1941 tcp_writequeue_purge(sk
);
1943 /* Cleans up our, hopefuly empty, out_of_order_queue. */
1944 __skb_queue_purge(&tp
->out_of_order_queue
);
1946 /* Clean prequeue, it must be empty really */
1947 __skb_queue_purge(&tp
->ucopy
.prequeue
);
1949 /* Clean up a referenced TCP bind bucket. */
1950 if(sk
->prev
!= NULL
)
1953 atomic_dec(&tcp_sockets_allocated
);
/*
 * get_openreq(): format one pending open_request as a /proc/net/tcp row into
 * tmpbuf (addresses/ports, remaining expire time, uid, refcount).
 *
 * NOTE(review): several sprintf arguments were lost in extraction (numbering
 * jumps 1964->1966, 1966->1968, 1972->1976, 1978->end).
 */
1958 /* Proc filesystem TCP sock list dumping. */
1959 static void get_openreq(struct sock
*sk
, struct open_request
*req
, char *tmpbuf
, int i
, int uid
)
/* Remaining time until the open_request expires, in jiffies. */
1961 int ttd
= req
->expires
- jiffies
;
1963 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
1964 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
1966 req
->af
.v4_req
.loc_addr
,
1968 req
->af
.v4_req
.rmt_addr
,
1969 ntohs(req
->rmt_port
),
1971 0,0, /* could print option size, but that is af dependent. */
1972 1, /* timers active (only the expire timer) */
1976 0, /* non standard timer */
1977 0, /* open_requests have no inode */
1978 atomic_read(&sk
->refcnt
),
/*
 * get_tcp_sock(): format one live TCP socket as a /proc/net/tcp row:
 * endpoints, state, queue depths, active timer and its expiry, plus extended
 * debug fields (rto, ato, cwnd, ssthresh).
 *
 * NOTE(review): some declarations and sprintf arguments were lost in
 * extraction (e.g. dest/src assignments at orig 1991, timer_active values).
 */
1983 static void get_tcp_sock(struct sock
*sp
, char *tmpbuf
, int i
)
1985 unsigned int dest
, src
;
1988 unsigned long timer_expires
;
1989 struct tcp_opt
*tp
= &sp
->tp_pinfo
.af_tcp
;
1992 src
= sp
->rcv_saddr
;
1993 destp
= ntohs(sp
->dport
);
1994 srcp
= ntohs(sp
->sport
);
/* Pick which timer (retransmit, zero-window probe, or generic) to report. */
1995 if (tp
->pending
== TCP_TIME_RETRANS
) {
1997 timer_expires
= tp
->timeout
;
1998 } else if (tp
->pending
== TCP_TIME_PROBE0
) {
2000 timer_expires
= tp
->timeout
;
2001 } else if (timer_pending(&sp
->timer
)) {
2003 timer_expires
= sp
->timer
.expires
;
2006 timer_expires
= jiffies
;
2009 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
2010 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2011 i
, src
, srcp
, dest
, destp
, sp
->state
,
2012 tp
->write_seq
-tp
->snd_una
, tp
->rcv_nxt
-tp
->copied_seq
,
2013 timer_active
, timer_expires
-jiffies
,
2018 atomic_read(&sp
->refcnt
), sp
,
2019 tp
->rto
, tp
->ack
.ato
, (tp
->ack
.quick
<<1)|tp
->ack
.pingpong
,
2020 tp
->snd_cwnd
, tp
->snd_ssthresh
>=0xFFFF?-1:tp
->snd_ssthresh
/*
 * get_timewait_sock(): format one TIME_WAIT bucket as a /proc/net/tcp row;
 * queue fields are reported as zero since a TW bucket carries no data.
 *
 * NOTE(review): lines lost in extraction (numbering jumps 2028->2034,
 * 2040->2042): the ttd clamp and dest/src assignments.
 */
2024 static void get_timewait_sock(struct tcp_tw_bucket
*tw
, char *tmpbuf
, int i
)
2026 unsigned int dest
, src
;
/* Remaining TIME_WAIT duration, in jiffies. */
2028 int ttd
= tw
->ttd
- jiffies
;
2034 src
= tw
->rcv_saddr
;
2035 destp
= ntohs(tw
->dport
);
2036 srcp
= ntohs(tw
->sport
);
2038 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
2039 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2040 i
, src
, srcp
, dest
, destp
, tw
->substate
, 0, 0,
2042 atomic_read(&tw
->refcnt
), tw
);
/*
 * tcp_get_info(): /proc read handler that dumps all TCP sockets into buffer,
 * honoring the (offset, length) window of the procfs read protocol. Walks
 * three populations in order: listening sockets (plus their pending
 * open_requests), established sockets, and TIME_WAIT buckets.
 *
 * NOTE(review): many control-flow lines (pos accounting, continue/goto out,
 * loop closes) were lost in extraction; numbering jumps throughout.
 */
2047 int tcp_get_info(char *buffer
, char **start
, off_t offset
, int length
)
2049 int len
= 0, num
= 0, i
;
2050 off_t begin
, pos
= 0;
2051 char tmpbuf
[TMPSZ
+1];
/* Fixed-width header row so every record is TMPSZ bytes. */
2054 len
+= sprintf(buffer
, "%-*s\n", TMPSZ
-1,
2055 " sl local_address rem_address st tx_queue "
2056 "rx_queue tr tm->when retrnsmt uid timeout inode");
2060 /* First, walk listening socket table. */
2062 for(i
= 0; i
< TCP_LHTABLE_SIZE
; i
++) {
2063 struct sock
*sk
= tcp_listening_hash
[i
];
2064 struct tcp_listen_opt
*lopt
;
2067 for (sk
= tcp_listening_hash
[i
]; sk
; sk
= sk
->next
, num
++) {
2068 struct open_request
*req
;
2070 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
/* Skip non-IPv4 (e.g. mapped IPv6) sockets in this dump. */
2072 if (!TCP_INET_FAMILY(sk
->family
))
2076 if (pos
>= offset
) {
2077 get_tcp_sock(sk
, tmpbuf
, num
);
2078 len
+= sprintf(buffer
+len
, "%-*s\n", TMPSZ
-1, tmpbuf
);
2079 if (len
>= length
) {
2080 tcp_listen_unlock();
/* Dump this listener's pending open_requests under the syn_wait lock. */
2086 uid
= sock_i_uid(sk
);
2087 read_lock_bh(&tp
->syn_wait_lock
);
2088 lopt
= tp
->listen_opt
;
2089 if (lopt
&& lopt
->qlen
!= 0) {
2090 for (k
=0; k
<TCP_SYNQ_HSIZE
; k
++) {
2091 for (req
= lopt
->syn_table
[k
]; req
; req
= req
->dl_next
, num
++) {
2092 if (!TCP_INET_FAMILY(req
->class->family
))
2098 get_openreq(sk
, req
, tmpbuf
, num
, uid
);
2099 len
+= sprintf(buffer
+len
, "%-*s\n", TMPSZ
-1, tmpbuf
);
2101 read_unlock_bh(&tp
->syn_wait_lock
);
2102 tcp_listen_unlock();
2108 read_unlock_bh(&tp
->syn_wait_lock
);
2110 /* Completed requests are in normal socket hash table */
2113 tcp_listen_unlock();
2117 /* Next, walk established hash chain. */
2118 for (i
= 0; i
< tcp_ehash_size
; i
++) {
2119 struct tcp_ehash_bucket
*head
= &tcp_ehash
[i
];
2121 struct tcp_tw_bucket
*tw
;
2123 read_lock(&head
->lock
);
2124 for(sk
= head
->chain
; sk
; sk
= sk
->next
, num
++) {
2125 if (!TCP_INET_FAMILY(sk
->family
))
2130 get_tcp_sock(sk
, tmpbuf
, num
);
2131 len
+= sprintf(buffer
+len
, "%-*s\n", TMPSZ
-1, tmpbuf
);
2133 read_unlock(&head
->lock
);
/* TIME_WAIT buckets live in the upper half of the ehash table. */
2137 for (tw
= (struct tcp_tw_bucket
*)tcp_ehash
[i
+tcp_ehash_size
].chain
;
2139 tw
= (struct tcp_tw_bucket
*)tw
->next
, num
++) {
2140 if (!TCP_INET_FAMILY(tw
->family
))
2145 get_timewait_sock(tw
, tmpbuf
, num
);
2146 len
+= sprintf(buffer
+len
, "%-*s\n", TMPSZ
-1, tmpbuf
);
2148 read_unlock(&head
->lock
);
2152 read_unlock(&head
->lock
);
/* Translate the absolute position into the procfs (start, len) return. */
2159 begin
= len
- (pos
- offset
);
2160 *start
= buffer
+ begin
;
/*
 * tcp_prot: the protocol operations table registered for IPPROTO_TCP,
 * wiring socket-layer calls to the TCP implementations (designated
 * initializers, gcc `field:` syntax of the era).
 *
 * NOTE(review): several members (e.g. name, close, hash/unhash) are missing
 * from this fragment (numbering jumps 2169->2172, 2183->2186, 2186->end).
 */
2169 struct proto tcp_prot
= {
2172 connect
: tcp_v4_connect
,
2173 disconnect
: tcp_disconnect
,
2176 init
: tcp_v4_init_sock
,
2177 destroy
: tcp_v4_destroy_sock
,
2178 shutdown
: tcp_shutdown
,
2179 setsockopt
: tcp_setsockopt
,
2180 getsockopt
: tcp_getsockopt
,
2181 sendmsg
: tcp_sendmsg
,
2182 recvmsg
: tcp_recvmsg
,
2183 backlog_rcv
: tcp_v4_do_rcv
,
2186 get_port
: tcp_v4_get_port
,
2191 void __init
tcp_v4_init(struct net_proto_family
*ops
)
2195 tcp_inode
.i_mode
= S_IFSOCK
;
2196 tcp_inode
.i_sock
= 1;
2197 tcp_inode
.i_uid
= 0;
2198 tcp_inode
.i_gid
= 0;
2199 init_waitqueue_head(&tcp_inode
.i_wait
);
2200 init_waitqueue_head(&tcp_inode
.u
.socket_i
.wait
);
2202 tcp_socket
->inode
= &tcp_inode
;
2203 tcp_socket
->state
= SS_UNCONNECTED
;
2204 tcp_socket
->type
=SOCK_RAW
;
2206 if ((err
=ops
->create(tcp_socket
, IPPROTO_TCP
))<0)
2207 panic("Failed to create the TCP control socket.\n");
2208 tcp_socket
->sk
->allocation
=GFP_ATOMIC
;
2209 tcp_socket
->sk
->protinfo
.af_inet
.ttl
= MAXTTL
;
2211 /* Unhash it so that IP input processing does not even
2212 * see it, we do not wish this socket to see incoming
2215 tcp_socket
->sk
->prot
->unhash(tcp_socket
->sk
);