2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.189 1999/09/07 02:31:33 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/init.h>
55 #include <linux/ipsec.h>
60 #include <net/inet_common.h>
62 #include <asm/segment.h>
64 #include <linux/inet.h>
65 #include <linux/stddef.h>
67 extern int sysctl_tcp_timestamps
;
68 extern int sysctl_tcp_window_scaling
;
69 extern int sysctl_tcp_sack
;
70 extern int sysctl_tcp_syncookies
;
71 extern int sysctl_tcp_tw_recycle
;
72 extern int sysctl_ip_dynaddr
;
73 extern __u32 sysctl_wmem_max
;
74 extern __u32 sysctl_rmem_max
;
76 /* Check TCP sequence numbers in ICMP packets. */
77 #define ICMP_MIN_LENGTH 8
79 /* Socket used for sending RSTs */
80 struct inode tcp_inode
;
81 struct socket
*tcp_socket
=&tcp_inode
.u
.socket_i
;
83 static void tcp_v4_send_reset(struct sk_buff
*skb
);
85 void tcp_v4_send_check(struct sock
*sk
, struct tcphdr
*th
, int len
,
88 /* This is for sockets with full identity only. Sockets here will always
89 * be without wildcards and will have the following invariant:
90 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
92 * First half of the table is for sockets not in TIME_WAIT, second half
93 * is for TIME_WAIT sockets only.
95 struct tcp_ehash_bucket
*tcp_ehash
= NULL
;
97 /* Ok, let's try this, I give up, we do need a local binding
98 * TCP hash as well as the others for fast bind/connect.
100 struct tcp_bind_hashbucket
*tcp_bhash
= NULL
;
102 int tcp_bhash_size
= 0;
103 int tcp_ehash_size
= 0;
105 /* All sockets in TCP_LISTEN state will be in here. This is the only table
106 * where wildcard'd TCP sockets can exist. Hash function here is just local
109 struct sock
*tcp_listening_hash
[TCP_LHTABLE_SIZE
] = { NULL
, };
110 char __tcp_clean_cacheline_pad
[(SMP_CACHE_BYTES
-
111 (((sizeof(void *) * (TCP_LHTABLE_SIZE
+ 2)) +
112 (sizeof(int) * 2)) % SMP_CACHE_BYTES
))] = { 0, };
114 rwlock_t tcp_lhash_lock
= RW_LOCK_UNLOCKED
;
115 atomic_t tcp_lhash_users
= ATOMIC_INIT(0);
116 DECLARE_WAIT_QUEUE_HEAD(tcp_lhash_wait
);
118 spinlock_t tcp_portalloc_lock
= SPIN_LOCK_UNLOCKED
;
121 * This array holds the first and last local port number.
122 * For high-usage systems, use sysctl to change this to
125 int sysctl_local_port_range
[2] = { 1024, 4999 };
126 int tcp_port_rover
= (1024 - 1);
128 static __inline__
int tcp_hashfn(__u32 laddr
, __u16 lport
,
129 __u32 faddr
, __u16 fport
)
131 int h
= ((laddr
^ lport
) ^ (faddr
^ fport
));
134 return h
& (tcp_ehash_size
- 1);
137 static __inline__
int tcp_sk_hashfn(struct sock
*sk
)
139 __u32 laddr
= sk
->rcv_saddr
;
140 __u16 lport
= sk
->num
;
141 __u32 faddr
= sk
->daddr
;
142 __u16 fport
= sk
->dport
;
144 return tcp_hashfn(laddr
, lport
, faddr
, fport
);
147 /* Allocate and initialize a new TCP local port bind bucket.
148 * The bindhash mutex for snum's hash chain must be held here.
150 struct tcp_bind_bucket
*tcp_bucket_create(struct tcp_bind_hashbucket
*head
,
153 struct tcp_bind_bucket
*tb
;
155 tb
= kmem_cache_alloc(tcp_bucket_cachep
, SLAB_ATOMIC
);
160 if((tb
->next
= head
->chain
) != NULL
)
161 tb
->next
->pprev
= &tb
->next
;
163 tb
->pprev
= &head
->chain
;
168 /* Caller must disable local BH processing. */
169 static __inline__
void __tcp_inherit_port(struct sock
*sk
, struct sock
*child
)
171 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(child
->num
)];
172 struct tcp_bind_bucket
*tb
;
174 spin_lock(&head
->lock
);
175 tb
= (struct tcp_bind_bucket
*)sk
->prev
;
176 if ((child
->bind_next
= tb
->owners
) != NULL
)
177 tb
->owners
->bind_pprev
= &child
->bind_next
;
179 child
->bind_pprev
= &tb
->owners
;
180 child
->prev
= (struct sock
*) tb
;
181 spin_unlock(&head
->lock
);
184 __inline__
void tcp_inherit_port(struct sock
*sk
, struct sock
*child
)
187 __tcp_inherit_port(sk
, child
);
191 /* Obtain a reference to a local port for the given sock,
192 * if snum is zero it means select any available local port.
194 static int tcp_v4_get_port(struct sock
*sk
, unsigned short snum
)
196 struct tcp_bind_hashbucket
*head
;
197 struct tcp_bind_bucket
*tb
;
202 int low
= sysctl_local_port_range
[0];
203 int high
= sysctl_local_port_range
[1];
204 int remaining
= (high
- low
) + 1;
207 spin_lock(&tcp_portalloc_lock
);
208 rover
= tcp_port_rover
;
210 if ((rover
< low
) || (rover
> high
))
212 head
= &tcp_bhash
[tcp_bhashfn(rover
)];
213 spin_lock(&head
->lock
);
214 for (tb
= head
->chain
; tb
; tb
= tb
->next
)
215 if (tb
->port
== rover
)
219 spin_unlock(&head
->lock
);
220 } while (--remaining
> 0);
221 tcp_port_rover
= rover
;
222 spin_unlock(&tcp_portalloc_lock
);
224 /* Exhausted local port range during search? */
229 /* OK, here is the one we will use. HEAD is
230 * non-NULL and we hold it's mutex.
235 head
= &tcp_bhash
[tcp_bhashfn(snum
)];
236 spin_lock(&head
->lock
);
237 for (tb
= head
->chain
; tb
!= NULL
; tb
= tb
->next
)
238 if (tb
->port
== snum
)
241 if (tb
!= NULL
&& tb
->owners
!= NULL
) {
242 if (tb
->fastreuse
!= 0 && sk
->reuse
!= 0) {
245 struct sock
*sk2
= tb
->owners
;
246 int sk_reuse
= sk
->reuse
;
248 for( ; sk2
!= NULL
; sk2
= sk2
->bind_next
) {
249 if (sk
->bound_dev_if
== sk2
->bound_dev_if
) {
252 sk2
->state
== TCP_LISTEN
) {
253 if (!sk2
->rcv_saddr
||
255 (sk2
->rcv_saddr
== sk
->rcv_saddr
))
260 /* If we found a conflict, fail. */
268 (tb
= tcp_bucket_create(head
, snum
)) == NULL
)
270 if (tb
->owners
== NULL
) {
271 if (sk
->reuse
&& sk
->state
!= TCP_LISTEN
)
275 } else if (tb
->fastreuse
&&
276 ((sk
->reuse
== 0) || (sk
->state
== TCP_LISTEN
)))
280 if ((sk
->bind_next
= tb
->owners
) != NULL
)
281 tb
->owners
->bind_pprev
= &sk
->bind_next
;
283 sk
->bind_pprev
= &tb
->owners
;
284 sk
->prev
= (struct sock
*) tb
;
288 spin_unlock(&head
->lock
);
294 /* Get rid of any references to a local port held by the
297 __inline__
void __tcp_put_port(struct sock
*sk
)
299 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(sk
->num
)];
300 struct tcp_bind_bucket
*tb
;
302 spin_lock(&head
->lock
);
303 tb
= (struct tcp_bind_bucket
*) sk
->prev
;
305 sk
->bind_next
->bind_pprev
= sk
->bind_pprev
;
306 *(sk
->bind_pprev
) = sk
->bind_next
;
308 if (tb
->owners
== NULL
) {
310 tb
->next
->pprev
= tb
->pprev
;
311 *(tb
->pprev
) = tb
->next
;
312 kmem_cache_free(tcp_bucket_cachep
, tb
);
314 spin_unlock(&head
->lock
);
317 void tcp_put_port(struct sock
*sk
)
324 #ifdef CONFIG_TCP_TW_RECYCLE
326 Very stupid pseudo-"algorithm". If the approach will be successful
327 (and it will!), we have to make it more reasonable.
328 Now it eats lots of CPU, when we are tough on ports.
330 Apparently, it should be hash table indexed by daddr/dport.
332 How does it work? We allow to truncate time-wait state, if:
334 2. timewait bucket did not receive data for timeout:
335 - initially timeout := 2*RTO, so that if our ACK to first
336 transmitted peer's FIN is lost, we will see first retransmit.
337 - if we receive anything, the timeout is increased exponentially
338 to follow normal TCP backoff pattern.
339 It is important that minimal RTO (HZ/5) > minimal timestamp
341 3. When creating new socket, we inherit sequence number
342 and ts_recent of time-wait bucket, increasing them a bit.
344 These two conditions guarantee, that data will not be corrupted
345 both by retransmitted and by delayed segments. They do not guarantee
346 that peer will leave LAST-ACK/CLOSING state gracefully, it will be
347 reset sometimes, namely, when more than two our ACKs to its FINs are lost.
348 This reset is harmless and even good.
351 int tcp_v4_tw_recycle(struct sock
*sk
, u32 daddr
, u16 dport
)
355 struct tcp_tw_bucket
*tw
;
356 struct tcp_bind_hashbucket
*head
;
357 struct tcp_bind_bucket
*tb
;
359 int low
= sysctl_local_port_range
[0];
360 int high
= sysctl_local_port_range
[1];
361 unsigned long now
= jiffies
;
367 for (i
=0; i
<tcp_bhash_size
; i
++, rover
++) {
368 rover
&= (tcp_bhash_size
-1);
369 head
= &tcp_bhash
[rover
];
371 spin_lock(&head
->lock
);
372 for (tb
= head
->chain
; tb
; tb
= tb
->next
) {
373 tw
= (struct tcp_tw_bucket
*)tb
->owners
;
375 if (tw
->state
!= TCP_TIME_WAIT
||
376 tw
->dport
!= dport
||
377 tw
->daddr
!= daddr
||
378 tw
->rcv_saddr
!= sk
->rcv_saddr
||
381 !TCP_INET_FAMILY(tw
->family
) ||
382 tw
->ts_recent_stamp
== 0 ||
383 (long)(now
- tw
->ttd
) <= 0)
388 spin_unlock(&head
->lock
);
396 if ((sk
->bind_next
= tb
->owners
) != NULL
)
397 tb
->owners
->bind_pprev
= &sk
->bind_next
;
399 sk
->bind_pprev
= &tb
->owners
;
400 sk
->prev
= (struct sock
*) tb
;
401 spin_unlock_bh(&head
->lock
);
407 void tcp_listen_wlock(void)
409 write_lock(&tcp_lhash_lock
);
411 if (atomic_read(&tcp_lhash_users
)) {
412 DECLARE_WAITQUEUE(wait
, current
);
414 add_wait_queue(&tcp_lhash_wait
, &wait
);
416 set_current_state(TASK_UNINTERRUPTIBLE
);
417 if (atomic_read(&tcp_lhash_users
) == 0)
419 write_unlock_bh(&tcp_lhash_lock
);
421 write_lock_bh(&tcp_lhash_lock
);
424 __set_current_state(TASK_RUNNING
);
425 remove_wait_queue(&tcp_lhash_wait
, &wait
);
429 static __inline__
void __tcp_v4_hash(struct sock
*sk
)
434 BUG_TRAP(sk
->pprev
==NULL
);
435 if(sk
->state
== TCP_LISTEN
) {
436 skp
= &tcp_listening_hash
[tcp_sk_listen_hashfn(sk
)];
437 lock
= &tcp_lhash_lock
;
440 skp
= &tcp_ehash
[(sk
->hashent
= tcp_sk_hashfn(sk
))].chain
;
441 lock
= &tcp_ehash
[sk
->hashent
].lock
;
444 if((sk
->next
= *skp
) != NULL
)
445 (*skp
)->pprev
= &sk
->next
;
449 if(sk
->prot
->highestinuse
< sk
->prot
->inuse
)
450 sk
->prot
->highestinuse
= sk
->prot
->inuse
;
454 static void tcp_v4_hash(struct sock
*sk
)
456 if (sk
->state
!= TCP_CLOSE
) {
463 void tcp_unhash(struct sock
*sk
)
467 if (sk
->state
== TCP_LISTEN
) {
470 lock
= &tcp_lhash_lock
;
472 struct tcp_ehash_bucket
*head
= &tcp_ehash
[sk
->hashent
];
474 write_lock_bh(&head
->lock
);
479 sk
->next
->pprev
= sk
->pprev
;
480 *sk
->pprev
= sk
->next
;
484 write_unlock_bh(lock
);
487 /* Don't inline this cruft. Here are some nice properties to
488 * exploit here. The BSD API does not allow a listening TCP
489 * to specify the remote port nor the remote address for the
490 * connection. So always assume those are both wildcarded
491 * during the search since they can never be otherwise.
493 static struct sock
*__tcp_v4_lookup_listener(struct sock
*sk
, u32 daddr
, unsigned short hnum
, int dif
)
495 struct sock
*result
= NULL
;
499 for(; sk
; sk
= sk
->next
) {
500 if(sk
->num
== hnum
) {
501 __u32 rcv_saddr
= sk
->rcv_saddr
;
505 if (rcv_saddr
!= daddr
)
509 if (sk
->bound_dev_if
) {
510 if (sk
->bound_dev_if
!= dif
)
516 if (score
> hiscore
) {
525 /* Optimize the common listener case. */
526 __inline__
struct sock
*tcp_v4_lookup_listener(u32 daddr
, unsigned short hnum
, int dif
)
530 read_lock(&tcp_lhash_lock
);
531 sk
= tcp_listening_hash
[tcp_lhashfn(hnum
)];
533 if (sk
->num
== hnum
&& sk
->next
== NULL
)
535 sk
= __tcp_v4_lookup_listener(sk
, daddr
, hnum
, dif
);
541 read_unlock(&tcp_lhash_lock
);
545 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
546 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
548 * Local BH must be disabled here.
550 static inline struct sock
*__tcp_v4_lookup(u32 saddr
, u16 sport
,
551 u32 daddr
, u16 hnum
, int dif
)
553 struct tcp_ehash_bucket
*head
;
554 TCP_V4_ADDR_COOKIE(acookie
, saddr
, daddr
)
555 __u32 ports
= TCP_COMBINED_PORTS(sport
, hnum
);
559 /* Optimize here for direct hit, only listening connections can
560 * have wildcards anyways.
562 hash
= tcp_hashfn(daddr
, hnum
, saddr
, sport
);
563 head
= &tcp_ehash
[hash
];
564 read_lock(&head
->lock
);
565 for(sk
= head
->chain
; sk
; sk
= sk
->next
) {
566 if(TCP_IPV4_MATCH(sk
, acookie
, saddr
, daddr
, ports
, dif
))
567 goto hit
; /* You sunk my battleship! */
570 /* Must check for a TIME_WAIT'er before going to listener hash. */
571 for(sk
= (head
+ tcp_ehash_size
)->chain
; sk
; sk
= sk
->next
)
572 if(TCP_IPV4_MATCH(sk
, acookie
, saddr
, daddr
, ports
, dif
))
574 read_unlock(&head
->lock
);
576 return tcp_v4_lookup_listener(daddr
, hnum
, dif
);
580 read_unlock(&head
->lock
);
584 __inline__
struct sock
*tcp_v4_lookup(u32 saddr
, u16 sport
, u32 daddr
, u16 dport
, int dif
)
589 sk
= __tcp_v4_lookup(saddr
, sport
, daddr
, ntohs(dport
), dif
);
595 static inline __u32
tcp_v4_init_sequence(struct sock
*sk
, struct sk_buff
*skb
)
597 return secure_tcp_sequence_number(sk
->saddr
, sk
->daddr
,
602 static int tcp_v4_check_established(struct sock
*sk
)
604 u32 daddr
= sk
->rcv_saddr
;
605 u32 saddr
= sk
->daddr
;
606 int dif
= sk
->bound_dev_if
;
607 TCP_V4_ADDR_COOKIE(acookie
, saddr
, daddr
)
608 __u32 ports
= TCP_COMBINED_PORTS(sk
->dport
, sk
->num
);
609 int hash
= tcp_hashfn(daddr
, sk
->num
, saddr
, sk
->dport
);
610 struct tcp_ehash_bucket
*head
= &tcp_ehash
[hash
];
611 struct sock
*sk2
, **skp
;
612 #ifdef CONFIG_TCP_TW_RECYCLE
613 struct tcp_tw_bucket
*tw
;
616 write_lock_bh(&head
->lock
);
618 /* Check TIME-WAIT sockets first. */
619 for(skp
= &(head
+ tcp_ehash_size
)->chain
; (sk2
=*skp
) != NULL
;
621 #ifdef CONFIG_TCP_TW_RECYCLE
622 tw
= (struct tcp_tw_bucket
*)sk2
;
625 if(TCP_IPV4_MATCH(sk2
, acookie
, saddr
, daddr
, ports
, dif
)) {
626 #ifdef CONFIG_TCP_TW_RECYCLE
627 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
629 /* With PAWS, it is safe from the viewpoint
630 of data integrity. Even without PAWS it
631 is safe provided sequence spaces do not
632 overlap i.e. at data rates <= 80Mbit/sec.
634 Actually, the idea is close to VJ's (rfc1332)
635 one, only timestamp cache is held not per host,
636 but per port pair and TW bucket is used
639 if (sysctl_tcp_tw_recycle
&& tw
->ts_recent_stamp
) {
640 if ((tp
->write_seq
= tw
->snd_nxt
+ 2) == 0)
642 tp
->ts_recent
= tw
->ts_recent
;
643 tp
->ts_recent_stamp
= tw
->ts_recent_stamp
;
652 #ifdef CONFIG_TCP_TW_RECYCLE
656 /* And established part... */
657 for(skp
= &head
->chain
; (sk2
=*skp
)!=NULL
; skp
= &sk2
->next
) {
658 if(TCP_IPV4_MATCH(sk2
, acookie
, saddr
, daddr
, ports
, dif
))
662 #ifdef CONFIG_TCP_TW_RECYCLE
665 BUG_TRAP(sk
->pprev
==NULL
);
666 if ((sk
->next
= *skp
) != NULL
)
667 (*skp
)->pprev
= &sk
->next
;
672 if(sk
->prot
->highestinuse
< sk
->prot
->inuse
)
673 sk
->prot
->highestinuse
= sk
->prot
->inuse
;
674 write_unlock_bh(&head
->lock
);
676 #ifdef CONFIG_TCP_TW_RECYCLE
678 /* Silly. Should hash-dance instead... */
680 tcp_tw_deschedule(tw
);
681 tcp_timewait_kill(tw
);
690 write_unlock_bh(&head
->lock
);
691 return -EADDRNOTAVAIL
;
694 /* Hash SYN-SENT socket to established hash table after
695 * checking that it is unique. Note, that without kernel lock
696 * we MUST make these two operations atomically.
698 * Optimization: if it is bound and tcp_bind_bucket has the only
699 * owner (us), we need not to scan established bucket.
702 int tcp_v4_hash_connecting(struct sock
*sk
)
704 unsigned short snum
= sk
->num
;
705 struct tcp_bind_hashbucket
*head
= &tcp_bhash
[tcp_bhashfn(snum
)];
706 struct tcp_bind_bucket
*tb
= (struct tcp_bind_bucket
*)sk
->prev
;
708 spin_lock_bh(&head
->lock
);
709 if (tb
->owners
== sk
&& sk
->bind_next
== NULL
) {
711 spin_unlock_bh(&head
->lock
);
714 spin_unlock_bh(&head
->lock
);
716 /* No definite answer... Walk to established hash table */
717 return tcp_v4_check_established(sk
);
721 /* This will initiate an outgoing connection. */
722 int tcp_v4_connect(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
724 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
725 struct sockaddr_in
*usin
= (struct sockaddr_in
*) uaddr
;
726 struct sk_buff
*buff
;
732 if (sk
->state
!= TCP_CLOSE
)
735 if (addr_len
< sizeof(struct sockaddr_in
))
738 if (usin
->sin_family
!= AF_INET
)
739 return(-EAFNOSUPPORT
);
741 nexthop
= daddr
= usin
->sin_addr
.s_addr
;
742 if (sk
->protinfo
.af_inet
.opt
&& sk
->protinfo
.af_inet
.opt
->srr
) {
745 nexthop
= sk
->protinfo
.af_inet
.opt
->faddr
;
748 tmp
= ip_route_connect(&rt
, nexthop
, sk
->saddr
,
749 RT_TOS(sk
->protinfo
.af_inet
.tos
)|RTO_CONN
|sk
->localroute
, sk
->bound_dev_if
);
753 if (rt
->rt_flags
&(RTCF_MULTICAST
|RTCF_BROADCAST
)) {
758 __sk_dst_set(sk
, &rt
->u
.dst
);
760 if (!sk
->protinfo
.af_inet
.opt
|| !sk
->protinfo
.af_inet
.opt
->srr
)
764 buff
= sock_wmalloc(sk
, (MAX_HEADER
+ sk
->prot
->max_header
),
771 sk
->saddr
= rt
->rt_src
;
772 sk
->rcv_saddr
= sk
->saddr
;
775 if (sk
->prot
->get_port(sk
, 0)
776 #ifdef CONFIG_TCP_TW_RECYCLE
777 && (!sysctl_tcp_tw_recycle
||
778 tcp_v4_tw_recycle(sk
, daddr
, usin
->sin_port
))
785 sk
->sport
= htons(sk
->num
);
787 #ifdef CONFIG_TCP_TW_RECYCLE
788 else if (tp
->ts_recent_stamp
&& sk
->daddr
!= daddr
) {
789 /* Reset inherited state */
791 tp
->ts_recent_stamp
= 0;
796 sk
->dport
= usin
->sin_port
;
800 tp
->write_seq
= secure_tcp_sequence_number(sk
->saddr
, sk
->daddr
,
801 sk
->sport
, usin
->sin_port
);
803 tp
->ext_header_len
= 0;
804 if (sk
->protinfo
.af_inet
.opt
)
805 tp
->ext_header_len
= sk
->protinfo
.af_inet
.opt
->optlen
;
809 err
= tcp_connect(sk
, buff
);
819 static int tcp_v4_sendmsg(struct sock
*sk
, struct msghdr
*msg
, int len
)
821 int retval
= -EINVAL
;
825 /* Do sanity checking for sendmsg/sendto/send. */
826 if (msg
->msg_flags
& ~(MSG_OOB
|MSG_DONTROUTE
|MSG_DONTWAIT
|MSG_NOSIGNAL
))
829 struct sockaddr_in
*addr
=(struct sockaddr_in
*)msg
->msg_name
;
831 if (msg
->msg_namelen
< sizeof(*addr
))
833 if (addr
->sin_family
&& addr
->sin_family
!= AF_INET
)
836 if(sk
->state
== TCP_CLOSE
)
839 if (addr
->sin_port
!= sk
->dport
)
841 if (addr
->sin_addr
.s_addr
!= sk
->daddr
)
844 retval
= tcp_do_sendmsg(sk
, msg
);
853 * Do a linear search in the socket open_request list.
854 * This should be replaced with a global hash table.
856 static struct open_request
*tcp_v4_search_req(struct tcp_opt
*tp
,
859 struct open_request
**prevp
)
861 struct open_request
*req
, *prev
;
862 __u16 rport
= th
->source
;
864 /* assumption: the socket is not in use.
865 * as we checked the user count on tcp_rcv and we're
866 * running from a soft interrupt.
868 prev
= (struct open_request
*) (&tp
->syn_wait_queue
);
869 for (req
= prev
->dl_next
; req
; req
= req
->dl_next
) {
870 if (req
->af
.v4_req
.rmt_addr
== iph
->saddr
&&
871 req
->af
.v4_req
.loc_addr
== iph
->daddr
&&
872 req
->rmt_port
== rport
&&
873 TCP_INET_FAMILY(req
->class->family
)) {
875 /* Weird case: connection was established
876 and then killed by RST before user accepted
877 it. This connection is dead, but we cannot
878 kill openreq to avoid blocking in accept().
880 accept() will collect this garbage,
881 but such reqs must be ignored, when talking
884 bh_lock_sock(req
->sk
);
885 BUG_TRAP(req
->sk
->lock
.users
==0);
886 if (req
->sk
->state
== TCP_CLOSE
) {
887 bh_unlock_sock(req
->sk
);
902 * This routine does path mtu discovery as defined in RFC1191.
904 static inline void do_pmtu_discovery(struct sock
*sk
, struct iphdr
*ip
, unsigned mtu
)
906 struct dst_entry
*dst
;
907 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
909 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
910 * send out by Linux are always <576bytes so they should go through
913 if (sk
->state
== TCP_LISTEN
)
916 /* We don't check in the destentry if pmtu discovery is forbidden
917 * on this route. We just assume that no packet_to_big packets
918 * are send back when pmtu discovery is not active.
919 * There is a small race when the user changes this flag in the
920 * route, but I think that's acceptable.
922 if ((dst
= __sk_dst_check(sk
, 0)) == NULL
)
925 ip_rt_update_pmtu(dst
, mtu
);
927 /* Something is about to be wrong... Remember soft error
928 * for the case, if this connection will not able to recover.
930 if (mtu
< dst
->pmtu
&& ip_dont_fragment(sk
, dst
))
931 sk
->err_soft
= EMSGSIZE
;
933 if (sk
->protinfo
.af_inet
.pmtudisc
!= IP_PMTUDISC_DONT
&&
934 tp
->pmtu_cookie
> dst
->pmtu
) {
935 tcp_sync_mss(sk
, dst
->pmtu
);
937 /* Resend the TCP packet because it's
938 * clear that the old packet has been
939 * dropped. This is the new "fast" path mtu
942 tcp_simple_retransmit(sk
);
943 } /* else let the usual retransmit timer handle it */
947 * This routine is called by the ICMP module when it gets some
948 * sort of error condition. If err < 0 then the socket should
949 * be closed and the error returned to the user. If err > 0
950 * it's just the icmp type << 8 | icmp code. After adjustment
951 * header points to the first 8 bytes of the tcp header. We need
952 * to find the appropriate port.
954 * The locking strategy used here is very "optimistic". When
955 * someone else accesses the socket the ICMP is just dropped
956 * and for some paths there is no check at all.
957 * A more general error queue to queue errors for later handling
958 * is probably better.
962 void tcp_v4_err(struct sk_buff
*skb
, unsigned char *dp
, int len
)
964 struct iphdr
*iph
= (struct iphdr
*)dp
;
967 int type
= skb
->h
.icmph
->type
;
968 int code
= skb
->h
.icmph
->code
;
969 #if ICMP_MIN_LENGTH < 14
978 if (len
< (iph
->ihl
<< 2) + ICMP_MIN_LENGTH
) {
979 icmp_statistics
.IcmpInErrors
++;
982 #if ICMP_MIN_LENGTH < 14
983 if (len
< (iph
->ihl
<< 2) + 14)
987 th
= (struct tcphdr
*)(dp
+(iph
->ihl
<<2));
989 sk
= tcp_v4_lookup(iph
->daddr
, th
->dest
, iph
->saddr
, th
->source
, skb
->dev
->ifindex
);
991 icmp_statistics
.IcmpInErrors
++;
994 if (sk
->state
== TCP_TIME_WAIT
) {
995 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
1000 /* If too many ICMPs get dropped on busy
1001 * servers this needs to be solved differently.
1003 if (sk
->lock
.users
!= 0)
1004 net_statistics
.LockDroppedIcmps
++;
1006 tp
= &sk
->tp_pinfo
.af_tcp
;
1007 seq
= ntohl(th
->seq
);
1008 if (sk
->state
!= TCP_LISTEN
&& !between(seq
, tp
->snd_una
, tp
->snd_nxt
)) {
1009 net_statistics
.OutOfWindowIcmps
++;
1014 case ICMP_SOURCE_QUENCH
:
1015 #ifndef OLD_SOURCE_QUENCH /* This is deprecated */
1016 if (sk
->lock
.users
== 0) {
1017 tp
->snd_ssthresh
= tcp_recalc_ssthresh(tp
);
1018 tp
->snd_cwnd
= tp
->snd_ssthresh
;
1019 tp
->snd_cwnd_cnt
= 0;
1020 tp
->high_seq
= tp
->snd_nxt
;
1024 case ICMP_PARAMETERPROB
:
1027 case ICMP_DEST_UNREACH
:
1028 if (code
> NR_ICMP_UNREACH
)
1031 if (code
== ICMP_FRAG_NEEDED
) { /* PMTU discovery (RFC1191) */
1032 if (sk
->lock
.users
== 0)
1033 do_pmtu_discovery(sk
, iph
, ntohs(skb
->h
.icmph
->un
.frag
.mtu
));
1037 err
= icmp_err_convert
[code
].errno
;
1039 case ICMP_TIME_EXCEEDED
:
1046 switch (sk
->state
) {
1047 struct open_request
*req
, *prev
;
1049 if (sk
->lock
.users
!= 0)
1052 /* The final ACK of the handshake should be already
1053 * handled in the new socket context, not here.
1054 * Strictly speaking - an ICMP error for the final
1055 * ACK should set the opening flag, but that is too
1056 * complicated right now.
1058 if (!no_flags
&& !th
->syn
&& !th
->ack
)
1061 req
= tcp_v4_search_req(tp
, iph
, th
, &prev
);
1066 struct sock
*nsk
= req
->sk
;
1069 * Already in ESTABLISHED and a big socket is created,
1070 * set error code there.
1071 * The error will _not_ be reported in the accept(),
1072 * but only with the next operation on the socket after
1080 BUG_TRAP(sk
->lock
.users
== 0);
1081 tp
= &sk
->tp_pinfo
.af_tcp
;
1082 if (!between(seq
, tp
->snd_una
, tp
->snd_nxt
)) {
1083 net_statistics
.OutOfWindowIcmps
++;
1087 if (seq
!= req
->snt_isn
) {
1088 net_statistics
.OutOfWindowIcmps
++;
1093 * Still in SYN_RECV, just remove it silently.
1094 * There is no good way to pass the error to the newly
1095 * created socket, and POSIX does not want network
1096 * errors returned from accept().
1099 tcp_synq_unlink(tp
, req
, prev
);
1100 tcp_dec_slow_timer(TCP_SLT_SYNACK
);
1101 req
->class->destructor(req
);
1102 tcp_openreq_free(req
);
1107 case TCP_SYN_RECV
: /* Cannot happen.
1108 It can f.e. if SYNs crossed.
1110 if (!no_flags
&& !th
->syn
)
1112 if (sk
->lock
.users
== 0) {
1113 tcp_statistics
.TcpAttemptFails
++;
1115 /* Wake people up to see the error (see connect in sock.c) */
1116 sk
->error_report(sk
);
1118 tcp_set_state(sk
, TCP_CLOSE
);
1126 /* If we've already connected we will keep trying
1127 * until we time out, or the user gives up.
1129 * rfc1122 4.2.3.9 allows to consider as hard errors
1130 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1131 * but it is obsoleted by pmtu discovery).
1133 * Note, that in modern internet, where routing is unreliable
1134 * and in each dark corner broken firewalls sit, sending random
1135 * errors ordered by their masters even these two messages finally lose
1136 * their original sense (even Linux sends invalid PORT_UNREACHs)
1138 * Now we are in compliance with RFCs.
1142 if (sk
->lock
.users
== 0 && sk
->protinfo
.af_inet
.recverr
) {
1144 sk
->error_report(sk
);
1145 } else { /* Only an error on timeout */
1154 /* This routine computes an IPv4 TCP checksum. */
1155 void tcp_v4_send_check(struct sock
*sk
, struct tcphdr
*th
, int len
,
1156 struct sk_buff
*skb
)
1159 th
->check
= tcp_v4_check(th
, len
, sk
->saddr
, sk
->daddr
,
1160 csum_partial((char *)th
, th
->doff
<<2, skb
->csum
));
1164 * This routine will send an RST to the other tcp.
1166 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1168 * Answer: if a packet caused RST, it is not for a socket
1169 * existing in our system, if it is matched to a socket,
1170 * it is just duplicate segment or bug in other side's TCP.
1171 * So that we build reply only basing on parameters
1172 * arrived with segment.
1173 * Exception: precedence violation. We do not implement it in any case.
1176 static void tcp_v4_send_reset(struct sk_buff
*skb
)
1178 struct tcphdr
*th
= skb
->h
.th
;
1180 struct ip_reply_arg arg
;
1182 /* Never send a reset in response to a reset. */
1186 if (((struct rtable
*)skb
->dst
)->rt_type
!= RTN_LOCAL
)
1189 /* Swap the send and the receive. */
1190 memset(&rth
, 0, sizeof(struct tcphdr
));
1191 rth
.dest
= th
->source
;
1192 rth
.source
= th
->dest
;
1193 rth
.doff
= sizeof(struct tcphdr
)/4;
1197 rth
.seq
= th
->ack_seq
;
1200 rth
.ack_seq
= htonl(ntohl(th
->seq
) + th
->syn
+ th
->fin
1201 + skb
->len
- (th
->doff
<<2));
1204 memset(&arg
, 0, sizeof arg
);
1205 arg
.iov
[0].iov_base
= (unsigned char *)&rth
;
1206 arg
.iov
[0].iov_len
= sizeof rth
;
1207 arg
.csum
= csum_tcpudp_nofold(skb
->nh
.iph
->daddr
,
1208 skb
->nh
.iph
->saddr
, /*XXX*/
1209 sizeof(struct tcphdr
),
1213 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
1215 ip_send_reply(tcp_socket
->sk
, skb
, &arg
, sizeof rth
);
1217 tcp_statistics
.TcpOutSegs
++;
1218 tcp_statistics
.TcpOutRsts
++;
1221 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1222 outside socket context is ugly, certainly. What can I do?
1225 static void tcp_v4_send_ack(struct sk_buff
*skb
, u32 seq
, u32 ack
, u32 win
, u32 ts
)
1227 struct tcphdr
*th
= skb
->h
.th
;
1232 struct ip_reply_arg arg
;
1234 memset(&rep
.th
, 0, sizeof(struct tcphdr
));
1235 memset(&arg
, 0, sizeof arg
);
1237 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
1238 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
1241 rep
.tsopt
[0] = __constant_htonl((TCPOPT_NOP
<< 24) |
1242 (TCPOPT_NOP
<< 16) |
1243 (TCPOPT_TIMESTAMP
<< 8) |
1245 rep
.tsopt
[1] = htonl(tcp_time_stamp
);
1246 rep
.tsopt
[2] = htonl(ts
);
1247 arg
.iov
[0].iov_len
= sizeof(rep
);
1250 /* Swap the send and the receive. */
1251 rep
.th
.dest
= th
->source
;
1252 rep
.th
.source
= th
->dest
;
1253 rep
.th
.doff
= arg
.iov
[0].iov_len
/4;
1254 rep
.th
.seq
= htonl(seq
);
1255 rep
.th
.ack_seq
= htonl(ack
);
1257 rep
.th
.window
= htons(win
);
1259 arg
.csum
= csum_tcpudp_nofold(skb
->nh
.iph
->daddr
,
1260 skb
->nh
.iph
->saddr
, /*XXX*/
1264 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
1266 ip_send_reply(tcp_socket
->sk
, skb
, &arg
, arg
.iov
[0].iov_len
);
1268 tcp_statistics
.TcpOutSegs
++;
1271 static void tcp_v4_timewait_ack(struct sock
*sk
, struct sk_buff
*skb
)
1273 struct tcp_tw_bucket
*tw
= (struct tcp_tw_bucket
*)sk
;
1275 tcp_v4_send_ack(skb
, tw
->snd_nxt
, tw
->rcv_nxt
, 0, tw
->ts_recent
);
1280 static void tcp_v4_or_send_ack(struct sk_buff
*skb
, struct open_request
*req
)
1282 tcp_v4_send_ack(skb
, req
->snt_isn
+1, req
->rcv_isn
+1, req
->rcv_wnd
, req
->ts_recent
);
1286 * Send a SYN-ACK after having received an ACK.
1287 * This still operates on a open_request only, not on a big
1290 static void tcp_v4_send_synack(struct sock
*sk
, struct open_request
*req
)
1293 struct ip_options
*opt
;
1294 struct sk_buff
* skb
;
1296 /* First, grab a route. */
1297 opt
= req
->af
.v4_req
.opt
;
1298 if(ip_route_output(&rt
, ((opt
&& opt
->srr
) ?
1300 req
->af
.v4_req
.rmt_addr
),
1301 req
->af
.v4_req
.loc_addr
,
1302 RT_TOS(sk
->protinfo
.af_inet
.tos
) | RTO_CONN
| sk
->localroute
,
1303 sk
->bound_dev_if
)) {
1304 ip_statistics
.IpOutNoRoutes
++;
1307 if(opt
&& opt
->is_strictroute
&& rt
->rt_dst
!= rt
->rt_gateway
) {
1309 ip_statistics
.IpOutNoRoutes
++;
1313 skb
= tcp_make_synack(sk
, &rt
->u
.dst
, req
);
1316 struct tcphdr
*th
= skb
->h
.th
;
1318 th
->check
= tcp_v4_check(th
, skb
->len
,
1319 req
->af
.v4_req
.loc_addr
, req
->af
.v4_req
.rmt_addr
,
1320 csum_partial((char *)th
, skb
->len
, skb
->csum
));
1322 ip_build_and_send_pkt(skb
, sk
, req
->af
.v4_req
.loc_addr
,
1323 req
->af
.v4_req
.rmt_addr
, req
->af
.v4_req
.opt
);
1329 * IPv4 open_request destructor.
1331 static void tcp_v4_or_free(struct open_request
*req
)
1333 if(!req
->sk
&& req
->af
.v4_req
.opt
)
1334 kfree_s(req
->af
.v4_req
.opt
, optlength(req
->af
.v4_req
.opt
));
1337 static inline void syn_flood_warning(struct sk_buff
*skb
)
1339 static unsigned long warntime
;
1341 if (jiffies
- warntime
> HZ
*60) {
1344 "possible SYN flooding on port %d. Sending cookies.\n",
1345 ntohs(skb
->h
.th
->dest
));
1350 * Save and compile IPv4 options into the open_request if needed.
1352 static inline struct ip_options
*
1353 tcp_v4_save_options(struct sock
*sk
, struct sk_buff
*skb
)
1355 struct ip_options
*opt
= &(IPCB(skb
)->opt
);
1356 struct ip_options
*dopt
= NULL
;
1358 if (opt
&& opt
->optlen
) {
1359 int opt_size
= optlength(opt
);
1360 dopt
= kmalloc(opt_size
, GFP_ATOMIC
);
1362 if (ip_options_echo(dopt
, skb
)) {
1363 kfree_s(dopt
, opt_size
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1372 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1373 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1374 * It would be better to replace it with a global counter for all sockets
1375 * but then some measure against one socket starving all other sockets
/* Tunable: per-listener SYN backlog limit, read by BACKLOGMAX() below. */
1378 int sysctl_max_syn_backlog
= 128;
/*
 * or_calltable for IPv4 open_requests.  NOTE(review): the initializer
 * body (original lines 1381-1387) was lost in extraction; presumably it
 * wires up tcp_v4_send_synack / tcp_v4_or_free etc. — verify upstream.
 */
1380 struct or_calltable or_ipv4
= {
/* Accessors for the per-listener SYN backlog counter and its cap. */
1388 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1389 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_conn_request(): handle an incoming SYN on a LISTEN socket.
 * Allocates an open_request, parses TCP options into it, optionally
 * generates a SYN-cookie ISN under backlog pressure, sends the SYN-ACK
 * and queues the request on the listener's SYN queue.
 */
1391 int tcp_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
1394 struct open_request
*req
;
1395 struct tcphdr
*th
= skb
->h
.th
;
1396 __u32 saddr
= skb
->nh
.iph
->saddr
;
1397 __u32 daddr
= skb
->nh
.iph
->daddr
;
/* isn arrives via the skb control block (set by TIME_WAIT recycling). */
1398 __u32 isn
= TCP_SKB_CB(skb
)->when
;
1399 #ifdef CONFIG_SYN_COOKIES
1400 int want_cookie
= 0;
1402 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1405 /* Never answer to SYNs send to broadcast or multicast */
1406 if (((struct rtable
*)skb
->dst
)->rt_flags
&
1407 (RTCF_BROADCAST
|RTCF_MULTICAST
))
1410 /* XXX: Check against a global syn pool counter. */
/* Backlog full: fall back to SYN cookies if enabled, else (dropped lines) drop. */
1411 if (BACKLOG(sk
) > BACKLOGMAX(sk
)) {
1412 #ifdef CONFIG_SYN_COOKIES
1413 if (sysctl_tcp_syncookies
&& !isn
) {
1414 syn_flood_warning(skb
);
1421 isn
= tcp_v4_init_sequence(sk
, skb
);
1425 req
= tcp_openreq_alloc();
1430 req
->rcv_wnd
= 0; /* So that tcp_send_synack() knows! */
1432 req
->rcv_isn
= TCP_SKB_CB(skb
)->seq
;
/* Reset option state before parsing; tp here is a local tcp_opt
 * (its declaration is among the dropped lines). */
1433 tp
.tstamp_ok
= tp
.sack_ok
= tp
.wscale_ok
= tp
.snd_wscale
= 0;
1436 tp
.user_mss
= sk
->tp_pinfo
.af_tcp
.user_mss
;
1438 tcp_parse_options(NULL
, th
, &tp
, want_cookie
);
/* Copy the negotiated option results into the open_request. */
1440 req
->mss
= tp
.mss_clamp
;
1441 req
->ts_recent
= tp
.saw_tstamp
? tp
.rcv_tsval
: 0;
1442 req
->tstamp_ok
= tp
.tstamp_ok
;
1443 req
->sack_ok
= tp
.sack_ok
;
1444 req
->snd_wscale
= tp
.snd_wscale
;
1445 req
->wscale_ok
= tp
.wscale_ok
;
1446 req
->rmt_port
= th
->source
;
1447 req
->af
.v4_req
.loc_addr
= daddr
;
1448 req
->af
.v4_req
.rmt_addr
= saddr
;
1450 /* Note that we ignore the isn passed from the TIME_WAIT
1451 * state here. That's the price we pay for cookies.
1453 * RED-PEN. The price is high... Then we cannot kill TIME-WAIT
1454 * and should reject connection attempt, duplicates with random
1455 * sequence number can corrupt data. Right?
1456 * I disabled sending cookie to request matching to a timewait
1460 isn
= cookie_v4_init_sequence(sk
, skb
, &req
->mss
);
1464 req
->af
.v4_req
.opt
= tcp_v4_save_options(sk
, skb
);
1466 req
->class = &or_ipv4
;
1470 tcp_v4_send_synack(sk
, req
);
/* Cookie path: the request is not queued, so tear it down now.
 * NOTE(review): opt is kfree'd here and tcp_v4_or_free() frees it
 * again when req->sk is unset — looks like a double free; the lines
 * lost in extraction may clear the pointer first.  Verify upstream. */
1473 if (req
->af
.v4_req
.opt
)
1474 kfree(req
->af
.v4_req
.opt
)
;
1475 tcp_v4_or_free(req
);
1476 tcp_openreq_free(req
);
/* Non-cookie path: arm the SYN-ACK retransmit and queue the request. */
1478 req
->expires
= jiffies
+ TCP_TIMEOUT_INIT
;
1479 tcp_inc_slow_timer(TCP_SLT_SYNACK
);
1480 tcp_synq_queue(&sk
->tp_pinfo
.af_tcp
, req
);
1489 tcp_statistics
.TcpAttemptFails
++;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1495 * The three way handshake has completed - we got a valid synack -
1496 * now create the new socket.
/*
 * tcp_v4_syn_recv_sock(): build the child socket for a completed
 * handshake: resolve a route (honouring strict source routing via
 * opt->faddr), clone the listener with tcp_create_openreq_child(),
 * copy addresses/IP options from the open_request, size the buffers,
 * then hash the child and inherit the listener's port.
 */
1498 struct sock
* tcp_v4_syn_recv_sock(struct sock
*sk
, struct sk_buff
*skb
,
1499 struct open_request
*req
,
1500 struct dst_entry
*dst
)
1502 struct ip_options
*opt
= req
->af
.v4_req
.opt
;
1503 struct tcp_opt
*newtp
;
/* Accept-queue overflow: refuse to create the child. */
1506 if (sk
->ack_backlog
> sk
->max_ack_backlog
)
1507 goto exit
; /* head drop */
/* Source-routed connections route to the first-hop address instead. */
1511 if (ip_route_output(&rt
,
1512 opt
&& opt
->srr
? opt
->faddr
: req
->af
.v4_req
.rmt_addr
,
1513 req
->af
.v4_req
.loc_addr
, sk
->protinfo
.af_inet
.tos
|RTO_CONN
, 0))
1518 newsk
= tcp_create_openreq_child(sk
, req
, skb
);
1522 sk
->tp_pinfo
.af_tcp
.syn_backlog
--;
1525 newsk
->dst_cache
= dst
;
1527 newtp
= &(newsk
->tp_pinfo
.af_tcp
);
/* The child takes the request's 4-tuple and saved IP options. */
1528 newsk
->daddr
= req
->af
.v4_req
.rmt_addr
;
1529 newsk
->saddr
= req
->af
.v4_req
.loc_addr
;
1530 newsk
->rcv_saddr
= req
->af
.v4_req
.loc_addr
;
1531 newsk
->protinfo
.af_inet
.opt
= req
->af
.v4_req
.opt
;
1532 newsk
->protinfo
.af_inet
.mc_index
= ((struct rtable
*)skb
->dst
)->rt_iif
;
1533 newsk
->protinfo
.af_inet
.mc_ttl
= skb
->nh
.iph
->ttl
;
/* IP options enlarge every header; account for them in the MSS math. */
1534 newtp
->ext_header_len
= 0;
1535 if (newsk
->protinfo
.af_inet
.opt
)
1536 newtp
->ext_header_len
= newsk
->protinfo
.af_inet
.opt
->optlen
;
1538 tcp_sync_mss(newsk
, dst
->pmtu
);
1539 tcp_initialize_rcv_mss(newsk
);
/* Ensure buffers hold at least ~3 full segments, capped by sysctls. */
1541 if (newsk
->rcvbuf
< (3 * (dst
->advmss
+40+MAX_HEADER
+15)))
1542 newsk
->rcvbuf
= min ((3 * (dst
->advmss
+40+MAX_HEADER
+15)), sysctl_rmem_max
);
1543 if (newsk
->sndbuf
< (3 * (newtp
->mss_clamp
+40+MAX_HEADER
+15)))
1544 newsk
->sndbuf
= min ((3 * (newtp
->mss_clamp
+40+MAX_HEADER
+15)), sysctl_wmem_max
);
1546 bh_lock_sock(newsk
);
1548 __tcp_v4_hash(newsk
);
1549 __tcp_inherit_port(sk
, newsk
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_hnd_req(): for a segment arriving on a LISTEN socket, look up
 * a matching pending open_request and let tcp_check_req() advance the
 * handshake; otherwise, with SYN cookies compiled in, try to validate
 * the segment as a cookie ACK.  Returns the socket to process against.
 */
1559 static struct sock
*tcp_v4_hnd_req(struct sock
*sk
,struct sk_buff
*skb
)
1561 struct open_request
*req
, *prev
;
1562 struct tcphdr
*th
= skb
->h
.th
;
1563 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1565 /* Find possible connection requests. */
1566 req
= tcp_v4_search_req(tp
, skb
->nh
.iph
, th
, &prev
);
1568 return tcp_check_req(sk
, skb
, req
, prev
);
1570 #ifdef CONFIG_SYN_COOKIES
/* No queued request: a non-RST SYN/ACK may still be a valid cookie. */
1571 if (!th
->rst
&& (th
->syn
|| th
->ack
))
1572 sk
= cookie_v4_check(sk
, skb
, &(IPCB(skb
)->opt
));
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_csum_verify(): validate the TCP checksum according to
 * skb->ip_summed.  Computes the partial sum if the hardware did not,
 * folds it with the pseudo-header via tcp_v4_check(), logs bad
 * checksums under NETDEBUG, and marks good skbs CHECKSUM_UNNECESSARY.
 * Non-zero return (in dropped lines) indicates a bad checksum.
 */
1577 static int tcp_csum_verify(struct sk_buff
*skb
)
1579 switch (skb
->ip_summed
) {
/* No hardware assist: sum the TCP header+payload from scratch. */
1581 skb
->csum
= csum_partial((char *)skb
->h
.th
, skb
->len
, 0);
1583 if (tcp_v4_check(skb
->h
.th
,skb
->len
,skb
->nh
.iph
->saddr
,skb
->nh
.iph
->daddr
,skb
->csum
)) {
1584 NETDEBUG(printk(KERN_DEBUG
"TCPv4 bad checksum "
1585 "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
1587 NIPQUAD(skb
->nh
.iph
->saddr
),
1588 ntohs(skb
->h
.th
->source
),
1589 NIPQUAD(skb
->nh
.iph
->daddr
),
1590 ntohs(skb
->h
.th
->dest
),
1592 ntohs(skb
->nh
.iph
->tot_len
)));
/* Verified once; later paths can skip re-checking. */
1595 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
1597 /* CHECKSUM_UNNECESSARY */
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1603 /* The socket must have it's spinlock held when we get
1606 * We have a potential double-lock case here, so even when
1607 * doing backlog processing we use the BH locking scheme.
1608 * This is because we cannot sleep with the original spinlock
/*
 * tcp_v4_do_rcv(): per-socket receive dispatch.  Fast path for
 * ESTABLISHED sockets, open_request / SYN-cookie handling for LISTEN
 * sockets, generic state processing otherwise.  Bad segments get a RST.
 */
1611 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1613 #ifdef CONFIG_FILTER
/* Socket filter (BPF) may veto the packet before any TCP work. */
1614 struct sk_filter
*filter
= sk
->filter
;
1615 if (filter
&& sk_filter(skb
, filter
))
1617 #endif /* CONFIG_FILTER */
1620 * This doesn't check if the socket has enough room for the packet.
1621 * Either process the packet _without_ queueing it and then free it,
1622 * or do the check later.
1624 skb_set_owner_r(skb
, sk
);
1626 if (sk
->state
== TCP_ESTABLISHED
) { /* Fast path */
1627 /* Ready to move deeper ... */
1628 if (tcp_csum_verify(skb
))
1630 if (tcp_rcv_established(sk
, skb
, skb
->h
.th
, skb
->len
))
/* Slow path: verify checksum before any state processing. */
1635 if (tcp_csum_verify(skb
))
1638 if (sk
->state
== TCP_LISTEN
) {
1641 nsk
= tcp_v4_hnd_req(sk
, skb
);
1646 * Queue it on the new socket if the new socket is active,
1647 * otherwise we just shortcircuit this and continue with
/* Handshake produced a child socket: process the segment against it. */
1652 int state
= nsk
->state
;
1656 BUG_TRAP(nsk
->lock
.users
== 0);
1657 skb_set_owner_r(skb
, nsk
);
1658 ret
= tcp_rcv_state_process(nsk
, skb
, skb
->h
.th
, skb
->len
);
1660 /* Wakeup parent, send SIGIO, if this packet changed
1661 socket state from SYN-RECV.
1663 It still looks ugly, however it is much better
1664 than miracleous double wakeup in syn_recv_sock()
1665 and tcp_rcv_state_process().
1667 if (state
== TCP_SYN_RECV
&& nsk
->state
!= state
)
1668 sk
->data_ready(sk
, 0);
1670 bh_unlock_sock(nsk
);
1677 if (tcp_rcv_state_process(sk
, skb
, skb
->h
.th
, skb
->len
))
/* Invalid segment for this socket: answer with a reset. */
1682 tcp_v4_send_reset(skb
);
1685 /* Be careful here. If this function gets more complicated and
1686 * gcc suffers from register pressure on the x86, sk (in %ebx)
1687 * might be destroyed here. This current version compiles correctly,
1688 * but you have been warned.
1693 tcp_statistics
.TcpInErrs
++;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_rcv(): entry point from the IP layer.  Parses the header into
 * the skb control block, looks up the owning socket by 4-tuple, applies
 * ipsec policy, and either processes immediately (socket unlocked),
 * backlogs (socket in use by a process), sends a RST (no socket), or
 * runs the TIME_WAIT state machine.
 */
1701 int tcp_v4_rcv(struct sk_buff
*skb
, unsigned short len
)
/* Only segments addressed to this host are processed. */
1707 if (skb
->pkt_type
!=PACKET_HOST
)
1712 /* Pull up the IP header. */
1713 __skb_pull(skb
, skb
->h
.raw
- skb
->data
);
1715 /* Count it even if it's bad */
1716 tcp_statistics
.TcpInSegs
++;
1718 if (len
< sizeof(struct tcphdr
))
/* Pre-decode sequence numbers once into the skb control block. */
1721 TCP_SKB_CB(skb
)->seq
= ntohl(th
->seq
);
1722 TCP_SKB_CB(skb
)->end_seq
= (TCP_SKB_CB(skb
)->seq
+ th
->syn
+ th
->fin
+
1724 TCP_SKB_CB(skb
)->ack_seq
= ntohl(th
->ack_seq
);
1725 TCP_SKB_CB(skb
)->when
= 0;
1728 sk
= __tcp_v4_lookup(skb
->nh
.iph
->saddr
, th
->source
,
1729 skb
->nh
.iph
->daddr
, ntohs(th
->dest
), skb
->dev
->ifindex
);
1735 if(!ipsec_sk_policy(sk
,skb
))
1736 goto discard_and_relse
;
1738 if (sk
->state
== TCP_TIME_WAIT
)
/* Process now if no user holds the socket, else queue on backlog. */
1743 if (!sk
->lock
.users
)
1744 ret
= tcp_v4_do_rcv(sk
, skb
);
1746 sk_add_backlog(sk
, skb
);
/* No owning socket: verify checksum, then reply with a reset. */
1754 if (tcp_csum_verify(skb
)) {
1756 tcp_statistics
.TcpInErrs
++;
1758 tcp_v4_send_reset(skb
);
1762 /* Discard frame. */
/* TIME_WAIT handling starts here. */
1771 if (tcp_csum_verify(skb
)) {
1772 tcp_statistics
.TcpInErrs
++;
1773 goto discard_and_relse
;
1775 switch(tcp_timewait_state_process((struct tcp_tw_bucket
*)sk
,
1776 skb
, th
, skb
->len
)) {
/* TW recycling: a new SYN may legitimately reuse the old 4-tuple. */
1781 sk2
= tcp_v4_lookup_listener(skb
->nh
.iph
->daddr
, ntohs(th
->dest
), skb
->dev
->ifindex
);
1783 tcp_tw_deschedule((struct tcp_tw_bucket
*)sk
);
1784 tcp_timewait_kill((struct tcp_tw_bucket
*)sk
);
1785 tcp_tw_put((struct tcp_tw_bucket
*)sk
);
1789 /* Fall through to ACK */
1792 tcp_v4_timewait_ack(sk
, skb
);
1796 case TCP_TW_SUCCESS
:
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * __tcp_v4_rehash(): move a socket to its new established-hash bucket
 * after its identity (source address) changed.  Unlinks from the old
 * chain, then links at the head of the new chain, holding the two
 * bucket rwlocks in sequence with BH disabled across the whole move.
 */
1801 static void __tcp_v4_rehash(struct sock
*sk
)
1803 struct tcp_ehash_bucket
*oldhead
= &tcp_ehash
[sk
->hashent
];
/* Recompute the hash slot; assignment to sk->hashent is intentional. */
1804 struct tcp_ehash_bucket
*head
= &tcp_ehash
[(sk
->hashent
= tcp_sk_hashfn(sk
))];
1805 struct sock
**skp
= &head
->chain
;
1807 write_lock_bh(&oldhead
->lock
);
/* Unlink from the old doubly-linked pprev/next chain. */
1810 sk
->next
->pprev
= sk
->pprev
;
1811 *sk
->pprev
= sk
->next
;
/* BH stays disabled between the unlock of old and lock of new bucket. */
1814 write_unlock(&oldhead
->lock
);
1815 write_lock(&head
->lock
);
1816 if((sk
->next
= *skp
) != NULL
)
1817 (*skp
)->pprev
= &sk
->next
;
1820 write_unlock_bh(&head
->lock
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_rebuild_header(): refresh the cached route before transmit.
 * With ip_dynaddr set and the socket still in SYN_SENT, the source
 * address may be rewritten to follow a dynamic-IP change, which forces
 * a rehash of the socket.  Obsolete routes are re-resolved in place.
 */
1823 int tcp_v4_rebuild_header(struct sock
*sk
)
1825 struct rtable
*rt
= (struct rtable
*)__sk_dst_get(sk
);
1827 int want_rewrite
= sysctl_ip_dynaddr
&& sk
->state
== TCP_SYN_SENT
;
1832 /* Force route checking if want_rewrite.
1833 * The idea is good, the implementation is disguisting.
1834 * Well, if I made bind on this socket, you cannot randomly ovewrite
1835 * its source address. --ANK
1839 struct rtable
*new_rt
;
1840 __u32 old_saddr
= rt
->rt_src
;
1842 /* Query new route using another rt buffer */
1843 tmp
= ip_route_connect(&new_rt
, rt
->rt_dst
, 0,
1844 RT_TOS(sk
->protinfo
.af_inet
.tos
)|sk
->localroute
,
1847 /* Only useful if different source addrs */
1850 * Only useful if different source addrs
1852 if (new_rt
->rt_src
!= old_saddr
) {
1853 __sk_dst_set(sk
, &new_rt
->u
.dst
);
/* Same source address: the new route is redundant, drop the ref. */
1857 dst_release(&new_rt
->u
.dst
);
1860 if (rt
->u
.dst
.obsolete
) {
1862 err
= ip_route_output(&rt
, rt
->rt_dst
, rt
->rt_src
, rt
->key
.tos
|RTO_CONN
, rt
->key
.oif
);
/* Route lookup failed: surface the error to the socket owner. */
1865 sk
->error_report(sk
);
1868 __sk_dst_set(sk
, &rt
->u
.dst
);
1874 new_saddr
= rt
->rt_src
;
1876 /* Ouch!, this should not happen. */
1877 if (!sk
->saddr
|| !sk
->rcv_saddr
) {
1878 printk(KERN_WARNING
"tcp_v4_rebuild_header(): not valid sock addrs: "
1879 "saddr=%08lX rcv_saddr=%08lX\n",
1881 ntohl(sk
->rcv_saddr
));
1885 if (new_saddr
!= sk
->saddr
) {
1886 if (sysctl_ip_dynaddr
> 1) {
1887 printk(KERN_INFO
"tcp_v4_rebuild_header(): shifting sk->saddr "
1888 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1890 NIPQUAD(new_saddr
));
1893 sk
->saddr
= new_saddr
;
1894 sk
->rcv_saddr
= new_saddr
;
1896 /* XXX The only one ugly spot where we need to
1897 * XXX really change the sockets identity after
1898 * XXX it has entered the hashes. -DaveM
1900 * Besides that, it does not check for connetion
1901 * uniqueness. Wait for troubles.
1903 __tcp_v4_rehash(sk
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * v4_addr2sockaddr(): fill a sockaddr_in with the socket's peer
 * address and port (both already in network byte order on the sock).
 */
1909 static void v4_addr2sockaddr(struct sock
*sk
, struct sockaddr
* uaddr
)
1911 struct sockaddr_in
*sin
= (struct sockaddr_in
*) uaddr
;
1913 sin
->sin_family
= AF_INET
;
1914 sin
->sin_addr
.s_addr
= sk
->daddr
;
1915 sin
->sin_port
= sk
->dport
;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_func ops table wiring the af-independent TCP code to the IPv4
 * implementations in this file.  NOTE(review): several initializer
 * lines (original 1919-1920, 1926-1929) were lost in extraction.
 */
1918 struct tcp_func ipv4_specific
= {
1921 tcp_v4_rebuild_header
,
1922 tcp_v4_conn_request
,
1923 tcp_v4_syn_recv_sock
,
1924 tcp_v4_hash_connecting
,
1925 sizeof(struct iphdr
),
1930 sizeof(struct sockaddr_in
)
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1933 /* NOTE: A lot of things set to zero explicitly by call to
1934 * sk_alloc() so need not be done here.
/*
 * tcp_v4_init_sock(): per-socket TCP state initialisation at socket
 * creation: queues, retransmit timers, RTO defaults, congestion-control
 * seed values, and the IPv4 af_specific ops table.
 */
1936 static int tcp_v4_init_sock(struct sock
*sk
)
1938 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1940 skb_queue_head_init(&tp
->out_of_order_queue
);
1941 tcp_init_xmit_timers(sk
);
1943 tp
->rto
= TCP_TIMEOUT_INIT
;
1944 tp
->mdev
= TCP_TIMEOUT_INIT
;
1946 /* So many TCP implementations out there (incorrectly) count the
1947 * initial SYN frame in their delayed-ACK and congestion control
1948 * algorithms that we must have the following bandaid to talk
1949 * efficiently to them. -DaveM
1953 /* See draft-stevens-tcpca-spec-01 for discussion of the
1954 * initialization of these values.
1956 tp
->snd_cwnd_cnt
= 0;
1957 tp
->snd_ssthresh
= 0x7fffffff; /* Infinity */
1958 tp
->snd_cwnd_clamp
= ~0;
/* Conservative pre-negotiation MSS (RFC 1122 default of 536). */
1959 tp
->mss_cache
= 536;
1961 sk
->state
= TCP_CLOSE
;
1962 sk
->max_ack_backlog
= SOMAXCONN
;
1964 sk
->write_space
= tcp_write_space
;
1966 /* Init SYN queue. */
1969 sk
->tp_pinfo
.af_tcp
.af_specific
= &ipv4_specific
;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_v4_destroy_sock(): teardown counterpart of tcp_v4_init_sock —
 * stop timers, purge pending queues, and release any bind bucket still
 * referenced by a never-connected socket.
 */
1974 static int tcp_v4_destroy_sock(struct sock
*sk
)
1976 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
1978 tcp_clear_xmit_timers(sk
);
1980 /* Cleanup up the write buffer. */
1981 __skb_queue_purge(&sk
->write_queue
);
1983 /* Cleans up our, hopefuly empty, out_of_order_queue. */
1984 __skb_queue_purge(&tp
->out_of_order_queue
);
1986 /* Clean up a referenced TCP bind bucket, this only happens if a
1987 * port is allocated for a socket, but it never fully connects.
1989 if(sk
->prev
!= NULL
)
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
1995 /* Proc filesystem TCP sock list dumping. */
/*
 * get_openreq(): format one SYN_RECV open_request as a /proc/net/tcp
 * row into tmpbuf; i is the running row index.
 */
1996 static void get_openreq(struct sock
*sk
, struct open_request
*req
, char *tmpbuf
, int i
)
1998 sprintf(tmpbuf
, "%4d: %08lX:%04X %08lX:%04X"
1999 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2001 (long unsigned int)req
->af
.v4_req
.loc_addr
,
2003 (long unsigned int)req
->af
.v4_req
.rmt_addr
,
2004 ntohs(req
->rmt_port
),
2006 0,0, /* could print option size, but that is af dependent. */
2007 1, /* timers active (only the expire timer) */
2008 (unsigned long)(req
->expires
- jiffies
),
/* uid of the owning listener, 0 when detached from a struct socket. */
2010 sk
->socket
? sk
->socket
->inode
->i_uid
: 0,
2011 0, /* non standard timer */
2012 0, /* open_requests have no inode */
2013 atomic_read(&sk
->refcnt
),
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * get_tcp_sock(): format one established/listening TCP socket as a
 * /proc/net/tcp row into tmpbuf.  Reports whichever of the retransmit
 * or generic socket timer fires soonest.
 */
2018 static void get_tcp_sock(struct sock
*sp
, char *tmpbuf
, int i
)
2020 unsigned int dest
, src
;
2022 int timer_active
, timer_active1
, timer_active2
;
2023 unsigned long timer_expires
;
2024 struct tcp_opt
*tp
= &sp
->tp_pinfo
.af_tcp
;
2027 src
= sp
->rcv_saddr
;
2028 destp
= ntohs(sp
->dport
);
2029 srcp
= ntohs(sp
->sport
);
/* A timer is "active" when it is linked into the timer list. */
2030 timer_active1
= tp
->retransmit_timer
.prev
!= NULL
;
2031 timer_active2
= sp
->timer
.prev
!= NULL
;
2033 timer_expires
= (unsigned) -1;
/* Pick the earlier of the two pending expiries.
 * NOTE(review): the timer_active assignments live in lines lost in
 * extraction — verify against the full original. */
2034 if (timer_active1
&& tp
->retransmit_timer
.expires
< timer_expires
) {
2036 timer_expires
= tp
->retransmit_timer
.expires
;
2038 if (timer_active2
&& sp
->timer
.expires
< timer_expires
) {
2040 timer_expires
= sp
->timer
.expires
;
2042 if(timer_active
== 0)
2043 timer_expires
= jiffies
;
2045 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
2046 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
2047 i
, src
, srcp
, dest
, destp
, sp
->state
,
/* tx_queue and rx_queue columns: unsent bytes and unread bytes. */
2048 tp
->write_seq
-tp
->snd_una
, tp
->rcv_nxt
-tp
->copied_seq
,
2049 timer_active
, timer_expires
-jiffies
,
2051 sp
->socket
? sp
->socket
->inode
->i_uid
: 0,
2053 sp
->socket
? sp
->socket
->inode
->i_ino
: 0,
2054 atomic_read(&sp
->refcnt
), sp
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * get_timewait_sock(): format one TIME_WAIT bucket as a /proc/net/tcp
 * row.  The remaining lifetime is derived from the distance between the
 * bucket's death slot and the current kill-cycle slot, wrapping modulo
 * TCP_TWKILL_SLOTS.
 */
2057 static void get_timewait_sock(struct tcp_tw_bucket
*tw
, char *tmpbuf
, int i
)
2059 unsigned int dest
, src
;
2064 src
= tw
->rcv_saddr
;
2065 destp
= ntohs(tw
->dport
);
2066 srcp
= ntohs(tw
->sport
);
2068 slot_dist
= tw
->death_slot
;
/* Wrap-around: the death slot may be behind the current slot. */
2069 if(slot_dist
> tcp_tw_death_row_slot
)
2070 slot_dist
= (TCP_TWKILL_SLOTS
- slot_dist
) + tcp_tw_death_row_slot
;
2072 slot_dist
= tcp_tw_death_row_slot
- slot_dist
;
2074 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
2075 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2076 i
, src
, srcp
, dest
, destp
, TCP_TIME_WAIT
, 0, 0,
2077 3, slot_dist
* TCP_TWKILL_PERIOD
, 0, 0, 0, 0,
2078 atomic_read(&tw
->refcnt
), tw
);
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * tcp_get_info(): /proc read handler producing the /proc/net/tcp table.
 * Walks, in order: the listening hash (including each listener's SYN
 * queue), the established hash, and the TIME_WAIT half of the ehash.
 * Honours the offset/length windowing protocol of old-style proc reads
 * via pos/begin/*start.
 */
2081 int tcp_get_info(char *buffer
, char **start
, off_t offset
, int length
, int dummy
)
2083 int len
= 0, num
= 0, i
;
2084 off_t begin
, pos
= 0;
2088 len
+= sprintf(buffer
, "%-127s\n",
2089 " sl local_address rem_address st tx_queue "
2090 "rx_queue tr tm->when retrnsmt uid timeout inode");
2094 /* First, walk listening socket table. */
2096 for(i
= 0; i
< TCP_LHTABLE_SIZE
; i
++) {
2097 struct sock
*sk
= tcp_listening_hash
[i
];
2099 for (sk
= tcp_listening_hash
[i
]; sk
; sk
= sk
->next
, num
++) {
2100 struct open_request
*req
;
2101 struct tcp_opt
*tp
= &(sk
->tp_pinfo
.af_tcp
);
/* Skip IPv6-mapped entries in a shared table. */
2103 if (!TCP_INET_FAMILY(sk
->family
))
2107 if (pos
>= offset
) {
2108 get_tcp_sock(sk
, tmpbuf
, num
);
2109 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
/* Output window full: unlock and bail out early. */
2110 if (len
>= length
) {
2111 tcp_listen_unlock();
/* Also dump this listener's pending open_requests. */
2118 for (req
= tp
->syn_wait_queue
; req
; req
= req
->dl_next
, num
++) {
2121 if (!TCP_INET_FAMILY(req
->class->family
))
2127 get_openreq(sk
, req
, tmpbuf
, num
);
2128 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
2130 tcp_listen_unlock();
2138 tcp_listen_unlock();
2142 /* Next, walk established hash chain. */
2143 for (i
= 0; i
< tcp_ehash_size
; i
++) {
2144 struct tcp_ehash_bucket
*head
= &tcp_ehash
[i
];
2146 struct tcp_tw_bucket
*tw
;
2148 read_lock(&head
->lock
);
2149 for(sk
= head
->chain
; sk
; sk
= sk
->next
, num
++) {
2150 if (!TCP_INET_FAMILY(sk
->family
))
2155 get_tcp_sock(sk
, tmpbuf
, num
);
2156 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
2158 read_unlock(&head
->lock
);
/* TIME_WAIT buckets live in the upper half of the ehash table. */
2162 for (tw
= (struct tcp_tw_bucket
*)tcp_ehash
[i
+tcp_ehash_size
].chain
;
2164 tw
= (struct tcp_tw_bucket
*)tw
->next
, num
++) {
2165 if (!TCP_INET_FAMILY(tw
->family
))
2170 get_timewait_sock(tw
, tmpbuf
, num
);
2171 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
2173 read_unlock(&head
->lock
);
2177 read_unlock(&head
->lock
);
/* Translate absolute position into the caller's buffer window. */
2184 begin
= len
- (pos
- offset
);
2185 *start
= buffer
+ begin
;
/* NOTE(review): extraction-damaged fragments; code kept byte-identical. */
/*
 * The TCP proto ops table registered with the INET socket layer
 * (positional initializer; the trailing-comment labels name each slot).
 * NOTE(review): a few initializer lines (original 2211, 2218-2219,
 * 2221-2223) were lost in extraction.
 */
2194 struct proto tcp_prot
= {
2195 tcp_close
, /* close */
2196 tcp_v4_connect
, /* connect */
2197 tcp_disconnect
, /* disconnect */
2198 tcp_accept
, /* accept */
2199 NULL
, /* retransmit */
2200 tcp_write_wakeup
, /* write_wakeup */
2201 tcp_read_wakeup
, /* read_wakeup */
2202 tcp_poll
, /* poll */
2203 tcp_ioctl
, /* ioctl */
2204 tcp_v4_init_sock
, /* init */
2205 tcp_v4_destroy_sock
, /* destroy */
2206 tcp_shutdown
, /* shutdown */
2207 tcp_setsockopt
, /* setsockopt */
2208 tcp_getsockopt
, /* getsockopt */
2209 tcp_v4_sendmsg
, /* sendmsg */
2210 tcp_recvmsg
, /* recvmsg */
2212 tcp_v4_do_rcv
, /* backlog_rcv */
2213 tcp_v4_hash
, /* hash */
2214 tcp_unhash
, /* unhash */
2215 tcp_v4_get_port
, /* get_port */
2216 128, /* max_header */
2217 0, /* retransmits */
2220 0 /* highestinuse */
2225 void __init
tcp_v4_init(struct net_proto_family
*ops
)
2229 tcp_inode
.i_mode
= S_IFSOCK
;
2230 tcp_inode
.i_sock
= 1;
2231 tcp_inode
.i_uid
= 0;
2232 tcp_inode
.i_gid
= 0;
2233 init_waitqueue_head(&tcp_inode
.i_wait
);
2234 init_waitqueue_head(&tcp_inode
.u
.socket_i
.wait
);
2236 tcp_socket
->inode
= &tcp_inode
;
2237 tcp_socket
->state
= SS_UNCONNECTED
;
2238 tcp_socket
->type
=SOCK_RAW
;
2240 if ((err
=ops
->create(tcp_socket
, IPPROTO_TCP
))<0)
2241 panic("Failed to create the TCP control socket.\n");
2242 tcp_socket
->sk
->allocation
=GFP_ATOMIC
;
2243 tcp_socket
->sk
->protinfo
.af_inet
.ttl
= MAXTTL
;
2245 /* Unhash it so that IP input processing does not even
2246 * see it, we do not wish this socket to see incoming
2249 tcp_socket
->sk
->prot
->unhash(tcp_socket
->sk
);