Linux-2.6.12-rc2
net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * open_request handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * to a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
66 #include <net/icmp.h>
67 #include <net/tcp.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/xfrm.h>
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
78 extern int sysctl_ip_dynaddr;
79 int sysctl_tcp_tw_reuse;
80 int sysctl_tcp_low_latency;
82 /* Check TCP sequence numbers in ICMP packets. */
83 #define ICMP_MIN_LENGTH 8
85 /* Socket used for sending RSTs */
86 static struct socket *tcp_socket;
88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89 struct sk_buff *skb);
91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
93 .__tcp_lhash_users = ATOMIC_INIT(0),
94 .__tcp_lhash_wait
95 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
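/* Note on tuning (illustrative, assuming the standard sysctl plumbing for
 * this array): the two values above are exported as
 * net.ipv4.ip_local_port_range, so a high-usage box is normally adjusted
 * from userspace with something like
 *	sysctl -w net.ipv4.ip_local_port_range="32768 61000"
 * tcp_port_rover just remembers where the last ephemeral-port search
 * stopped so the next one resumes after it.
 */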
107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 __u32 faddr, __u16 fport)
110 int h = (laddr ^ lport) ^ (faddr ^ fport);
111 h ^= h >> 16;
112 h ^= h >> 8;
113 return h & (tcp_ehash_size - 1);
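/* Rough idea of the hash above: XOR the local and foreign address/port
 * words together, then fold the upper bits down (h >> 16, then h >> 8) so
 * that differences anywhere in the 32-bit mix influence the low bits that
 * survive the final mask.  The mask relies on tcp_ehash_size being a
 * power of two for an even spread.
 */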
116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
118 struct inet_sock *inet = inet_sk(sk);
119 __u32 laddr = inet->rcv_saddr;
120 __u16 lport = inet->num;
121 __u32 faddr = inet->daddr;
122 __u16 fport = inet->dport;
124 return tcp_hashfn(laddr, lport, faddr, fport);
127 /* Allocate and initialize a new TCP local port bind bucket.
128 * The bindhash mutex for snum's hash chain must be held here.
130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131 unsigned short snum)
133 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134 SLAB_ATOMIC);
135 if (tb) {
136 tb->port = snum;
137 tb->fastreuse = 0;
138 INIT_HLIST_HEAD(&tb->owners);
139 hlist_add_head(&tb->node, &head->chain);
141 return tb;
144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
147 if (hlist_empty(&tb->owners)) {
148 __hlist_del(&tb->node);
149 kmem_cache_free(tcp_bucket_cachep, tb);
153 /* Caller must disable local BH processing. */
154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
156 struct tcp_bind_hashbucket *head =
157 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 struct tcp_bind_bucket *tb;
160 spin_lock(&head->lock);
161 tb = tcp_sk(sk)->bind_hash;
162 sk_add_bind_node(child, &tb->owners);
163 tcp_sk(child)->bind_hash = tb;
164 spin_unlock(&head->lock);
167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
169 local_bh_disable();
170 __tcp_inherit_port(sk, child);
171 local_bh_enable();
174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175 unsigned short snum)
177 inet_sk(sk)->num = snum;
178 sk_add_bind_node(sk, &tb->owners);
179 tcp_sk(sk)->bind_hash = tb;
182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
184 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185 struct sock *sk2;
186 struct hlist_node *node;
187 int reuse = sk->sk_reuse;
189 sk_for_each_bound(sk2, node, &tb->owners) {
190 if (sk != sk2 &&
191 !tcp_v6_ipv6only(sk2) &&
192 (!sk->sk_bound_dev_if ||
193 !sk2->sk_bound_dev_if ||
194 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 if (!reuse || !sk2->sk_reuse ||
196 sk2->sk_state == TCP_LISTEN) {
197 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 sk2_rcv_saddr == sk_rcv_saddr)
200 break;
204 return node != NULL;
207 /* Obtain a reference to a local port for the given sock;
208 * if snum is zero it means select any available local port.
210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
212 struct tcp_bind_hashbucket *head;
213 struct hlist_node *node;
214 struct tcp_bind_bucket *tb;
215 int ret;
217 local_bh_disable();
218 if (!snum) {
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
222 int rover;
224 spin_lock(&tcp_portalloc_lock);
225 rover = tcp_port_rover;
226 do {
227 rover++;
228 if (rover < low || rover > high)
229 rover = low;
230 head = &tcp_bhash[tcp_bhashfn(rover)];
231 spin_lock(&head->lock);
232 tb_for_each(tb, node, &head->chain)
233 if (tb->port == rover)
234 goto next;
235 break;
236 next:
237 spin_unlock(&head->lock);
238 } while (--remaining > 0);
239 tcp_port_rover = rover;
240 spin_unlock(&tcp_portalloc_lock);
242 /* Exhausted local port range during search? */
243 ret = 1;
244 if (remaining <= 0)
245 goto fail;
247 /* OK, here is the one we will use. HEAD is
248 * non-NULL and we hold its mutex.
250 snum = rover;
251 } else {
252 head = &tcp_bhash[tcp_bhashfn(snum)];
253 spin_lock(&head->lock);
254 tb_for_each(tb, node, &head->chain)
255 if (tb->port == snum)
256 goto tb_found;
258 tb = NULL;
259 goto tb_not_found;
260 tb_found:
261 if (!hlist_empty(&tb->owners)) {
262 if (sk->sk_reuse > 1)
263 goto success;
264 if (tb->fastreuse > 0 &&
265 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
266 goto success;
267 } else {
268 ret = 1;
269 if (tcp_bind_conflict(sk, tb))
270 goto fail_unlock;
273 tb_not_found:
274 ret = 1;
275 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
276 goto fail_unlock;
277 if (hlist_empty(&tb->owners)) {
278 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
279 tb->fastreuse = 1;
280 else
281 tb->fastreuse = 0;
282 } else if (tb->fastreuse &&
283 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
284 tb->fastreuse = 0;
285 success:
286 if (!tcp_sk(sk)->bind_hash)
287 tcp_bind_hash(sk, tb, snum);
288 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
289 ret = 0;
291 fail_unlock:
292 spin_unlock(&head->lock);
293 fail:
294 local_bh_enable();
295 return ret;
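/* Roughly how this is reached (sketch of the usual call path): a
 * bind(fd, {sin_port = 0}) arrives here with snum == 0 and the rover walk
 * above picks the next free port inside sysctl_local_port_range, while an
 * explicit bind() with a nonzero port takes the snum != 0 branch and only
 * has to resolve reuse conflicts for that one bucket.  A connect() on an
 * unbound socket does not come through here; it uses
 * tcp_v4_hash_connect() further down instead.
 */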
298 /* Get rid of any references to a local port held by the
299 * given sock.
301 static void __tcp_put_port(struct sock *sk)
303 struct inet_sock *inet = inet_sk(sk);
304 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
305 struct tcp_bind_bucket *tb;
307 spin_lock(&head->lock);
308 tb = tcp_sk(sk)->bind_hash;
309 __sk_del_bind_node(sk);
310 tcp_sk(sk)->bind_hash = NULL;
311 inet->num = 0;
312 tcp_bucket_destroy(tb);
313 spin_unlock(&head->lock);
316 void tcp_put_port(struct sock *sk)
318 local_bh_disable();
319 __tcp_put_port(sk);
320 local_bh_enable();
323 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and can be very bad on SMP.
324 * Look, when several writers sleep and a reader wakes them up, all but one
325 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
326 * this, _but_ remember, it adds useless work on UP machines (a wake up on each
327 * exclusive lock release). It should really be ifdefed.
330 void tcp_listen_wlock(void)
332 write_lock(&tcp_lhash_lock);
334 if (atomic_read(&tcp_lhash_users)) {
335 DEFINE_WAIT(wait);
337 for (;;) {
338 prepare_to_wait_exclusive(&tcp_lhash_wait,
339 &wait, TASK_UNINTERRUPTIBLE);
340 if (!atomic_read(&tcp_lhash_users))
341 break;
342 write_unlock_bh(&tcp_lhash_lock);
343 schedule();
344 write_lock_bh(&tcp_lhash_lock);
347 finish_wait(&tcp_lhash_wait, &wait);
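/* In other words: the writer drops and retakes the lock around schedule()
 * so the lockless readers counted in tcp_lhash_users can drain, and the
 * exclusive wait means a wake_up() releases only one queued writer at a
 * time, which is exactly the SMP trade-off described above.
 */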
351 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
353 struct hlist_head *list;
354 rwlock_t *lock;
356 BUG_TRAP(sk_unhashed(sk));
357 if (listen_possible && sk->sk_state == TCP_LISTEN) {
358 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
359 lock = &tcp_lhash_lock;
360 tcp_listen_wlock();
361 } else {
362 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
363 lock = &tcp_ehash[sk->sk_hashent].lock;
364 write_lock(lock);
366 __sk_add_node(sk, list);
367 sock_prot_inc_use(sk->sk_prot);
368 write_unlock(lock);
369 if (listen_possible && sk->sk_state == TCP_LISTEN)
370 wake_up(&tcp_lhash_wait);
373 static void tcp_v4_hash(struct sock *sk)
375 if (sk->sk_state != TCP_CLOSE) {
376 local_bh_disable();
377 __tcp_v4_hash(sk, 1);
378 local_bh_enable();
382 void tcp_unhash(struct sock *sk)
384 rwlock_t *lock;
386 if (sk_unhashed(sk))
387 goto ende;
389 if (sk->sk_state == TCP_LISTEN) {
390 local_bh_disable();
391 tcp_listen_wlock();
392 lock = &tcp_lhash_lock;
393 } else {
394 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
395 lock = &head->lock;
396 write_lock_bh(&head->lock);
399 if (__sk_del_node_init(sk))
400 sock_prot_dec_use(sk->sk_prot);
401 write_unlock_bh(lock);
403 ende:
404 if (sk->sk_state == TCP_LISTEN)
405 wake_up(&tcp_lhash_wait);
408 /* Don't inline this cruft. There are some nice properties to
409 * exploit here. The BSD API does not allow a listening TCP
410 * to specify the remote port nor the remote address for the
411 * connection. So always assume those are both wildcarded
412 * during the search since they can never be otherwise.
414 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
415 unsigned short hnum, int dif)
417 struct sock *result = NULL, *sk;
418 struct hlist_node *node;
419 int score, hiscore;
421 hiscore=-1;
422 sk_for_each(sk, node, head) {
423 struct inet_sock *inet = inet_sk(sk);
425 if (inet->num == hnum && !ipv6_only_sock(sk)) {
426 __u32 rcv_saddr = inet->rcv_saddr;
428 score = (sk->sk_family == PF_INET ? 1 : 0);
429 if (rcv_saddr) {
430 if (rcv_saddr != daddr)
431 continue;
432 score+=2;
434 if (sk->sk_bound_dev_if) {
435 if (sk->sk_bound_dev_if != dif)
436 continue;
437 score+=2;
439 if (score == 5)
440 return sk;
441 if (score > hiscore) {
442 hiscore = score;
443 result = sk;
447 return result;
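/* Scoring recap: an AF_INET listener starts at 1 (an IPv6 listener that
 * also accepts IPv4 starts at 0), a matching rcv_saddr adds 2 and a
 * matching bound device adds 2, so 5 is a fully specified match and the
 * loop returns early; otherwise the most specific listener seen wins.
 */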
450 /* Optimize the common listener case. */
451 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
452 unsigned short hnum, int dif)
454 struct sock *sk = NULL;
455 struct hlist_head *head;
457 read_lock(&tcp_lhash_lock);
458 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
459 if (!hlist_empty(head)) {
460 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
462 if (inet->num == hnum && !sk->sk_node.next &&
463 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
464 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
465 !sk->sk_bound_dev_if)
466 goto sherry_cache;
467 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
469 if (sk) {
470 sherry_cache:
471 sock_hold(sk);
473 read_unlock(&tcp_lhash_lock);
474 return sk;
477 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
478 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
480 * Local BH must be disabled here.
483 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
484 u32 daddr, u16 hnum,
485 int dif)
487 struct tcp_ehash_bucket *head;
488 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
489 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
490 struct sock *sk;
491 struct hlist_node *node;
492 /* Optimize here for a direct hit; only listening connections can
493 * have wildcards anyway.
495 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
496 head = &tcp_ehash[hash];
497 read_lock(&head->lock);
498 sk_for_each(sk, node, &head->chain) {
499 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
500 goto hit; /* You sunk my battleship! */
503 /* Must check for a TIME_WAIT'er before going to listener hash. */
504 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
505 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
506 goto hit;
508 sk = NULL;
509 out:
510 read_unlock(&head->lock);
511 return sk;
512 hit:
513 sock_hold(sk);
514 goto out;
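/* Layout reminder: the established hash table has two halves of
 * tcp_ehash_size buckets each.  Chain i holds full sockets and chain
 * i + tcp_ehash_size holds the TIME_WAIT buckets for the same hash, which
 * is why the second walk above indexes (head + tcp_ehash_size)->chain.
 */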
517 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
518 u32 daddr, u16 hnum, int dif)
520 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
521 daddr, hnum, dif);
523 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
526 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
527 u16 dport, int dif)
529 struct sock *sk;
531 local_bh_disable();
532 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
533 local_bh_enable();
535 return sk;
538 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
540 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
542 return secure_tcp_sequence_number(skb->nh.iph->daddr,
543 skb->nh.iph->saddr,
544 skb->h.th->dest,
545 skb->h.th->source);
548 /* called with local bh disabled */
549 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
550 struct tcp_tw_bucket **twp)
552 struct inet_sock *inet = inet_sk(sk);
553 u32 daddr = inet->rcv_saddr;
554 u32 saddr = inet->daddr;
555 int dif = sk->sk_bound_dev_if;
556 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
557 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
558 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
559 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
560 struct sock *sk2;
561 struct hlist_node *node;
562 struct tcp_tw_bucket *tw;
564 write_lock(&head->lock);
566 /* Check TIME-WAIT sockets first. */
567 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
568 tw = (struct tcp_tw_bucket *)sk2;
570 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
571 struct tcp_sock *tp = tcp_sk(sk);
573 /* With PAWS, it is safe from the viewpoint
574 of data integrity. Even without PAWS it
575 is safe provided sequence spaces do not
576 overlap i.e. at data rates <= 80Mbit/sec.
578 Actually, the idea is close to VJ's, only the
579 timestamp cache is held not per host but per
580 port pair, and the TW bucket is used as the
581 state holder.
583 If the TW bucket has already been destroyed we
584 fall back to VJ's scheme and use the initial
585 timestamp retrieved from the peer table.
587 if (tw->tw_ts_recent_stamp &&
588 (!twp || (sysctl_tcp_tw_reuse &&
589 xtime.tv_sec -
590 tw->tw_ts_recent_stamp > 1))) {
591 if ((tp->write_seq =
592 tw->tw_snd_nxt + 65535 + 2) == 0)
593 tp->write_seq = 1;
594 tp->rx_opt.ts_recent = tw->tw_ts_recent;
595 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
596 sock_hold(sk2);
597 goto unique;
598 } else
599 goto not_unique;
602 tw = NULL;
604 /* And established part... */
605 sk_for_each(sk2, node, &head->chain) {
606 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
607 goto not_unique;
610 unique:
611 /* Must record num and sport now. Otherwise we will see
612 * a socket with a funny identity in the hash table. */
613 inet->num = lport;
614 inet->sport = htons(lport);
615 sk->sk_hashent = hash;
616 BUG_TRAP(sk_unhashed(sk));
617 __sk_add_node(sk, &head->chain);
618 sock_prot_inc_use(sk->sk_prot);
619 write_unlock(&head->lock);
621 if (twp) {
622 *twp = tw;
623 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
624 } else if (tw) {
625 /* Silly. Should hash-dance instead... */
626 tcp_tw_deschedule(tw);
627 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
629 tcp_tw_put(tw);
632 return 0;
634 not_unique:
635 write_unlock(&head->lock);
636 return -EADDRNOTAVAIL;
639 static inline u32 connect_port_offset(const struct sock *sk)
641 const struct inet_sock *inet = inet_sk(sk);
643 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
644 inet->dport);
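/* The offset is a keyed hash of (local address, remote address, remote
 * port), so simultaneous connects to different peers start their
 * ephemeral-port search at different, hard-to-guess points, while
 * repeated connects to the same peer keep a stable offset (advanced only
 * by the global 'hint' below).
 */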
648 * Bind a port for a connect operation and hash it.
650 static inline int tcp_v4_hash_connect(struct sock *sk)
652 unsigned short snum = inet_sk(sk)->num;
653 struct tcp_bind_hashbucket *head;
654 struct tcp_bind_bucket *tb;
655 int ret;
657 if (!snum) {
658 int low = sysctl_local_port_range[0];
659 int high = sysctl_local_port_range[1];
660 int range = high - low;
661 int i;
662 int port;
663 static u32 hint;
664 u32 offset = hint + connect_port_offset(sk);
665 struct hlist_node *node;
666 struct tcp_tw_bucket *tw = NULL;
668 local_bh_disable();
669 for (i = 1; i <= range; i++) {
670 port = low + (i + offset) % range;
671 head = &tcp_bhash[tcp_bhashfn(port)];
672 spin_lock(&head->lock);
674 /* Does not bother with rcv_saddr checks,
675 * because the established check is already
676 * unique enough.
678 tb_for_each(tb, node, &head->chain) {
679 if (tb->port == port) {
680 BUG_TRAP(!hlist_empty(&tb->owners));
681 if (tb->fastreuse >= 0)
682 goto next_port;
683 if (!__tcp_v4_check_established(sk,
684 port,
685 &tw))
686 goto ok;
687 goto next_port;
691 tb = tcp_bucket_create(head, port);
692 if (!tb) {
693 spin_unlock(&head->lock);
694 break;
696 tb->fastreuse = -1;
697 goto ok;
699 next_port:
700 spin_unlock(&head->lock);
702 local_bh_enable();
704 return -EADDRNOTAVAIL;
707 hint += i;
709 /* Head lock still held and bh's disabled */
710 tcp_bind_hash(sk, tb, port);
711 if (sk_unhashed(sk)) {
712 inet_sk(sk)->sport = htons(port);
713 __tcp_v4_hash(sk, 0);
715 spin_unlock(&head->lock);
717 if (tw) {
718 tcp_tw_deschedule(tw);
719 tcp_tw_put(tw);
722 ret = 0;
723 goto out;
726 head = &tcp_bhash[tcp_bhashfn(snum)];
727 tb = tcp_sk(sk)->bind_hash;
728 spin_lock_bh(&head->lock);
729 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
730 __tcp_v4_hash(sk, 0);
731 spin_unlock_bh(&head->lock);
732 return 0;
733 } else {
734 spin_unlock(&head->lock);
735 /* No definite answer... Walk to established hash table */
736 ret = __tcp_v4_check_established(sk, snum, NULL);
737 out:
738 local_bh_enable();
739 return ret;
743 /* This will initiate an outgoing connection. */
744 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
746 struct inet_sock *inet = inet_sk(sk);
747 struct tcp_sock *tp = tcp_sk(sk);
748 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
749 struct rtable *rt;
750 u32 daddr, nexthop;
751 int tmp;
752 int err;
754 if (addr_len < sizeof(struct sockaddr_in))
755 return -EINVAL;
757 if (usin->sin_family != AF_INET)
758 return -EAFNOSUPPORT;
760 nexthop = daddr = usin->sin_addr.s_addr;
761 if (inet->opt && inet->opt->srr) {
762 if (!daddr)
763 return -EINVAL;
764 nexthop = inet->opt->faddr;
767 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
768 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
769 IPPROTO_TCP,
770 inet->sport, usin->sin_port, sk);
771 if (tmp < 0)
772 return tmp;
774 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
775 ip_rt_put(rt);
776 return -ENETUNREACH;
779 if (!inet->opt || !inet->opt->srr)
780 daddr = rt->rt_dst;
782 if (!inet->saddr)
783 inet->saddr = rt->rt_src;
784 inet->rcv_saddr = inet->saddr;
786 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
787 /* Reset inherited state */
788 tp->rx_opt.ts_recent = 0;
789 tp->rx_opt.ts_recent_stamp = 0;
790 tp->write_seq = 0;
793 if (sysctl_tcp_tw_recycle &&
794 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
795 struct inet_peer *peer = rt_get_peer(rt);
797 /* VJ's idea. We save the last timestamp seen from
798 * the destination in the peer table when entering TIME-WAIT state,
799 * and initialize rx_opt.ts_recent from it when trying a new connection.
802 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
803 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
804 tp->rx_opt.ts_recent = peer->tcp_ts;
808 inet->dport = usin->sin_port;
809 inet->daddr = daddr;
811 tp->ext_header_len = 0;
812 if (inet->opt)
813 tp->ext_header_len = inet->opt->optlen;
815 tp->rx_opt.mss_clamp = 536;
817 /* Socket identity is still unknown (sport may be zero).
818 * However, we set the state to SYN-SENT and, without releasing the socket
819 * lock, select a source port, enter ourselves into the hash tables and
820 * complete initialization after this.
822 tcp_set_state(sk, TCP_SYN_SENT);
823 err = tcp_v4_hash_connect(sk);
824 if (err)
825 goto failure;
827 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
828 if (err)
829 goto failure;
831 /* OK, now commit destination to socket. */
832 __sk_dst_set(sk, &rt->u.dst);
833 tcp_v4_setup_caps(sk, &rt->u.dst);
835 if (!tp->write_seq)
836 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
837 inet->daddr,
838 inet->sport,
839 usin->sin_port);
841 inet->id = tp->write_seq ^ jiffies;
843 err = tcp_connect(sk);
844 rt = NULL;
845 if (err)
846 goto failure;
848 return 0;
850 failure:
851 /* This unhashes the socket and releases the local port, if necessary. */
852 tcp_set_state(sk, TCP_CLOSE);
853 ip_rt_put(rt);
854 sk->sk_route_caps = 0;
855 inet->dport = 0;
856 return err;
859 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
861 return ((struct rtable *)skb->dst)->rt_iif;
864 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
866 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
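/* jhash_2words() keyed with the listener's private hash_rnd makes the
 * SYN-queue bucket choice unpredictable to remote senders; the final mask
 * assumes TCP_SYNQ_HSIZE is a power of two.
 */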
869 static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
870 struct open_request ***prevp,
871 __u16 rport,
872 __u32 raddr, __u32 laddr)
874 struct tcp_listen_opt *lopt = tp->listen_opt;
875 struct open_request *req, **prev;
877 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
878 (req = *prev) != NULL;
879 prev = &req->dl_next) {
880 if (req->rmt_port == rport &&
881 req->af.v4_req.rmt_addr == raddr &&
882 req->af.v4_req.loc_addr == laddr &&
883 TCP_INET_FAMILY(req->class->family)) {
884 BUG_TRAP(!req->sk);
885 *prevp = prev;
886 break;
890 return req;
893 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
895 struct tcp_sock *tp = tcp_sk(sk);
896 struct tcp_listen_opt *lopt = tp->listen_opt;
897 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
899 req->expires = jiffies + TCP_TIMEOUT_INIT;
900 req->retrans = 0;
901 req->sk = NULL;
902 req->dl_next = lopt->syn_table[h];
904 write_lock(&tp->syn_wait_lock);
905 lopt->syn_table[h] = req;
906 write_unlock(&tp->syn_wait_lock);
908 tcp_synq_added(sk);
913 * This routine does path mtu discovery as defined in RFC1191.
915 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
916 u32 mtu)
918 struct dst_entry *dst;
919 struct inet_sock *inet = inet_sk(sk);
920 struct tcp_sock *tp = tcp_sk(sk);
922 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
923 * sent out by Linux are always < 576 bytes, so they should go through
924 * unfragmented).
926 if (sk->sk_state == TCP_LISTEN)
927 return;
929 /* We don't check in the dst entry if pmtu discovery is forbidden
930 * on this route. We just assume that no packet-too-big packets
931 * are sent back when pmtu discovery is not active.
932 * There is a small race when the user changes this flag in the
933 * route, but I think that's acceptable.
935 if ((dst = __sk_dst_check(sk, 0)) == NULL)
936 return;
938 dst->ops->update_pmtu(dst, mtu);
940 /* Something is about to go wrong... Remember the soft error
941 * in case this connection will not be able to recover.
943 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
944 sk->sk_err_soft = EMSGSIZE;
946 mtu = dst_mtu(dst);
948 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
949 tp->pmtu_cookie > mtu) {
950 tcp_sync_mss(sk, mtu);
952 /* Resend the TCP packet because it's
953 * clear that the old packet has been
954 * dropped. This is the new "fast" path mtu
955 * discovery.
957 tcp_simple_retransmit(sk);
958 } /* else let the usual retransmit timer handle it */
962 * This routine is called by the ICMP module when it gets some
963 * sort of error condition. If err < 0 then the socket should
964 * be closed and the error returned to the user. If err > 0
965 * it's just the icmp type << 8 | icmp code. After adjustment
966 * header points to the first 8 bytes of the tcp header. We need
967 * to find the appropriate port.
969 * The locking strategy used here is very "optimistic". When
970 * someone else accesses the socket the ICMP is just dropped
971 * and for some paths there is no check at all.
972 * A more general error queue to queue errors for later handling
973 * is probably better.
977 void tcp_v4_err(struct sk_buff *skb, u32 info)
979 struct iphdr *iph = (struct iphdr *)skb->data;
980 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
981 struct tcp_sock *tp;
982 struct inet_sock *inet;
983 int type = skb->h.icmph->type;
984 int code = skb->h.icmph->code;
985 struct sock *sk;
986 __u32 seq;
987 int err;
989 if (skb->len < (iph->ihl << 2) + 8) {
990 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
991 return;
994 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
995 th->source, tcp_v4_iif(skb));
996 if (!sk) {
997 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
998 return;
1000 if (sk->sk_state == TCP_TIME_WAIT) {
1001 tcp_tw_put((struct tcp_tw_bucket *)sk);
1002 return;
1005 bh_lock_sock(sk);
1006 /* If too many ICMPs get dropped on busy
1007 * servers this needs to be solved differently.
1009 if (sock_owned_by_user(sk))
1010 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1012 if (sk->sk_state == TCP_CLOSE)
1013 goto out;
1015 tp = tcp_sk(sk);
1016 seq = ntohl(th->seq);
1017 if (sk->sk_state != TCP_LISTEN &&
1018 !between(seq, tp->snd_una, tp->snd_nxt)) {
1019 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1020 goto out;
1023 switch (type) {
1024 case ICMP_SOURCE_QUENCH:
1025 /* Just silently ignore these. */
1026 goto out;
1027 case ICMP_PARAMETERPROB:
1028 err = EPROTO;
1029 break;
1030 case ICMP_DEST_UNREACH:
1031 if (code > NR_ICMP_UNREACH)
1032 goto out;
1034 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1035 if (!sock_owned_by_user(sk))
1036 do_pmtu_discovery(sk, iph, info);
1037 goto out;
1040 err = icmp_err_convert[code].errno;
1041 break;
1042 case ICMP_TIME_EXCEEDED:
1043 err = EHOSTUNREACH;
1044 break;
1045 default:
1046 goto out;
1049 switch (sk->sk_state) {
1050 struct open_request *req, **prev;
1051 case TCP_LISTEN:
1052 if (sock_owned_by_user(sk))
1053 goto out;
1055 req = tcp_v4_search_req(tp, &prev, th->dest,
1056 iph->daddr, iph->saddr);
1057 if (!req)
1058 goto out;
1060 /* ICMPs are not backlogged, hence we cannot get
1061 an established socket here.
1063 BUG_TRAP(!req->sk);
1065 if (seq != req->snt_isn) {
1066 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1067 goto out;
1071 * Still in SYN_RECV, just remove it silently.
1072 * There is no good way to pass the error to the newly
1073 * created socket, and POSIX does not want network
1074 * errors returned from accept().
1076 tcp_synq_drop(sk, req, prev);
1077 goto out;
1079 case TCP_SYN_SENT:
1080 case TCP_SYN_RECV: /* Cannot happen normally.
1081 It can, e.g., if SYNs crossed.
1083 if (!sock_owned_by_user(sk)) {
1084 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1085 sk->sk_err = err;
1087 sk->sk_error_report(sk);
1089 tcp_done(sk);
1090 } else {
1091 sk->sk_err_soft = err;
1093 goto out;
1096 /* If we've already connected we will keep trying
1097 * until we time out, or the user gives up.
1099 * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH
1100 * and PORT_UNREACH as hard errors (well, FRAG_FAILED too,
1101 * but it is obsoleted by pmtu discovery).
1103 * Note that in the modern internet, where routing is unreliable
1104 * and broken firewalls sit in every dark corner sending random
1105 * errors ordered by their masters, even these two messages finally lose
1106 * their original sense (even Linux sends invalid PORT_UNREACHs).
1108 * Now we are in compliance with the RFCs.
1109 * --ANK (980905)
1112 inet = inet_sk(sk);
1113 if (!sock_owned_by_user(sk) && inet->recverr) {
1114 sk->sk_err = err;
1115 sk->sk_error_report(sk);
1116 } else { /* Only an error on timeout */
1117 sk->sk_err_soft = err;
1120 out:
1121 bh_unlock_sock(sk);
1122 sock_put(sk);
1125 /* This routine computes an IPv4 TCP checksum. */
1126 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1127 struct sk_buff *skb)
1129 struct inet_sock *inet = inet_sk(sk);
1131 if (skb->ip_summed == CHECKSUM_HW) {
1132 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1133 skb->csum = offsetof(struct tcphdr, check);
1134 } else {
1135 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1136 csum_partial((char *)th,
1137 th->doff << 2,
1138 skb->csum));
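/* Two cases above: with CHECKSUM_HW we only fill in the pseudo-header sum
 * and record the offset of the check field in skb->csum so the hardware
 * can finish the checksum; in the software case we fold in the header
 * bytes here (csum_partial over th->doff << 2 bytes) on top of the data
 * checksum already accumulated in skb->csum.
 */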
1143 * This routine will send an RST to the other tcp.
1145 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1146 * for the reset?
1147 * Answer: if a packet caused the RST, it is not for a socket
1148 * existing in our system; if it is matched to a socket,
1149 * it is just a duplicate segment or a bug in the other side's TCP.
1150 * So we build the reply based only on the parameters
1151 * that arrived with the segment.
1152 * Exception: precedence violation. We do not implement it in any case.
1155 static void tcp_v4_send_reset(struct sk_buff *skb)
1157 struct tcphdr *th = skb->h.th;
1158 struct tcphdr rth;
1159 struct ip_reply_arg arg;
1161 /* Never send a reset in response to a reset. */
1162 if (th->rst)
1163 return;
1165 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1166 return;
1168 /* Swap the send and the receive. */
1169 memset(&rth, 0, sizeof(struct tcphdr));
1170 rth.dest = th->source;
1171 rth.source = th->dest;
1172 rth.doff = sizeof(struct tcphdr) / 4;
1173 rth.rst = 1;
1175 if (th->ack) {
1176 rth.seq = th->ack_seq;
1177 } else {
1178 rth.ack = 1;
1179 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1180 skb->len - (th->doff << 2));
1183 memset(&arg, 0, sizeof arg);
1184 arg.iov[0].iov_base = (unsigned char *)&rth;
1185 arg.iov[0].iov_len = sizeof rth;
1186 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1187 skb->nh.iph->saddr, /*XXX*/
1188 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1189 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1191 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1193 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1194 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1197 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1198 outside of socket context, is ugly, certainly. What can I do?
1201 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1202 u32 win, u32 ts)
1204 struct tcphdr *th = skb->h.th;
1205 struct {
1206 struct tcphdr th;
1207 u32 tsopt[3];
1208 } rep;
1209 struct ip_reply_arg arg;
1211 memset(&rep.th, 0, sizeof(struct tcphdr));
1212 memset(&arg, 0, sizeof arg);
1214 arg.iov[0].iov_base = (unsigned char *)&rep;
1215 arg.iov[0].iov_len = sizeof(rep.th);
1216 if (ts) {
1217 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1218 (TCPOPT_TIMESTAMP << 8) |
1219 TCPOLEN_TIMESTAMP);
1220 rep.tsopt[1] = htonl(tcp_time_stamp);
1221 rep.tsopt[2] = htonl(ts);
1222 arg.iov[0].iov_len = sizeof(rep);
1225 /* Swap the send and the receive. */
1226 rep.th.dest = th->source;
1227 rep.th.source = th->dest;
1228 rep.th.doff = arg.iov[0].iov_len / 4;
1229 rep.th.seq = htonl(seq);
1230 rep.th.ack_seq = htonl(ack);
1231 rep.th.ack = 1;
1232 rep.th.window = htons(win);
1234 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1235 skb->nh.iph->saddr, /*XXX*/
1236 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1237 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1239 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1241 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
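/* The three tsopt words encode a 12-byte, 32-bit aligned timestamp
 * option: NOP, NOP, kind 8, length 10, then the 4-byte TSval and TSecr.
 * Appending it grows arg.iov[0].iov_len from 20 to 32 bytes, which is how
 * rep.th.doff ends up as 8 instead of 5.
 */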
1244 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1246 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1248 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1249 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1251 tcp_tw_put(tw);
1254 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1256 tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1257 req->ts_recent);
1260 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1261 struct open_request *req)
1263 struct rtable *rt;
1264 struct ip_options *opt = req->af.v4_req.opt;
1265 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1266 .nl_u = { .ip4_u =
1267 { .daddr = ((opt && opt->srr) ?
1268 opt->faddr :
1269 req->af.v4_req.rmt_addr),
1270 .saddr = req->af.v4_req.loc_addr,
1271 .tos = RT_CONN_FLAGS(sk) } },
1272 .proto = IPPROTO_TCP,
1273 .uli_u = { .ports =
1274 { .sport = inet_sk(sk)->sport,
1275 .dport = req->rmt_port } } };
1277 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1278 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1279 return NULL;
1281 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1282 ip_rt_put(rt);
1283 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1284 return NULL;
1286 return &rt->u.dst;
1290 * Send a SYN-ACK after having received a SYN.
1291 * This still operates on an open_request only, not on a big
1292 * socket.
1294 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1295 struct dst_entry *dst)
1297 int err = -1;
1298 struct sk_buff * skb;
1300 /* First, grab a route. */
1301 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1302 goto out;
1304 skb = tcp_make_synack(sk, dst, req);
1306 if (skb) {
1307 struct tcphdr *th = skb->h.th;
1309 th->check = tcp_v4_check(th, skb->len,
1310 req->af.v4_req.loc_addr,
1311 req->af.v4_req.rmt_addr,
1312 csum_partial((char *)th, skb->len,
1313 skb->csum));
1315 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1316 req->af.v4_req.rmt_addr,
1317 req->af.v4_req.opt);
1318 if (err == NET_XMIT_CN)
1319 err = 0;
1322 out:
1323 dst_release(dst);
1324 return err;
1328 * IPv4 open_request destructor.
1330 static void tcp_v4_or_free(struct open_request *req)
1332 if (req->af.v4_req.opt)
1333 kfree(req->af.v4_req.opt);
1336 static inline void syn_flood_warning(struct sk_buff *skb)
1338 static unsigned long warntime;
1340 if (time_after(jiffies, (warntime + HZ * 60))) {
1341 warntime = jiffies;
1342 printk(KERN_INFO
1343 "possible SYN flooding on port %d. Sending cookies.\n",
1344 ntohs(skb->h.th->dest));
1349 * Save and compile IPv4 options into the open_request if needed.
1351 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1352 struct sk_buff *skb)
1354 struct ip_options *opt = &(IPCB(skb)->opt);
1355 struct ip_options *dopt = NULL;
1357 if (opt && opt->optlen) {
1358 int opt_size = optlength(opt);
1359 dopt = kmalloc(opt_size, GFP_ATOMIC);
1360 if (dopt) {
1361 if (ip_options_echo(dopt, skb)) {
1362 kfree(dopt);
1363 dopt = NULL;
1367 return dopt;
1371 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1372 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1373 * It would be better to replace it with a global counter for all sockets,
1374 * but then some measure against one socket starving all other sockets
1375 * would be needed.
1377 * It was 128 by default. Experiments with real servers show that
1378 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1379 * of the problems. This value is adjusted to 128 for very small machines
1380 * (<= 32MB of memory) and to 1024 on normal or better ones (>= 256MB).
1381 * Further increases require changing the hash table size.
1383 int sysctl_max_syn_backlog = 256;
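/* Tuning note (userspace side, assuming the usual sysctl export of this
 * variable as net.ipv4.tcp_max_syn_backlog): a busy server can raise it,
 * e.g. "sysctl -w net.ipv4.tcp_max_syn_backlog=1024", keeping in mind the
 * hash-table caveat above.
 */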
1385 struct or_calltable or_ipv4 = {
1386 .family = PF_INET,
1387 .rtx_syn_ack = tcp_v4_send_synack,
1388 .send_ack = tcp_v4_or_send_ack,
1389 .destructor = tcp_v4_or_free,
1390 .send_reset = tcp_v4_send_reset,
1393 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1395 struct tcp_options_received tmp_opt;
1396 struct open_request *req;
1397 __u32 saddr = skb->nh.iph->saddr;
1398 __u32 daddr = skb->nh.iph->daddr;
1399 __u32 isn = TCP_SKB_CB(skb)->when;
1400 struct dst_entry *dst = NULL;
1401 #ifdef CONFIG_SYN_COOKIES
1402 int want_cookie = 0;
1403 #else
1404 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1405 #endif
1407 /* Never answer SYNs sent to broadcast or multicast */
1408 if (((struct rtable *)skb->dst)->rt_flags &
1409 (RTCF_BROADCAST | RTCF_MULTICAST))
1410 goto drop;
1412 /* TW buckets are converted to open requests without
1413 * limitation; they conserve resources and the peer is
1414 * evidently a real one.
1416 if (tcp_synq_is_full(sk) && !isn) {
1417 #ifdef CONFIG_SYN_COOKIES
1418 if (sysctl_tcp_syncookies) {
1419 want_cookie = 1;
1420 } else
1421 #endif
1422 goto drop;
1425 /* Accept backlog is full. If we have already queued enough
1426 * warm entries in the syn queue, drop the request. It is better than
1427 * clogging the syn queue with openreqs with exponentially increasing
1428 * timeouts.
1430 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1431 goto drop;
1433 req = tcp_openreq_alloc();
1434 if (!req)
1435 goto drop;
1437 tcp_clear_options(&tmp_opt);
1438 tmp_opt.mss_clamp = 536;
1439 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1441 tcp_parse_options(skb, &tmp_opt, 0);
1443 if (want_cookie) {
1444 tcp_clear_options(&tmp_opt);
1445 tmp_opt.saw_tstamp = 0;
1448 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1449 /* Some OSes (unknown ones, but I see them on a web server which
1450 * contains information interesting only for windows
1451 * users) do not send their stamp in the SYN. It is an easy case.
1452 * We simply do not advertise TS support.
1454 tmp_opt.saw_tstamp = 0;
1455 tmp_opt.tstamp_ok = 0;
1457 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1459 tcp_openreq_init(req, &tmp_opt, skb);
1461 req->af.v4_req.loc_addr = daddr;
1462 req->af.v4_req.rmt_addr = saddr;
1463 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1464 req->class = &or_ipv4;
1465 if (!want_cookie)
1466 TCP_ECN_create_request(req, skb->h.th);
1468 if (want_cookie) {
1469 #ifdef CONFIG_SYN_COOKIES
1470 syn_flood_warning(skb);
1471 #endif
1472 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1473 } else if (!isn) {
1474 struct inet_peer *peer = NULL;
1476 /* VJ's idea. We save the last timestamp seen
1477 * from the destination in the peer table when entering
1478 * TIME-WAIT state, and check against it before
1479 * accepting a new connection request.
1481 * If "isn" is not zero, this request hit an alive
1482 * timewait bucket, so all the necessary checks
1483 * are made in the function processing the timewait state.
1485 if (tmp_opt.saw_tstamp &&
1486 sysctl_tcp_tw_recycle &&
1487 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1488 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1489 peer->v4daddr == saddr) {
1490 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1491 (s32)(peer->tcp_ts - req->ts_recent) >
1492 TCP_PAWS_WINDOW) {
1493 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1494 dst_release(dst);
1495 goto drop_and_free;
1498 /* Kill the following clause, if you dislike this way. */
1499 else if (!sysctl_tcp_syncookies &&
1500 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1501 (sysctl_max_syn_backlog >> 2)) &&
1502 (!peer || !peer->tcp_ts_stamp) &&
1503 (!dst || !dst_metric(dst, RTAX_RTT))) {
1504 /* Without syncookies the last quarter of the
1505 * backlog is reserved for destinations
1506 * proven to be alive.
1507 * It means that we continue to communicate
1508 * with destinations already remembered
1509 * at the moment of the synflood.
1511 NETDEBUG(if (net_ratelimit()) \
1512 printk(KERN_DEBUG "TCP: drop open "
1513 "request from %u.%u."
1514 "%u.%u/%u\n", \
1515 NIPQUAD(saddr),
1516 ntohs(skb->h.th->source)));
1517 dst_release(dst);
1518 goto drop_and_free;
1521 isn = tcp_v4_init_sequence(sk, skb);
1523 req->snt_isn = isn;
1525 if (tcp_v4_send_synack(sk, req, dst))
1526 goto drop_and_free;
1528 if (want_cookie) {
1529 tcp_openreq_free(req);
1530 } else {
1531 tcp_v4_synq_add(sk, req);
1533 return 0;
1535 drop_and_free:
1536 tcp_openreq_free(req);
1537 drop:
1538 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1539 return 0;
1544 * The three way handshake has completed - we got a valid synack -
1545 * now create the new socket.
1547 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1548 struct open_request *req,
1549 struct dst_entry *dst)
1551 struct inet_sock *newinet;
1552 struct tcp_sock *newtp;
1553 struct sock *newsk;
1555 if (sk_acceptq_is_full(sk))
1556 goto exit_overflow;
1558 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1559 goto exit;
1561 newsk = tcp_create_openreq_child(sk, req, skb);
1562 if (!newsk)
1563 goto exit;
1565 newsk->sk_dst_cache = dst;
1566 tcp_v4_setup_caps(newsk, dst);
1568 newtp = tcp_sk(newsk);
1569 newinet = inet_sk(newsk);
1570 newinet->daddr = req->af.v4_req.rmt_addr;
1571 newinet->rcv_saddr = req->af.v4_req.loc_addr;
1572 newinet->saddr = req->af.v4_req.loc_addr;
1573 newinet->opt = req->af.v4_req.opt;
1574 req->af.v4_req.opt = NULL;
1575 newinet->mc_index = tcp_v4_iif(skb);
1576 newinet->mc_ttl = skb->nh.iph->ttl;
1577 newtp->ext_header_len = 0;
1578 if (newinet->opt)
1579 newtp->ext_header_len = newinet->opt->optlen;
1580 newinet->id = newtp->write_seq ^ jiffies;
1582 tcp_sync_mss(newsk, dst_mtu(dst));
1583 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1584 tcp_initialize_rcv_mss(newsk);
1586 __tcp_v4_hash(newsk, 0);
1587 __tcp_inherit_port(sk, newsk);
1589 return newsk;
1591 exit_overflow:
1592 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1593 exit:
1594 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1595 dst_release(dst);
1596 return NULL;
1599 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1601 struct tcphdr *th = skb->h.th;
1602 struct iphdr *iph = skb->nh.iph;
1603 struct tcp_sock *tp = tcp_sk(sk);
1604 struct sock *nsk;
1605 struct open_request **prev;
1606 /* Find possible connection requests. */
1607 struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1608 iph->saddr, iph->daddr);
1609 if (req)
1610 return tcp_check_req(sk, skb, req, prev);
1612 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1613 th->source,
1614 skb->nh.iph->daddr,
1615 ntohs(th->dest),
1616 tcp_v4_iif(skb));
1618 if (nsk) {
1619 if (nsk->sk_state != TCP_TIME_WAIT) {
1620 bh_lock_sock(nsk);
1621 return nsk;
1623 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1624 return NULL;
1627 #ifdef CONFIG_SYN_COOKIES
1628 if (!th->rst && !th->syn && th->ack)
1629 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1630 #endif
1631 return sk;
1634 static int tcp_v4_checksum_init(struct sk_buff *skb)
1636 if (skb->ip_summed == CHECKSUM_HW) {
1637 skb->ip_summed = CHECKSUM_UNNECESSARY;
1638 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1639 skb->nh.iph->daddr, skb->csum))
1640 return 0;
1642 NETDEBUG(if (net_ratelimit())
1643 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1644 skb->ip_summed = CHECKSUM_NONE;
1646 if (skb->len <= 76) {
1647 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1648 skb->nh.iph->daddr,
1649 skb_checksum(skb, 0, skb->len, 0)))
1650 return -1;
1651 skb->ip_summed = CHECKSUM_UNNECESSARY;
1652 } else {
1653 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1654 skb->nh.iph->saddr,
1655 skb->nh.iph->daddr, 0);
1657 return 0;
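/* The 76-byte cutoff above is a cheapness heuristic: tiny segments are
 * verified completely right here, while for larger ones we only seed
 * skb->csum with the pseudo-header sum and let tcp_checksum_complete()
 * do the full verification later, once the segment is actually going to
 * be processed.
 */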
1661 /* The socket must have its spinlock held when we get
1662 * here.
1664 * We have a potential double-lock case here, so even when
1665 * doing backlog processing we use the BH locking scheme.
1666 * This is because we cannot sleep with the original spinlock
1667 * held.
1669 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1671 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1672 TCP_CHECK_TIMER(sk);
1673 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1674 goto reset;
1675 TCP_CHECK_TIMER(sk);
1676 return 0;
1679 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1680 goto csum_err;
1682 if (sk->sk_state == TCP_LISTEN) {
1683 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1684 if (!nsk)
1685 goto discard;
1687 if (nsk != sk) {
1688 if (tcp_child_process(sk, nsk, skb))
1689 goto reset;
1690 return 0;
1694 TCP_CHECK_TIMER(sk);
1695 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1696 goto reset;
1697 TCP_CHECK_TIMER(sk);
1698 return 0;
1700 reset:
1701 tcp_v4_send_reset(skb);
1702 discard:
1703 kfree_skb(skb);
1704 /* Be careful here. If this function gets more complicated and
1705 * gcc suffers from register pressure on the x86, sk (in %ebx)
1706 * might be destroyed here. This current version compiles correctly,
1707 * but you have been warned.
1709 return 0;
1711 csum_err:
1712 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1713 goto discard;
1717 * From tcp_input.c
1720 int tcp_v4_rcv(struct sk_buff *skb)
1722 struct tcphdr *th;
1723 struct sock *sk;
1724 int ret;
1726 if (skb->pkt_type != PACKET_HOST)
1727 goto discard_it;
1729 /* Count it even if it's bad */
1730 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1732 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1733 goto discard_it;
1735 th = skb->h.th;
1737 if (th->doff < sizeof(struct tcphdr) / 4)
1738 goto bad_packet;
1739 if (!pskb_may_pull(skb, th->doff * 4))
1740 goto discard_it;
1742 /* An explanation is required here, I think.
1743 * Packet length and doff are validated by header prediction,
1744 * provided the case of th->doff==0 is eliminated.
1745 * So, we defer the checks. */
1746 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1747 tcp_v4_checksum_init(skb) < 0))
1748 goto bad_packet;
1750 th = skb->h.th;
1751 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1752 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1753 skb->len - th->doff * 4);
1754 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1755 TCP_SKB_CB(skb)->when = 0;
1756 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1757 TCP_SKB_CB(skb)->sacked = 0;
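/* end_seq bookkeeping: SYN and FIN each consume one sequence number, so
 * end_seq = seq + th->syn + th->fin + payload, where the payload length
 * is skb->len minus the header (th->doff * 4).
 */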
1759 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1760 skb->nh.iph->daddr, ntohs(th->dest),
1761 tcp_v4_iif(skb));
1763 if (!sk)
1764 goto no_tcp_socket;
1766 process:
1767 if (sk->sk_state == TCP_TIME_WAIT)
1768 goto do_time_wait;
1770 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1771 goto discard_and_relse;
1773 if (sk_filter(sk, skb, 0))
1774 goto discard_and_relse;
1776 skb->dev = NULL;
1778 bh_lock_sock(sk);
1779 ret = 0;
1780 if (!sock_owned_by_user(sk)) {
1781 if (!tcp_prequeue(sk, skb))
1782 ret = tcp_v4_do_rcv(sk, skb);
1783 } else
1784 sk_add_backlog(sk, skb);
1785 bh_unlock_sock(sk);
1787 sock_put(sk);
1789 return ret;
1791 no_tcp_socket:
1792 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1793 goto discard_it;
1795 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1796 bad_packet:
1797 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1798 } else {
1799 tcp_v4_send_reset(skb);
1802 discard_it:
1803 /* Discard frame. */
1804 kfree_skb(skb);
1805 return 0;
1807 discard_and_relse:
1808 sock_put(sk);
1809 goto discard_it;
1811 do_time_wait:
1812 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1813 tcp_tw_put((struct tcp_tw_bucket *) sk);
1814 goto discard_it;
1817 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1818 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1819 tcp_tw_put((struct tcp_tw_bucket *) sk);
1820 goto discard_it;
1822 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1823 skb, th, skb->len)) {
1824 case TCP_TW_SYN: {
1825 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1826 ntohs(th->dest),
1827 tcp_v4_iif(skb));
1828 if (sk2) {
1829 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1830 tcp_tw_put((struct tcp_tw_bucket *)sk);
1831 sk = sk2;
1832 goto process;
1834 /* Fall through to ACK */
1836 case TCP_TW_ACK:
1837 tcp_v4_timewait_ack(sk, skb);
1838 break;
1839 case TCP_TW_RST:
1840 goto no_tcp_socket;
1841 case TCP_TW_SUCCESS:;
1843 goto discard_it;
1846 /* With per-bucket locks this operation is non-atomic, so
1847 * this version is no worse.
1849 static void __tcp_v4_rehash(struct sock *sk)
1851 sk->sk_prot->unhash(sk);
1852 sk->sk_prot->hash(sk);
1855 static int tcp_v4_reselect_saddr(struct sock *sk)
1857 struct inet_sock *inet = inet_sk(sk);
1858 int err;
1859 struct rtable *rt;
1860 __u32 old_saddr = inet->saddr;
1861 __u32 new_saddr;
1862 __u32 daddr = inet->daddr;
1864 if (inet->opt && inet->opt->srr)
1865 daddr = inet->opt->faddr;
1867 /* Query new route. */
1868 err = ip_route_connect(&rt, daddr, 0,
1869 RT_CONN_FLAGS(sk),
1870 sk->sk_bound_dev_if,
1871 IPPROTO_TCP,
1872 inet->sport, inet->dport, sk);
1873 if (err)
1874 return err;
1876 __sk_dst_set(sk, &rt->u.dst);
1877 tcp_v4_setup_caps(sk, &rt->u.dst);
1879 new_saddr = rt->rt_src;
1881 if (new_saddr == old_saddr)
1882 return 0;
1884 if (sysctl_ip_dynaddr > 1) {
1885 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1886 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1887 NIPQUAD(old_saddr),
1888 NIPQUAD(new_saddr));
1891 inet->saddr = new_saddr;
1892 inet->rcv_saddr = new_saddr;
1894 /* XXX The only ugly spot where we need to
1895 * XXX really change the socket's identity after
1896 * XXX it has entered the hashes. -DaveM
1898 * Besides that, it does not check for connection
1899 * uniqueness. Wait for trouble.
1901 __tcp_v4_rehash(sk);
1902 return 0;
1905 int tcp_v4_rebuild_header(struct sock *sk)
1907 struct inet_sock *inet = inet_sk(sk);
1908 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1909 u32 daddr;
1910 int err;
1912 /* Route is OK, nothing to do. */
1913 if (rt)
1914 return 0;
1916 /* Reroute. */
1917 daddr = inet->daddr;
1918 if (inet->opt && inet->opt->srr)
1919 daddr = inet->opt->faddr;
1922 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1923 .nl_u = { .ip4_u =
1924 { .daddr = daddr,
1925 .saddr = inet->saddr,
1926 .tos = RT_CONN_FLAGS(sk) } },
1927 .proto = IPPROTO_TCP,
1928 .uli_u = { .ports =
1929 { .sport = inet->sport,
1930 .dport = inet->dport } } };
1932 err = ip_route_output_flow(&rt, &fl, sk, 0);
1934 if (!err) {
1935 __sk_dst_set(sk, &rt->u.dst);
1936 tcp_v4_setup_caps(sk, &rt->u.dst);
1937 return 0;
1940 /* Routing failed... */
1941 sk->sk_route_caps = 0;
1943 if (!sysctl_ip_dynaddr ||
1944 sk->sk_state != TCP_SYN_SENT ||
1945 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1946 (err = tcp_v4_reselect_saddr(sk)) != 0)
1947 sk->sk_err_soft = -err;
1949 return err;
1952 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1954 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1955 struct inet_sock *inet = inet_sk(sk);
1957 sin->sin_family = AF_INET;
1958 sin->sin_addr.s_addr = inet->daddr;
1959 sin->sin_port = inet->dport;
1962 /* VJ's idea. Save the last timestamp seen from this destination
1963 * and hold it at least for the normal timewait interval, to use for duplicate
1964 * segment detection in subsequent connections before they enter the synchronized
1965 * state.
1968 int tcp_v4_remember_stamp(struct sock *sk)
1970 struct inet_sock *inet = inet_sk(sk);
1971 struct tcp_sock *tp = tcp_sk(sk);
1972 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1973 struct inet_peer *peer = NULL;
1974 int release_it = 0;
1976 if (!rt || rt->rt_dst != inet->daddr) {
1977 peer = inet_getpeer(inet->daddr, 1);
1978 release_it = 1;
1979 } else {
1980 if (!rt->peer)
1981 rt_bind_peer(rt, 1);
1982 peer = rt->peer;
1985 if (peer) {
1986 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1987 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1988 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1989 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1990 peer->tcp_ts = tp->rx_opt.ts_recent;
1992 if (release_it)
1993 inet_putpeer(peer);
1994 return 1;
1997 return 0;
2000 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2002 struct inet_peer *peer = NULL;
2004 peer = inet_getpeer(tw->tw_daddr, 1);
2006 if (peer) {
2007 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2008 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2009 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2010 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2011 peer->tcp_ts = tw->tw_ts_recent;
2013 inet_putpeer(peer);
2014 return 1;
2017 return 0;
2020 struct tcp_func ipv4_specific = {
2021 .queue_xmit = ip_queue_xmit,
2022 .send_check = tcp_v4_send_check,
2023 .rebuild_header = tcp_v4_rebuild_header,
2024 .conn_request = tcp_v4_conn_request,
2025 .syn_recv_sock = tcp_v4_syn_recv_sock,
2026 .remember_stamp = tcp_v4_remember_stamp,
2027 .net_header_len = sizeof(struct iphdr),
2028 .setsockopt = ip_setsockopt,
2029 .getsockopt = ip_getsockopt,
2030 .addr2sockaddr = v4_addr2sockaddr,
2031 .sockaddr_len = sizeof(struct sockaddr_in),
2034 /* NOTE: A lot of things are set to zero explicitly by the call to
2035 * sk_alloc(), so they need not be done here.
2037 static int tcp_v4_init_sock(struct sock *sk)
2039 struct tcp_sock *tp = tcp_sk(sk);
2041 skb_queue_head_init(&tp->out_of_order_queue);
2042 tcp_init_xmit_timers(sk);
2043 tcp_prequeue_init(tp);
2045 tp->rto = TCP_TIMEOUT_INIT;
2046 tp->mdev = TCP_TIMEOUT_INIT;
2048 /* So many TCP implementations out there (incorrectly) count the
2049 * initial SYN frame in their delayed-ACK and congestion control
2050 * algorithms that we must have the following bandaid to talk
2051 * efficiently to them. -DaveM
2053 tp->snd_cwnd = 2;
2055 /* See draft-stevens-tcpca-spec-01 for discussion of the
2056 * initialization of these values.
2058 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2059 tp->snd_cwnd_clamp = ~0;
2060 tp->mss_cache_std = tp->mss_cache = 536;
2062 tp->reordering = sysctl_tcp_reordering;
2064 sk->sk_state = TCP_CLOSE;
2066 sk->sk_write_space = sk_stream_write_space;
2067 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2069 tp->af_specific = &ipv4_specific;
2071 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2072 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2074 atomic_inc(&tcp_sockets_allocated);
2076 return 0;
2079 int tcp_v4_destroy_sock(struct sock *sk)
2081 struct tcp_sock *tp = tcp_sk(sk);
2083 tcp_clear_xmit_timers(sk);
2085 /* Clean up the write buffer. */
2086 sk_stream_writequeue_purge(sk);
2088 /* Cleans up our, hopefully empty, out_of_order_queue. */
2089 __skb_queue_purge(&tp->out_of_order_queue);
2091 /* Clean the prequeue; it must really be empty. */
2092 __skb_queue_purge(&tp->ucopy.prequeue);
2094 /* Clean up a referenced TCP bind bucket. */
2095 if (tp->bind_hash)
2096 tcp_put_port(sk);
2099 * If sendmsg cached page exists, toss it.
2101 if (sk->sk_sndmsg_page) {
2102 __free_page(sk->sk_sndmsg_page);
2103 sk->sk_sndmsg_page = NULL;
2106 atomic_dec(&tcp_sockets_allocated);
2108 return 0;
2111 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2113 #ifdef CONFIG_PROC_FS
2114 /* Proc filesystem TCP sock list dumping. */
2116 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2118 return hlist_empty(head) ? NULL :
2119 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2122 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2124 return tw->tw_node.next ?
2125 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2128 static void *listening_get_next(struct seq_file *seq, void *cur)
2130 struct tcp_sock *tp;
2131 struct hlist_node *node;
2132 struct sock *sk = cur;
2133 struct tcp_iter_state* st = seq->private;
2135 if (!sk) {
2136 st->bucket = 0;
2137 sk = sk_head(&tcp_listening_hash[0]);
2138 goto get_sk;
2141 ++st->num;
2143 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2144 struct open_request *req = cur;
2146 tp = tcp_sk(st->syn_wait_sk);
2147 req = req->dl_next;
2148 while (1) {
2149 while (req) {
2150 if (req->class->family == st->family) {
2151 cur = req;
2152 goto out;
2154 req = req->dl_next;
2156 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2157 break;
2158 get_req:
2159 req = tp->listen_opt->syn_table[st->sbucket];
2161 sk = sk_next(st->syn_wait_sk);
2162 st->state = TCP_SEQ_STATE_LISTENING;
2163 read_unlock_bh(&tp->syn_wait_lock);
2164 } else {
2165 tp = tcp_sk(sk);
2166 read_lock_bh(&tp->syn_wait_lock);
2167 if (tp->listen_opt && tp->listen_opt->qlen)
2168 goto start_req;
2169 read_unlock_bh(&tp->syn_wait_lock);
2170 sk = sk_next(sk);
2171 }
2172 get_sk:
2173 sk_for_each_from(sk, node) {
2174 if (sk->sk_family == st->family) {
2175 cur = sk;
2176 goto out;
2177 }
2178 tp = tcp_sk(sk);
2179 read_lock_bh(&tp->syn_wait_lock);
2180 if (tp->listen_opt && tp->listen_opt->qlen) {
2181 start_req:
2182 st->uid = sock_i_uid(sk);
2183 st->syn_wait_sk = sk;
2184 st->state = TCP_SEQ_STATE_OPENREQ;
2185 st->sbucket = 0;
2186 goto get_req;
2187 }
2188 read_unlock_bh(&tp->syn_wait_lock);
2189 }
2190 if (++st->bucket < TCP_LHTABLE_SIZE) {
2191 sk = sk_head(&tcp_listening_hash[st->bucket]);
2192 goto get_sk;
2193 }
2194 cur = NULL;
2195 out:
2196 return cur;
2197 }
2199 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2200 {
2201 void *rc = listening_get_next(seq, NULL);
2203 while (rc && *pos) {
2204 rc = listening_get_next(seq, rc);
2205 --*pos;
2206 }
2207 return rc;
2208 }
2210 static void *established_get_first(struct seq_file *seq)
2211 {
2212 struct tcp_iter_state* st = seq->private;
2213 void *rc = NULL;
2215 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2216 struct sock *sk;
2217 struct hlist_node *node;
2218 struct tcp_tw_bucket *tw;
2220 /* We can reschedule _before_ having picked the target: */
2221 cond_resched_softirq();
2223 read_lock(&tcp_ehash[st->bucket].lock);
2224 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2225 if (sk->sk_family != st->family) {
2226 continue;
2227 }
2228 rc = sk;
2229 goto out;
2230 }
2231 st->state = TCP_SEQ_STATE_TIME_WAIT;
2232 tw_for_each(tw, node,
2233 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2234 if (tw->tw_family != st->family) {
2235 continue;
2236 }
2237 rc = tw;
2238 goto out;
2239 }
2240 read_unlock(&tcp_ehash[st->bucket].lock);
2241 st->state = TCP_SEQ_STATE_ESTABLISHED;
2242 }
2243 out:
2244 return rc;
2245 }
2247 static void *established_get_next(struct seq_file *seq, void *cur)
2248 {
2249 struct sock *sk = cur;
2250 struct tcp_tw_bucket *tw;
2251 struct hlist_node *node;
2252 struct tcp_iter_state* st = seq->private;
2254 ++st->num;
2256 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2257 tw = cur;
2258 tw = tw_next(tw);
2259 get_tw:
2260 while (tw && tw->tw_family != st->family) {
2261 tw = tw_next(tw);
2262 }
2263 if (tw) {
2264 cur = tw;
2265 goto out;
2266 }
2267 read_unlock(&tcp_ehash[st->bucket].lock);
2268 st->state = TCP_SEQ_STATE_ESTABLISHED;
2270 /* We can reschedule between buckets: */
2271 cond_resched_softirq();
2273 if (++st->bucket < tcp_ehash_size) {
2274 read_lock(&tcp_ehash[st->bucket].lock);
2275 sk = sk_head(&tcp_ehash[st->bucket].chain);
2276 } else {
2277 cur = NULL;
2278 goto out;
2279 }
2280 } else
2281 sk = sk_next(sk);
2283 sk_for_each_from(sk, node) {
2284 if (sk->sk_family == st->family)
2285 goto found;
2286 }
2288 st->state = TCP_SEQ_STATE_TIME_WAIT;
2289 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2290 goto get_tw;
2291 found:
2292 cur = sk;
2293 out:
2294 return cur;
2295 }
2297 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2298 {
2299 void *rc = established_get_first(seq);
2301 while (rc && pos) {
2302 rc = established_get_next(seq, rc);
2303 --pos;
2304 }
2305 return rc;
2306 }
2308 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2309 {
2310 void *rc;
2311 struct tcp_iter_state* st = seq->private;
2313 tcp_listen_lock();
2314 st->state = TCP_SEQ_STATE_LISTENING;
2315 rc = listening_get_idx(seq, &pos);
2317 if (!rc) {
2318 tcp_listen_unlock();
2319 local_bh_disable();
2320 st->state = TCP_SEQ_STATE_ESTABLISHED;
2321 rc = established_get_idx(seq, pos);
2322 }
2324 return rc;
2325 }
2327 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2328 {
2329 struct tcp_iter_state* st = seq->private;
2330 st->state = TCP_SEQ_STATE_LISTENING;
2331 st->num = 0;
2332 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2333 }
2335 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2336 {
2337 void *rc = NULL;
2338 struct tcp_iter_state* st;
2340 if (v == SEQ_START_TOKEN) {
2341 rc = tcp_get_idx(seq, 0);
2342 goto out;
2343 }
2344 st = seq->private;
2346 switch (st->state) {
2347 case TCP_SEQ_STATE_OPENREQ:
2348 case TCP_SEQ_STATE_LISTENING:
2349 rc = listening_get_next(seq, v);
2350 if (!rc) {
2351 tcp_listen_unlock();
2352 local_bh_disable();
2353 st->state = TCP_SEQ_STATE_ESTABLISHED;
2354 rc = established_get_first(seq);
2355 }
2356 break;
2357 case TCP_SEQ_STATE_ESTABLISHED:
2358 case TCP_SEQ_STATE_TIME_WAIT:
2359 rc = established_get_next(seq, v);
2360 break;
2361 }
2362 out:
2363 ++*pos;
2364 return rc;
2365 }
2367 static void tcp_seq_stop(struct seq_file *seq, void *v)
2368 {
2369 struct tcp_iter_state* st = seq->private;
2371 switch (st->state) {
2372 case TCP_SEQ_STATE_OPENREQ:
2373 if (v) {
2374 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2375 read_unlock_bh(&tp->syn_wait_lock);
2376 }
2377 case TCP_SEQ_STATE_LISTENING:
2378 if (v != SEQ_START_TOKEN)
2379 tcp_listen_unlock();
2380 break;
2381 case TCP_SEQ_STATE_TIME_WAIT:
2382 case TCP_SEQ_STATE_ESTABLISHED:
2383 if (v)
2384 read_unlock(&tcp_ehash[st->bucket].lock);
2385 local_bh_enable();
2386 break;
2387 }
2388 }
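/*
 * The three callbacks above follow the usual seq_file contract:
 * tcp_seq_start() positions the cursor for a given *pos, tcp_seq_next()
 * advances it, and tcp_seq_stop() releases whatever the walk still holds:
 * the syn_wait_lock (OPENREQ) and the listen lock (OPENREQ/LISTENING), or
 * the ehash bucket read lock plus local_bh_enable() (ESTABLISHED and
 * TIME_WAIT).
 */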
2390 static int tcp_seq_open(struct inode *inode, struct file *file)
2391 {
2392 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2393 struct seq_file *seq;
2394 struct tcp_iter_state *s;
2395 int rc;
2397 if (unlikely(afinfo == NULL))
2398 return -EINVAL;
2400 s = kmalloc(sizeof(*s), GFP_KERNEL);
2401 if (!s)
2402 return -ENOMEM;
2403 memset(s, 0, sizeof(*s));
2404 s->family = afinfo->family;
2405 s->seq_ops.start = tcp_seq_start;
2406 s->seq_ops.next = tcp_seq_next;
2407 s->seq_ops.show = afinfo->seq_show;
2408 s->seq_ops.stop = tcp_seq_stop;
2410 rc = seq_open(file, &s->seq_ops);
2411 if (rc)
2412 goto out_kfree;
2413 seq = file->private_data;
2414 seq->private = s;
2415 out:
2416 return rc;
2417 out_kfree:
2418 kfree(s);
2419 goto out;
2420 }
2422 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2423 {
2424 int rc = 0;
2425 struct proc_dir_entry *p;
2427 if (!afinfo)
2428 return -EINVAL;
2429 afinfo->seq_fops->owner = afinfo->owner;
2430 afinfo->seq_fops->open = tcp_seq_open;
2431 afinfo->seq_fops->read = seq_read;
2432 afinfo->seq_fops->llseek = seq_lseek;
2433 afinfo->seq_fops->release = seq_release_private;
2435 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2436 if (p)
2437 p->data = afinfo;
2438 else
2439 rc = -ENOMEM;
2440 return rc;
2441 }
2443 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2444 {
2445 if (!afinfo)
2446 return;
2447 proc_net_remove(afinfo->name);
2448 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2449 }
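/*
 * Usage note: each address family registers one tcp_seq_afinfo; the
 * AF_INET instance (tcp4_seq_afinfo) appears further down in this file.
 * As a rough, illustrative sketch only -- the tcpX_* identifiers below
 * are hypothetical, not defined anywhere -- another family would do:
 *
 *	static struct file_operations tcpX_seq_fops;
 *	static struct tcp_seq_afinfo tcpX_seq_afinfo = {
 *		.owner    = THIS_MODULE,
 *		.name     = "tcpX",
 *		.family   = AF_X,
 *		.seq_show = tcpX_seq_show,
 *		.seq_fops = &tcpX_seq_fops,
 *	};
 *
 * calling tcp_proc_register(&tcpX_seq_afinfo) at init and
 * tcp_proc_unregister(&tcpX_seq_afinfo) at exit.
 */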
2451 static void get_openreq4(struct sock *sk, struct open_request *req,
2452 char *tmpbuf, int i, int uid)
2453 {
2454 int ttd = req->expires - jiffies;
2456 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2457 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2459 req->af.v4_req.loc_addr,
2460 ntohs(inet_sk(sk)->sport),
2461 req->af.v4_req.rmt_addr,
2462 ntohs(req->rmt_port),
2463 TCP_SYN_RECV,
2464 0, 0, /* could print option size, but that is af dependent. */
2465 1, /* timers active (only the expire timer) */
2466 jiffies_to_clock_t(ttd),
2467 req->retrans,
2468 uid,
2469 0, /* non standard timer */
2470 0, /* open_requests have no inode */
2471 atomic_read(&sk->sk_refcnt),
2472 req);
2473 }
2475 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2476 {
2477 int timer_active;
2478 unsigned long timer_expires;
2479 struct tcp_sock *tp = tcp_sk(sp);
2480 struct inet_sock *inet = inet_sk(sp);
2481 unsigned int dest = inet->daddr;
2482 unsigned int src = inet->rcv_saddr;
2483 __u16 destp = ntohs(inet->dport);
2484 __u16 srcp = ntohs(inet->sport);
2486 if (tp->pending == TCP_TIME_RETRANS) {
2487 timer_active = 1;
2488 timer_expires = tp->timeout;
2489 } else if (tp->pending == TCP_TIME_PROBE0) {
2490 timer_active = 4;
2491 timer_expires = tp->timeout;
2492 } else if (timer_pending(&sp->sk_timer)) {
2493 timer_active = 2;
2494 timer_expires = sp->sk_timer.expires;
2495 } else {
2496 timer_active = 0;
2497 timer_expires = jiffies;
2498 }
2500 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2501 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2502 i, src, srcp, dest, destp, sp->sk_state,
2503 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2504 timer_active,
2505 jiffies_to_clock_t(timer_expires - jiffies),
2506 tp->retransmits,
2507 sock_i_uid(sp),
2508 tp->probes_out,
2509 sock_i_ino(sp),
2510 atomic_read(&sp->sk_refcnt), sp,
2511 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2512 tp->snd_cwnd,
2513 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2514 }
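/*
 * Field map for the line emitted above (one per socket in /proc/net/tcp),
 * matching the header printed by tcp4_seq_show():
 *
 *   sl  local_address rem_address  st  tx_queue:rx_queue  tr:tm->when
 *   retrnsmt  uid  timeout  inode  refcnt  sock_ptr  rto  ato
 *   quick<<1|pingpong  snd_cwnd  ssthresh
 *
 * where tx_queue is write_seq - snd_una, rx_queue is rcv_nxt - copied_seq,
 * the "timeout" column carries probes_out, and ssthresh is printed as -1
 * when it is 0xFFFF or larger (i.e. still at its "infinite" setting).
 */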
2516 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2517 {
2518 unsigned int dest, src;
2519 __u16 destp, srcp;
2520 int ttd = tw->tw_ttd - jiffies;
2522 if (ttd < 0)
2523 ttd = 0;
2525 dest = tw->tw_daddr;
2526 src = tw->tw_rcv_saddr;
2527 destp = ntohs(tw->tw_dport);
2528 srcp = ntohs(tw->tw_sport);
2530 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2531 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2532 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2533 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2534 atomic_read(&tw->tw_refcnt), tw);
2535 }
2537 #define TMPSZ 150
2539 static int tcp4_seq_show(struct seq_file *seq, void *v)
2540 {
2541 struct tcp_iter_state* st;
2542 char tmpbuf[TMPSZ + 1];
2544 if (v == SEQ_START_TOKEN) {
2545 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2546 " sl local_address rem_address st tx_queue "
2547 "rx_queue tr tm->when retrnsmt uid timeout "
2548 "inode");
2549 goto out;
2550 }
2551 st = seq->private;
2553 switch (st->state) {
2554 case TCP_SEQ_STATE_LISTENING:
2555 case TCP_SEQ_STATE_ESTABLISHED:
2556 get_tcp4_sock(v, tmpbuf, st->num);
2557 break;
2558 case TCP_SEQ_STATE_OPENREQ:
2559 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2560 break;
2561 case TCP_SEQ_STATE_TIME_WAIT:
2562 get_timewait4_sock(v, tmpbuf, st->num);
2563 break;
2564 }
2565 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2566 out:
2567 return 0;
2568 }
2570 static struct file_operations tcp4_seq_fops;
2571 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2572 .owner = THIS_MODULE,
2573 .name = "tcp",
2574 .family = AF_INET,
2575 .seq_show = tcp4_seq_show,
2576 .seq_fops = &tcp4_seq_fops,
2577 };
2579 int __init tcp4_proc_init(void)
2580 {
2581 return tcp_proc_register(&tcp4_seq_afinfo);
2582 }
2584 void tcp4_proc_exit(void)
2585 {
2586 tcp_proc_unregister(&tcp4_seq_afinfo);
2587 }
2588 #endif /* CONFIG_PROC_FS */
2590 struct proto tcp_prot = {
2591 .name = "TCP",
2592 .owner = THIS_MODULE,
2593 .close = tcp_close,
2594 .connect = tcp_v4_connect,
2595 .disconnect = tcp_disconnect,
2596 .accept = tcp_accept,
2597 .ioctl = tcp_ioctl,
2598 .init = tcp_v4_init_sock,
2599 .destroy = tcp_v4_destroy_sock,
2600 .shutdown = tcp_shutdown,
2601 .setsockopt = tcp_setsockopt,
2602 .getsockopt = tcp_getsockopt,
2603 .sendmsg = tcp_sendmsg,
2604 .recvmsg = tcp_recvmsg,
2605 .backlog_rcv = tcp_v4_do_rcv,
2606 .hash = tcp_v4_hash,
2607 .unhash = tcp_unhash,
2608 .get_port = tcp_v4_get_port,
2609 .enter_memory_pressure = tcp_enter_memory_pressure,
2610 .sockets_allocated = &tcp_sockets_allocated,
2611 .memory_allocated = &tcp_memory_allocated,
2612 .memory_pressure = &tcp_memory_pressure,
2613 .sysctl_mem = sysctl_tcp_mem,
2614 .sysctl_wmem = sysctl_tcp_wmem,
2615 .sysctl_rmem = sysctl_tcp_rmem,
2616 .max_header = MAX_TCP_HEADER,
2617 .obj_size = sizeof(struct tcp_sock),
2618 };
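/*
 * Note: tcp_prot is not registered from this file; the AF_INET glue
 * (inet_init() in net/ipv4/af_inet.c) is what calls proto_register() for
 * it and wires it into the inetsw protocol switch.
 */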
2622 void __init tcp_v4_init(struct net_proto_family *ops)
2623 {
2624 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2625 if (err < 0)
2626 panic("Failed to create the TCP control socket.\n");
2627 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2628 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2630 /* Unhash it so that IP input processing does not even
2631 * see it; we do not wish this socket to see incoming
2632 * packets.
2633 */
2634 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2635 }
2637 EXPORT_SYMBOL(ipv4_specific);
2638 EXPORT_SYMBOL(tcp_bind_hash);
2639 EXPORT_SYMBOL(tcp_bucket_create);
2640 EXPORT_SYMBOL(tcp_hashinfo);
2641 EXPORT_SYMBOL(tcp_inherit_port);
2642 EXPORT_SYMBOL(tcp_listen_wlock);
2643 EXPORT_SYMBOL(tcp_port_rover);
2644 EXPORT_SYMBOL(tcp_prot);
2645 EXPORT_SYMBOL(tcp_put_port);
2646 EXPORT_SYMBOL(tcp_unhash);
2647 EXPORT_SYMBOL(tcp_v4_conn_request);
2648 EXPORT_SYMBOL(tcp_v4_connect);
2649 EXPORT_SYMBOL(tcp_v4_do_rcv);
2650 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2651 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2652 EXPORT_SYMBOL(tcp_v4_send_check);
2653 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2655 #ifdef CONFIG_PROC_FS
2656 EXPORT_SYMBOL(tcp_proc_register);
2657 EXPORT_SYMBOL(tcp_proc_unregister);
2658 #endif
2659 EXPORT_SYMBOL(sysctl_local_port_range);
2660 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2661 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2662 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);