[INET]: Generalise tcp_v4_hash & tcp_unhash
net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 struct sk_buff *skb);
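/* All TCP socket hashing state lives in this shared inet_hashtables
 * structure: the bind hash (bhash), the listening hash, and the
 * established hash, whose second half holds TIME_WAIT buckets.
 */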
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
103 * 32768-61000
105 int sysctl_local_port_range[2] = { 1024, 4999 };
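/* Walk the sockets already bound to this port bucket and report a
 * conflict if a device-compatible socket with an overlapping receive
 * address is found, unless both ends set SO_REUSEADDR and the existing
 * socket is not listening.
 */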
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
109 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
110 struct sock *sk2;
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
114 sk_for_each_bound(sk2, node, &tb->owners) {
115 if (sk != sk2 &&
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
125 break;
129 return node != NULL;
132 /* Obtain a reference to a local port for the given sock;
133 * if snum is zero it means select any available local port.
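 * When no port is requested, the port rover scans the bind hash for a
 * port that has no bucket yet; an explicitly requested port is accepted
 * if its bucket allows fast reuse or tcp_bind_conflict() finds no clash.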
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
137 struct inet_bind_hashbucket *head;
138 struct hlist_node *node;
139 struct inet_bind_bucket *tb;
140 int ret;
142 local_bh_disable();
143 if (!snum) {
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
147 int rover;
149 spin_lock(&tcp_hashinfo.portalloc_lock);
150 if (tcp_hashinfo.port_rover < low)
151 rover = low;
152 else
153 rover = tcp_hashinfo.port_rover;
154 do {
155 rover++;
156 if (rover > high)
157 rover = low;
158 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (tb->port == rover)
162 goto next;
163 break;
164 next:
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
167 tcp_hashinfo.port_rover = rover;
168 spin_unlock(&tcp_hashinfo.portalloc_lock);
170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
176 ret = 1;
177 if (unlikely(remaining <= 0))
178 goto fail;
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its lock.
183 snum = rover;
184 } else {
185 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186 spin_lock(&head->lock);
187 inet_bind_bucket_for_each(tb, node, &head->chain)
188 if (tb->port == snum)
189 goto tb_found;
191 tb = NULL;
192 goto tb_not_found;
193 tb_found:
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
196 goto success;
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
199 goto success;
200 } else {
201 ret = 1;
202 if (tcp_bind_conflict(sk, tb))
203 goto fail_unlock;
206 tb_not_found:
207 ret = 1;
208 if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
209 goto fail_unlock;
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
212 tb->fastreuse = 1;
213 else
214 tb->fastreuse = 0;
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
217 tb->fastreuse = 0;
218 success:
219 if (!inet_sk(sk)->bind_hash)
220 inet_bind_hash(sk, tb, snum);
221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
222 ret = 0;
224 fail_unlock:
225 spin_unlock(&head->lock);
226 fail:
227 local_bh_enable();
228 return ret;
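/* The hash and unhash operations are now thin wrappers around the
 * protocol-independent inet_hash()/inet_unhash() helpers, parameterised
 * by tcp_hashinfo.
 */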
231 static void tcp_v4_hash(struct sock *sk)
233 inet_hash(&tcp_hashinfo, sk);
236 void tcp_unhash(struct sock *sk)
238 inet_unhash(&tcp_hashinfo, sk);
241 /* Don't inline this cruft. There are some nice properties to
242 * exploit here. The BSD API does not allow a listening TCP
243 * to specify the remote port nor the remote address for the
244 * connection. So always assume those are both wildcarded
245 * during the search since they can never be otherwise.
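 * Candidates are scored: an AF_INET socket starts at 1, a matching
 * specific local address adds 2 and a matching bound device adds 2;
 * a perfect score of 5 ends the walk early, otherwise the best match wins.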
247 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
248 const u32 daddr,
249 const unsigned short hnum,
250 const int dif)
252 struct sock *result = NULL, *sk;
253 struct hlist_node *node;
254 int score, hiscore;
256 hiscore=-1;
257 sk_for_each(sk, node, head) {
258 struct inet_sock *inet = inet_sk(sk);
260 if (inet->num == hnum && !ipv6_only_sock(sk)) {
261 __u32 rcv_saddr = inet->rcv_saddr;
263 score = (sk->sk_family == PF_INET ? 1 : 0);
264 if (rcv_saddr) {
265 if (rcv_saddr != daddr)
266 continue;
267 score+=2;
269 if (sk->sk_bound_dev_if) {
270 if (sk->sk_bound_dev_if != dif)
271 continue;
272 score+=2;
274 if (score == 5)
275 return sk;
276 if (score > hiscore) {
277 hiscore = score;
278 result = sk;
282 return result;
285 /* Optimize the common listener case. */
286 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
287 const unsigned short hnum,
288 const int dif)
290 struct sock *sk = NULL;
291 struct hlist_head *head;
293 read_lock(&tcp_hashinfo.lhash_lock);
294 head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)];
295 if (!hlist_empty(head)) {
296 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
298 if (inet->num == hnum && !sk->sk_node.next &&
299 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
300 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
301 !sk->sk_bound_dev_if)
302 goto sherry_cache;
303 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
305 if (sk) {
306 sherry_cache:
307 sock_hold(sk);
309 read_unlock(&tcp_hashinfo.lhash_lock);
310 return sk;
313 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
314 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
316 * Local BH must be disabled here.
319 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
320 const u16 sport,
321 const u32 daddr,
322 const u16 hnum,
323 const int dif)
325 struct inet_ehash_bucket *head;
326 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
327 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
328 struct sock *sk;
329 struct hlist_node *node;
330 /* Optimize here for direct hit, only listening connections can
331 * have wildcards anyways.
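 * The established chains are checked first; TIME_WAIT sockets sit in a
 * second set of chains at offset ehash_size, so a second pass over
 * head + ehash_size covers them before the caller falls back to the
 * listening hash.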
333 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
334 head = &tcp_hashinfo.ehash[hash];
335 read_lock(&head->lock);
336 sk_for_each(sk, node, &head->chain) {
337 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
338 goto hit; /* You sunk my battleship! */
341 /* Must check for a TIME_WAIT'er before going to listener hash. */
342 sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
343 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
344 goto hit;
346 sk = NULL;
347 out:
348 read_unlock(&head->lock);
349 return sk;
350 hit:
351 sock_hold(sk);
352 goto out;
355 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
356 u32 daddr, u16 hnum, int dif)
358 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
359 daddr, hnum, dif);
361 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
364 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
365 u16 dport, int dif)
367 struct sock *sk;
369 local_bh_disable();
370 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
371 local_bh_enable();
373 return sk;
376 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
378 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
380 return secure_tcp_sequence_number(skb->nh.iph->daddr,
381 skb->nh.iph->saddr,
382 skb->h.th->dest,
383 skb->h.th->source);
386 /* called with local bh disabled */
387 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
388 struct tcp_tw_bucket **twp)
390 struct inet_sock *inet = inet_sk(sk);
391 u32 daddr = inet->rcv_saddr;
392 u32 saddr = inet->daddr;
393 int dif = sk->sk_bound_dev_if;
394 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
395 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
396 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
397 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
398 struct sock *sk2;
399 struct hlist_node *node;
400 struct tcp_tw_bucket *tw;
402 write_lock(&head->lock);
404 /* Check TIME-WAIT sockets first. */
405 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
406 tw = (struct tcp_tw_bucket *)sk2;
408 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
409 struct tcp_sock *tp = tcp_sk(sk);
411 /* With PAWS, it is safe from the viewpoint
412 of data integrity. Even without PAWS it
413 is safe provided sequence spaces do not
414 overlap i.e. at data rates <= 80Mbit/sec.
416 Actually, the idea is close to VJ's one,
417 only timestamp cache is held not per host,
418 but per port pair and TW bucket is used
419 as state holder.
421 If TW bucket has been already destroyed we
422 fall back to VJ's scheme and use initial
423 timestamp retrieved from peer table.
425 if (tw->tw_ts_recent_stamp &&
426 (!twp || (sysctl_tcp_tw_reuse &&
427 xtime.tv_sec -
428 tw->tw_ts_recent_stamp > 1))) {
429 if ((tp->write_seq =
430 tw->tw_snd_nxt + 65535 + 2) == 0)
431 tp->write_seq = 1;
432 tp->rx_opt.ts_recent = tw->tw_ts_recent;
433 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
434 sock_hold(sk2);
435 goto unique;
436 } else
437 goto not_unique;
440 tw = NULL;
442 /* And established part... */
443 sk_for_each(sk2, node, &head->chain) {
444 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
445 goto not_unique;
448 unique:
449 /* Must record num and sport now. Otherwise we will see
450 * a socket with a funny identity in the hash table. */
451 inet->num = lport;
452 inet->sport = htons(lport);
453 sk->sk_hashent = hash;
454 BUG_TRAP(sk_unhashed(sk));
455 __sk_add_node(sk, &head->chain);
456 sock_prot_inc_use(sk->sk_prot);
457 write_unlock(&head->lock);
459 if (twp) {
460 *twp = tw;
461 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
462 } else if (tw) {
463 /* Silly. Should hash-dance instead... */
464 tcp_tw_deschedule(tw);
465 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
467 tcp_tw_put(tw);
470 return 0;
472 not_unique:
473 write_unlock(&head->lock);
474 return -EADDRNOTAVAIL;
477 static inline u32 connect_port_offset(const struct sock *sk)
479 const struct inet_sock *inet = inet_sk(sk);
481 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
482 inet->dport);
486 * Bind a port for a connect operation and hash it.
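 * With no port bound yet, candidate ports are probed starting at an
 * offset derived from secure_tcp_port_ephemeral(); a port already owned
 * by other connecting sockets is taken only if __tcp_v4_check_established()
 * proves the resulting four-tuple unique (possibly recycling a TIME_WAIT
 * bucket), while a completely free port is taken directly.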
488 static inline int tcp_v4_hash_connect(struct sock *sk)
490 const unsigned short snum = inet_sk(sk)->num;
491 struct inet_bind_hashbucket *head;
492 struct inet_bind_bucket *tb;
493 int ret;
495 if (!snum) {
496 int low = sysctl_local_port_range[0];
497 int high = sysctl_local_port_range[1];
498 int range = high - low;
499 int i;
500 int port;
501 static u32 hint;
502 u32 offset = hint + connect_port_offset(sk);
503 struct hlist_node *node;
504 struct tcp_tw_bucket *tw = NULL;
506 local_bh_disable();
507 for (i = 1; i <= range; i++) {
508 port = low + (i + offset) % range;
509 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
510 spin_lock(&head->lock);
512 /* Does not bother with rcv_saddr checks,
513 * because the established check is already
514 * unique enough.
516 inet_bind_bucket_for_each(tb, node, &head->chain) {
517 if (tb->port == port) {
518 BUG_TRAP(!hlist_empty(&tb->owners));
519 if (tb->fastreuse >= 0)
520 goto next_port;
521 if (!__tcp_v4_check_established(sk,
522 port,
523 &tw))
524 goto ok;
525 goto next_port;
529 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
530 if (!tb) {
531 spin_unlock(&head->lock);
532 break;
534 tb->fastreuse = -1;
535 goto ok;
537 next_port:
538 spin_unlock(&head->lock);
540 local_bh_enable();
542 return -EADDRNOTAVAIL;
545 hint += i;
547 /* Head lock still held and bh's disabled */
548 inet_bind_hash(sk, tb, port);
549 if (sk_unhashed(sk)) {
550 inet_sk(sk)->sport = htons(port);
551 __inet_hash(&tcp_hashinfo, sk, 0);
553 spin_unlock(&head->lock);
555 if (tw) {
556 tcp_tw_deschedule(tw);
557 tcp_tw_put(tw);
560 ret = 0;
561 goto out;
564 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
565 tb = inet_sk(sk)->bind_hash;
566 spin_lock_bh(&head->lock);
567 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
568 __inet_hash(&tcp_hashinfo, sk, 0);
569 spin_unlock_bh(&head->lock);
570 return 0;
571 } else {
572 spin_unlock(&head->lock);
573 /* No definite answer... Walk to established hash table */
574 ret = __tcp_v4_check_established(sk, snum, NULL);
575 out:
576 local_bh_enable();
577 return ret;
581 /* This will initiate an outgoing connection. */
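/* Route the destination, reset stale timestamp state if the peer
 * changed, move the socket to SYN-SENT, bind and hash a source port via
 * tcp_v4_hash_connect(), pick an initial sequence number if none is set,
 * and finally hand off to tcp_connect() to build and send the SYN.
 */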
582 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
584 struct inet_sock *inet = inet_sk(sk);
585 struct tcp_sock *tp = tcp_sk(sk);
586 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
587 struct rtable *rt;
588 u32 daddr, nexthop;
589 int tmp;
590 int err;
592 if (addr_len < sizeof(struct sockaddr_in))
593 return -EINVAL;
595 if (usin->sin_family != AF_INET)
596 return -EAFNOSUPPORT;
598 nexthop = daddr = usin->sin_addr.s_addr;
599 if (inet->opt && inet->opt->srr) {
600 if (!daddr)
601 return -EINVAL;
602 nexthop = inet->opt->faddr;
605 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
606 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
607 IPPROTO_TCP,
608 inet->sport, usin->sin_port, sk);
609 if (tmp < 0)
610 return tmp;
612 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
613 ip_rt_put(rt);
614 return -ENETUNREACH;
617 if (!inet->opt || !inet->opt->srr)
618 daddr = rt->rt_dst;
620 if (!inet->saddr)
621 inet->saddr = rt->rt_src;
622 inet->rcv_saddr = inet->saddr;
624 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
625 /* Reset inherited state */
626 tp->rx_opt.ts_recent = 0;
627 tp->rx_opt.ts_recent_stamp = 0;
628 tp->write_seq = 0;
631 if (sysctl_tcp_tw_recycle &&
632 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
633 struct inet_peer *peer = rt_get_peer(rt);
635 /* VJ's idea. We save the last timestamp seen from
636 * the destination in the peer table when entering TIME-WAIT state,
637 * and initialize rx_opt.ts_recent from it when trying a new connection.
640 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
641 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
642 tp->rx_opt.ts_recent = peer->tcp_ts;
646 inet->dport = usin->sin_port;
647 inet->daddr = daddr;
649 tp->ext_header_len = 0;
650 if (inet->opt)
651 tp->ext_header_len = inet->opt->optlen;
653 tp->rx_opt.mss_clamp = 536;
655 /* Socket identity is still unknown (sport may be zero).
656 * However we set the state to SYN-SENT and, without releasing the
657 * socket lock, select a source port, enter ourselves into the hash
658 * tables and complete initialization after this.
660 tcp_set_state(sk, TCP_SYN_SENT);
661 err = tcp_v4_hash_connect(sk);
662 if (err)
663 goto failure;
665 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
666 if (err)
667 goto failure;
669 /* OK, now commit destination to socket. */
670 sk_setup_caps(sk, &rt->u.dst);
672 if (!tp->write_seq)
673 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
674 inet->daddr,
675 inet->sport,
676 usin->sin_port);
678 inet->id = tp->write_seq ^ jiffies;
680 err = tcp_connect(sk);
681 rt = NULL;
682 if (err)
683 goto failure;
685 return 0;
687 failure:
688 /* This unhashes the socket and releases the local port, if necessary. */
689 tcp_set_state(sk, TCP_CLOSE);
690 ip_rt_put(rt);
691 sk->sk_route_caps = 0;
692 inet->dport = 0;
693 return err;
696 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
698 return ((struct rtable *)skb->dst)->rt_iif;
701 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
703 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
706 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
707 struct request_sock ***prevp,
708 __u16 rport,
709 __u32 raddr, __u32 laddr)
711 struct listen_sock *lopt = tp->accept_queue.listen_opt;
712 struct request_sock *req, **prev;
714 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
715 (req = *prev) != NULL;
716 prev = &req->dl_next) {
717 const struct inet_request_sock *ireq = inet_rsk(req);
719 if (ireq->rmt_port == rport &&
720 ireq->rmt_addr == raddr &&
721 ireq->loc_addr == laddr &&
722 TCP_INET_FAMILY(req->rsk_ops->family)) {
723 BUG_TRAP(!req->sk);
724 *prevp = prev;
725 break;
729 return req;
732 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
734 struct tcp_sock *tp = tcp_sk(sk);
735 struct listen_sock *lopt = tp->accept_queue.listen_opt;
736 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
738 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
739 tcp_synq_added(sk);
744 * This routine does path mtu discovery as defined in RFC1191.
746 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
747 u32 mtu)
749 struct dst_entry *dst;
750 struct inet_sock *inet = inet_sk(sk);
751 struct tcp_sock *tp = tcp_sk(sk);
753 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
754 * sent out by Linux are always < 576 bytes, so they should go through
755 * unfragmented).
757 if (sk->sk_state == TCP_LISTEN)
758 return;
760 /* We don't check in the dst entry if pmtu discovery is forbidden
761 * on this route. We just assume that no packet-too-big packets
762 * are sent back when pmtu discovery is not active.
763 * There is a small race when the user changes this flag in the
764 * route, but I think that's acceptable.
766 if ((dst = __sk_dst_check(sk, 0)) == NULL)
767 return;
769 dst->ops->update_pmtu(dst, mtu);
771 /* Something is about to go wrong... Remember the soft error
772 * in case this connection will not be able to recover.
774 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
775 sk->sk_err_soft = EMSGSIZE;
777 mtu = dst_mtu(dst);
779 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
780 tp->pmtu_cookie > mtu) {
781 tcp_sync_mss(sk, mtu);
783 /* Resend the TCP packet because it's
784 * clear that the old packet has been
785 * dropped. This is the new "fast" path mtu
786 * discovery.
788 tcp_simple_retransmit(sk);
789 } /* else let the usual retransmit timer handle it */
793 * This routine is called by the ICMP module when it gets some
794 * sort of error condition. If err < 0 then the socket should
795 * be closed and the error returned to the user. If err > 0
796 * it's just the icmp type << 8 | icmp code. After adjustment
797 * header points to the first 8 bytes of the tcp header. We need
798 * to find the appropriate port.
800 * The locking strategy used here is very "optimistic". When
801 * someone else accesses the socket the ICMP is just dropped
802 * and for some paths there is no check at all.
803 * A more general error queue to queue errors for later handling
804 * is probably better.
808 void tcp_v4_err(struct sk_buff *skb, u32 info)
810 struct iphdr *iph = (struct iphdr *)skb->data;
811 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
812 struct tcp_sock *tp;
813 struct inet_sock *inet;
814 int type = skb->h.icmph->type;
815 int code = skb->h.icmph->code;
816 struct sock *sk;
817 __u32 seq;
818 int err;
820 if (skb->len < (iph->ihl << 2) + 8) {
821 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
822 return;
825 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
826 th->source, tcp_v4_iif(skb));
827 if (!sk) {
828 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
829 return;
831 if (sk->sk_state == TCP_TIME_WAIT) {
832 tcp_tw_put((struct tcp_tw_bucket *)sk);
833 return;
836 bh_lock_sock(sk);
837 /* If too many ICMPs get dropped on busy
838 * servers this needs to be solved differently.
840 if (sock_owned_by_user(sk))
841 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
843 if (sk->sk_state == TCP_CLOSE)
844 goto out;
846 tp = tcp_sk(sk);
847 seq = ntohl(th->seq);
848 if (sk->sk_state != TCP_LISTEN &&
849 !between(seq, tp->snd_una, tp->snd_nxt)) {
850 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
851 goto out;
854 switch (type) {
855 case ICMP_SOURCE_QUENCH:
856 /* Just silently ignore these. */
857 goto out;
858 case ICMP_PARAMETERPROB:
859 err = EPROTO;
860 break;
861 case ICMP_DEST_UNREACH:
862 if (code > NR_ICMP_UNREACH)
863 goto out;
865 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
866 if (!sock_owned_by_user(sk))
867 do_pmtu_discovery(sk, iph, info);
868 goto out;
871 err = icmp_err_convert[code].errno;
872 break;
873 case ICMP_TIME_EXCEEDED:
874 err = EHOSTUNREACH;
875 break;
876 default:
877 goto out;
880 switch (sk->sk_state) {
881 struct request_sock *req, **prev;
882 case TCP_LISTEN:
883 if (sock_owned_by_user(sk))
884 goto out;
886 req = tcp_v4_search_req(tp, &prev, th->dest,
887 iph->daddr, iph->saddr);
888 if (!req)
889 goto out;
891 /* ICMPs are not backlogged, hence we cannot get
892 an established socket here.
894 BUG_TRAP(!req->sk);
896 if (seq != tcp_rsk(req)->snt_isn) {
897 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
898 goto out;
902 * Still in SYN_RECV, just remove it silently.
903 * There is no good way to pass the error to the newly
904 * created socket, and POSIX does not want network
905 * errors returned from accept().
907 tcp_synq_drop(sk, req, prev);
908 goto out;
910 case TCP_SYN_SENT:
911 case TCP_SYN_RECV: /* Cannot happen normally.
912 It can, e.g., if SYNs crossed.
914 if (!sock_owned_by_user(sk)) {
915 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
916 sk->sk_err = err;
918 sk->sk_error_report(sk);
920 tcp_done(sk);
921 } else {
922 sk->sk_err_soft = err;
924 goto out;
927 /* If we've already connected we will keep trying
928 * until we time out, or the user gives up.
930 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
931 * considered hard errors (well, FRAG_FAILED too,
932 * but it is obsoleted by pmtu discovery).
934 * Note that in the modern internet, where routing is unreliable
935 * and broken firewalls sit in every dark corner sending random
936 * errors ordered by their masters, even these two messages finally lose
937 * their original sense (even Linux sends invalid PORT_UNREACHs).
939 * Now we are in compliance with the RFCs.
940 * --ANK (980905)
943 inet = inet_sk(sk);
944 if (!sock_owned_by_user(sk) && inet->recverr) {
945 sk->sk_err = err;
946 sk->sk_error_report(sk);
947 } else { /* Only an error on timeout */
948 sk->sk_err_soft = err;
951 out:
952 bh_unlock_sock(sk);
953 sock_put(sk);
956 /* This routine computes an IPv4 TCP checksum. */
957 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
958 struct sk_buff *skb)
960 struct inet_sock *inet = inet_sk(sk);
962 if (skb->ip_summed == CHECKSUM_HW) {
963 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
964 skb->csum = offsetof(struct tcphdr, check);
965 } else {
966 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
967 csum_partial((char *)th,
968 th->doff << 2,
969 skb->csum));
974 * This routine will send an RST to the other tcp.
976 * Someone asks: why I NEVER use socket parameters (TOS, TTL, etc.)
977 * for the reset.
978 * Answer: if a packet caused the RST, it is not for a socket
979 * existing in our system; if it is matched to a socket,
980 * it is just a duplicate segment or a bug in the other side's TCP.
981 * So we build the reply based only on the parameters
982 * that arrived with the segment.
983 * Exception: precedence violation. We do not implement it in any case.
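 * The reply is therefore built from the offending segment alone: ports
 * are swapped, the sequence comes from the incoming ACK field when
 * present (otherwise the received data is ACKed), and it is sent via
 * ip_send_reply() on the private tcp_socket control socket.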
986 static void tcp_v4_send_reset(struct sk_buff *skb)
988 struct tcphdr *th = skb->h.th;
989 struct tcphdr rth;
990 struct ip_reply_arg arg;
992 /* Never send a reset in response to a reset. */
993 if (th->rst)
994 return;
996 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
997 return;
999 /* Swap the send and the receive. */
1000 memset(&rth, 0, sizeof(struct tcphdr));
1001 rth.dest = th->source;
1002 rth.source = th->dest;
1003 rth.doff = sizeof(struct tcphdr) / 4;
1004 rth.rst = 1;
1006 if (th->ack) {
1007 rth.seq = th->ack_seq;
1008 } else {
1009 rth.ack = 1;
1010 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1011 skb->len - (th->doff << 2));
1014 memset(&arg, 0, sizeof arg);
1015 arg.iov[0].iov_base = (unsigned char *)&rth;
1016 arg.iov[0].iov_len = sizeof rth;
1017 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1018 skb->nh.iph->saddr, /*XXX*/
1019 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1020 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1022 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1024 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1025 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1028 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1029 outside of socket context, is certainly ugly. What can I do?
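The reply is assembled on the stack as a bare TCP header plus an
optional timestamp option and transmitted, like the reset above, through
ip_send_reply() on the tcp_socket control socket.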
1032 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1033 u32 win, u32 ts)
1035 struct tcphdr *th = skb->h.th;
1036 struct {
1037 struct tcphdr th;
1038 u32 tsopt[3];
1039 } rep;
1040 struct ip_reply_arg arg;
1042 memset(&rep.th, 0, sizeof(struct tcphdr));
1043 memset(&arg, 0, sizeof arg);
1045 arg.iov[0].iov_base = (unsigned char *)&rep;
1046 arg.iov[0].iov_len = sizeof(rep.th);
1047 if (ts) {
1048 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1049 (TCPOPT_TIMESTAMP << 8) |
1050 TCPOLEN_TIMESTAMP);
1051 rep.tsopt[1] = htonl(tcp_time_stamp);
1052 rep.tsopt[2] = htonl(ts);
1053 arg.iov[0].iov_len = sizeof(rep);
1056 /* Swap the send and the receive. */
1057 rep.th.dest = th->source;
1058 rep.th.source = th->dest;
1059 rep.th.doff = arg.iov[0].iov_len / 4;
1060 rep.th.seq = htonl(seq);
1061 rep.th.ack_seq = htonl(ack);
1062 rep.th.ack = 1;
1063 rep.th.window = htons(win);
1065 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1066 skb->nh.iph->saddr, /*XXX*/
1067 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1068 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1070 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1072 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1075 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1077 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1079 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1080 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1082 tcp_tw_put(tw);
1085 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1087 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1088 req->ts_recent);
1091 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1092 struct request_sock *req)
1094 struct rtable *rt;
1095 const struct inet_request_sock *ireq = inet_rsk(req);
1096 struct ip_options *opt = inet_rsk(req)->opt;
1097 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1098 .nl_u = { .ip4_u =
1099 { .daddr = ((opt && opt->srr) ?
1100 opt->faddr :
1101 ireq->rmt_addr),
1102 .saddr = ireq->loc_addr,
1103 .tos = RT_CONN_FLAGS(sk) } },
1104 .proto = IPPROTO_TCP,
1105 .uli_u = { .ports =
1106 { .sport = inet_sk(sk)->sport,
1107 .dport = ireq->rmt_port } } };
1109 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1110 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1111 return NULL;
1113 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1114 ip_rt_put(rt);
1115 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1116 return NULL;
1118 return &rt->u.dst;
1122 * Send a SYN-ACK after having received a SYN.
1123 * This still operates on a request_sock only, not on a big
1124 * socket.
1126 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1127 struct dst_entry *dst)
1129 const struct inet_request_sock *ireq = inet_rsk(req);
1130 int err = -1;
1131 struct sk_buff * skb;
1133 /* First, grab a route. */
1134 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1135 goto out;
1137 skb = tcp_make_synack(sk, dst, req);
1139 if (skb) {
1140 struct tcphdr *th = skb->h.th;
1142 th->check = tcp_v4_check(th, skb->len,
1143 ireq->loc_addr,
1144 ireq->rmt_addr,
1145 csum_partial((char *)th, skb->len,
1146 skb->csum));
1148 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1149 ireq->rmt_addr,
1150 ireq->opt);
1151 if (err == NET_XMIT_CN)
1152 err = 0;
1155 out:
1156 dst_release(dst);
1157 return err;
1161 * IPv4 request_sock destructor.
1163 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1165 if (inet_rsk(req)->opt)
1166 kfree(inet_rsk(req)->opt);
1169 static inline void syn_flood_warning(struct sk_buff *skb)
1171 static unsigned long warntime;
1173 if (time_after(jiffies, (warntime + HZ * 60))) {
1174 warntime = jiffies;
1175 printk(KERN_INFO
1176 "possible SYN flooding on port %d. Sending cookies.\n",
1177 ntohs(skb->h.th->dest));
1182 * Save and compile IPv4 options into the request_sock if needed.
1184 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1185 struct sk_buff *skb)
1187 struct ip_options *opt = &(IPCB(skb)->opt);
1188 struct ip_options *dopt = NULL;
1190 if (opt && opt->optlen) {
1191 int opt_size = optlength(opt);
1192 dopt = kmalloc(opt_size, GFP_ATOMIC);
1193 if (dopt) {
1194 if (ip_options_echo(dopt, skb)) {
1195 kfree(dopt);
1196 dopt = NULL;
1200 return dopt;
1203 struct request_sock_ops tcp_request_sock_ops = {
1204 .family = PF_INET,
1205 .obj_size = sizeof(struct tcp_request_sock),
1206 .rtx_syn_ack = tcp_v4_send_synack,
1207 .send_ack = tcp_v4_reqsk_send_ack,
1208 .destructor = tcp_v4_reqsk_destructor,
1209 .send_reset = tcp_v4_send_reset,
1212 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1214 struct inet_request_sock *ireq;
1215 struct tcp_options_received tmp_opt;
1216 struct request_sock *req;
1217 __u32 saddr = skb->nh.iph->saddr;
1218 __u32 daddr = skb->nh.iph->daddr;
1219 __u32 isn = TCP_SKB_CB(skb)->when;
1220 struct dst_entry *dst = NULL;
1221 #ifdef CONFIG_SYN_COOKIES
1222 int want_cookie = 0;
1223 #else
1224 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1225 #endif
1227 /* Never answer SYNs sent to broadcast or multicast addresses */
1228 if (((struct rtable *)skb->dst)->rt_flags &
1229 (RTCF_BROADCAST | RTCF_MULTICAST))
1230 goto drop;
1232 /* TW buckets are converted to open requests without
1233 * limitation: they conserve resources and the peer is
1234 * evidently a real one.
1236 if (tcp_synq_is_full(sk) && !isn) {
1237 #ifdef CONFIG_SYN_COOKIES
1238 if (sysctl_tcp_syncookies) {
1239 want_cookie = 1;
1240 } else
1241 #endif
1242 goto drop;
1245 /* Accept backlog is full. If we have already queued enough
1246 * warm entries in the syn queue, drop the request. That is better than
1247 * clogging the syn queue with openreqs with exponentially increasing
1248 * timeout.
1250 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1251 goto drop;
1253 req = reqsk_alloc(&tcp_request_sock_ops);
1254 if (!req)
1255 goto drop;
1257 tcp_clear_options(&tmp_opt);
1258 tmp_opt.mss_clamp = 536;
1259 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1261 tcp_parse_options(skb, &tmp_opt, 0);
1263 if (want_cookie) {
1264 tcp_clear_options(&tmp_opt);
1265 tmp_opt.saw_tstamp = 0;
1268 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1269 /* Some OSes (unknown ones, but I see them on a web server which
1270 * contains information interesting only for Windows
1271 * users) do not send their timestamp in the SYN. It is an easy case:
1272 * we simply do not advertise TS support.
1274 tmp_opt.saw_tstamp = 0;
1275 tmp_opt.tstamp_ok = 0;
1277 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1279 tcp_openreq_init(req, &tmp_opt, skb);
1281 ireq = inet_rsk(req);
1282 ireq->loc_addr = daddr;
1283 ireq->rmt_addr = saddr;
1284 ireq->opt = tcp_v4_save_options(sk, skb);
1285 if (!want_cookie)
1286 TCP_ECN_create_request(req, skb->h.th);
1288 if (want_cookie) {
1289 #ifdef CONFIG_SYN_COOKIES
1290 syn_flood_warning(skb);
1291 #endif
1292 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1293 } else if (!isn) {
1294 struct inet_peer *peer = NULL;
1296 /* VJ's idea. We save the last timestamp seen
1297 * from the destination in the peer table when entering
1298 * TIME-WAIT state, and check against it before
1299 * accepting a new connection request.
1301 * If "isn" is not zero, this request hit an alive
1302 * timewait bucket, so all the necessary checks
1303 * are made in the function processing the timewait state.
1305 if (tmp_opt.saw_tstamp &&
1306 sysctl_tcp_tw_recycle &&
1307 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1308 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1309 peer->v4daddr == saddr) {
1310 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1311 (s32)(peer->tcp_ts - req->ts_recent) >
1312 TCP_PAWS_WINDOW) {
1313 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1314 dst_release(dst);
1315 goto drop_and_free;
1318 /* Kill the following clause, if you dislike this way. */
1319 else if (!sysctl_tcp_syncookies &&
1320 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1321 (sysctl_max_syn_backlog >> 2)) &&
1322 (!peer || !peer->tcp_ts_stamp) &&
1323 (!dst || !dst_metric(dst, RTAX_RTT))) {
1324 /* Without syncookies the last quarter of the
1325 * backlog is filled only with destinations
1326 * proven to be alive.
1327 * It means that we continue to communicate
1328 * with destinations already remembered
1329 * at the moment of the synflood.
1331 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1332 "request from %u.%u."
1333 "%u.%u/%u\n",
1334 NIPQUAD(saddr),
1335 ntohs(skb->h.th->source)));
1336 dst_release(dst);
1337 goto drop_and_free;
1340 isn = tcp_v4_init_sequence(sk, skb);
1342 tcp_rsk(req)->snt_isn = isn;
1344 if (tcp_v4_send_synack(sk, req, dst))
1345 goto drop_and_free;
1347 if (want_cookie) {
1348 reqsk_free(req);
1349 } else {
1350 tcp_v4_synq_add(sk, req);
1352 return 0;
1354 drop_and_free:
1355 reqsk_free(req);
1356 drop:
1357 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1358 return 0;
1363 * The three way handshake has completed - we got a valid ACK -
1364 * now create the new socket.
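 * The child inherits the request's addressing and saved IP options,
 * has its MSS synced to the new route, and is added to the established
 * hash while inheriting the parent's bound local port.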
1366 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1367 struct request_sock *req,
1368 struct dst_entry *dst)
1370 struct inet_request_sock *ireq;
1371 struct inet_sock *newinet;
1372 struct tcp_sock *newtp;
1373 struct sock *newsk;
1375 if (sk_acceptq_is_full(sk))
1376 goto exit_overflow;
1378 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1379 goto exit;
1381 newsk = tcp_create_openreq_child(sk, req, skb);
1382 if (!newsk)
1383 goto exit;
1385 sk_setup_caps(newsk, dst);
1387 newtp = tcp_sk(newsk);
1388 newinet = inet_sk(newsk);
1389 ireq = inet_rsk(req);
1390 newinet->daddr = ireq->rmt_addr;
1391 newinet->rcv_saddr = ireq->loc_addr;
1392 newinet->saddr = ireq->loc_addr;
1393 newinet->opt = ireq->opt;
1394 ireq->opt = NULL;
1395 newinet->mc_index = tcp_v4_iif(skb);
1396 newinet->mc_ttl = skb->nh.iph->ttl;
1397 newtp->ext_header_len = 0;
1398 if (newinet->opt)
1399 newtp->ext_header_len = newinet->opt->optlen;
1400 newinet->id = newtp->write_seq ^ jiffies;
1402 tcp_sync_mss(newsk, dst_mtu(dst));
1403 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1404 tcp_initialize_rcv_mss(newsk);
1406 __inet_hash(&tcp_hashinfo, newsk, 0);
1407 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1409 return newsk;
1411 exit_overflow:
1412 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1413 exit:
1414 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1415 dst_release(dst);
1416 return NULL;
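/* Map a segment arriving on a listening socket to either a pending
 * request_sock in the SYN queue, an already established child socket,
 * or (with syncookies enabled) a cookie-validated ACK.
 */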
1419 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1421 struct tcphdr *th = skb->h.th;
1422 struct iphdr *iph = skb->nh.iph;
1423 struct tcp_sock *tp = tcp_sk(sk);
1424 struct sock *nsk;
1425 struct request_sock **prev;
1426 /* Find possible connection requests. */
1427 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1428 iph->saddr, iph->daddr);
1429 if (req)
1430 return tcp_check_req(sk, skb, req, prev);
1432 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1433 th->source,
1434 skb->nh.iph->daddr,
1435 ntohs(th->dest),
1436 tcp_v4_iif(skb));
1438 if (nsk) {
1439 if (nsk->sk_state != TCP_TIME_WAIT) {
1440 bh_lock_sock(nsk);
1441 return nsk;
1443 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1444 return NULL;
1447 #ifdef CONFIG_SYN_COOKIES
1448 if (!th->rst && !th->syn && th->ack)
1449 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1450 #endif
1451 return sk;
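/* Checksum strategy: trust CHECKSUM_HW when the pseudo-header check
 * passes, verify short packets (<= 76 bytes) completely in software,
 * and otherwise only seed skb->csum so the full check can be completed
 * later.
 */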
1454 static int tcp_v4_checksum_init(struct sk_buff *skb)
1456 if (skb->ip_summed == CHECKSUM_HW) {
1457 skb->ip_summed = CHECKSUM_UNNECESSARY;
1458 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1459 skb->nh.iph->daddr, skb->csum))
1460 return 0;
1462 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1463 skb->ip_summed = CHECKSUM_NONE;
1465 if (skb->len <= 76) {
1466 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1467 skb->nh.iph->daddr,
1468 skb_checksum(skb, 0, skb->len, 0)))
1469 return -1;
1470 skb->ip_summed = CHECKSUM_UNNECESSARY;
1471 } else {
1472 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1473 skb->nh.iph->saddr,
1474 skb->nh.iph->daddr, 0);
1476 return 0;
1480 /* The socket must have its spinlock held when we get
1481 * here.
1483 * We have a potential double-lock case here, so even when
1484 * doing backlog processing we use the BH locking scheme.
1485 * This is because we cannot sleep with the original spinlock
1486 * held.
1488 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1490 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1491 TCP_CHECK_TIMER(sk);
1492 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1493 goto reset;
1494 TCP_CHECK_TIMER(sk);
1495 return 0;
1498 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1499 goto csum_err;
1501 if (sk->sk_state == TCP_LISTEN) {
1502 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1503 if (!nsk)
1504 goto discard;
1506 if (nsk != sk) {
1507 if (tcp_child_process(sk, nsk, skb))
1508 goto reset;
1509 return 0;
1513 TCP_CHECK_TIMER(sk);
1514 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1515 goto reset;
1516 TCP_CHECK_TIMER(sk);
1517 return 0;
1519 reset:
1520 tcp_v4_send_reset(skb);
1521 discard:
1522 kfree_skb(skb);
1523 /* Be careful here. If this function gets more complicated and
1524 * gcc suffers from register pressure on the x86, sk (in %ebx)
1525 * might be destroyed here. This current version compiles correctly,
1526 * but you have been warned.
1528 return 0;
1530 csum_err:
1531 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1532 goto discard;
1536 * From tcp_input.c
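 * Main receive path: validate the header and (possibly deferred)
 * checksum, fill in the TCP control block, look the socket up in the
 * established table and then the listening hash, and hand the segment
 * to tcp_v4_do_rcv(), the prequeue or the socket backlog depending on
 * who owns the socket lock.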
1539 int tcp_v4_rcv(struct sk_buff *skb)
1541 struct tcphdr *th;
1542 struct sock *sk;
1543 int ret;
1545 if (skb->pkt_type != PACKET_HOST)
1546 goto discard_it;
1548 /* Count it even if it's bad */
1549 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1551 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1552 goto discard_it;
1554 th = skb->h.th;
1556 if (th->doff < sizeof(struct tcphdr) / 4)
1557 goto bad_packet;
1558 if (!pskb_may_pull(skb, th->doff * 4))
1559 goto discard_it;
1561 /* An explanation is required here, I think.
1562 * Packet length and doff are validated by header prediction,
1563 * provided the case of th->doff==0 is eliminated.
1564 * So, we defer the checks. */
1565 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1566 tcp_v4_checksum_init(skb) < 0))
1567 goto bad_packet;
1569 th = skb->h.th;
1570 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1571 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1572 skb->len - th->doff * 4);
1573 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1574 TCP_SKB_CB(skb)->when = 0;
1575 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1576 TCP_SKB_CB(skb)->sacked = 0;
1578 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1579 skb->nh.iph->daddr, ntohs(th->dest),
1580 tcp_v4_iif(skb));
1582 if (!sk)
1583 goto no_tcp_socket;
1585 process:
1586 if (sk->sk_state == TCP_TIME_WAIT)
1587 goto do_time_wait;
1589 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1590 goto discard_and_relse;
1592 if (sk_filter(sk, skb, 0))
1593 goto discard_and_relse;
1595 skb->dev = NULL;
1597 bh_lock_sock(sk);
1598 ret = 0;
1599 if (!sock_owned_by_user(sk)) {
1600 if (!tcp_prequeue(sk, skb))
1601 ret = tcp_v4_do_rcv(sk, skb);
1602 } else
1603 sk_add_backlog(sk, skb);
1604 bh_unlock_sock(sk);
1606 sock_put(sk);
1608 return ret;
1610 no_tcp_socket:
1611 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1612 goto discard_it;
1614 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1615 bad_packet:
1616 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1617 } else {
1618 tcp_v4_send_reset(skb);
1621 discard_it:
1622 /* Discard frame. */
1623 kfree_skb(skb);
1624 return 0;
1626 discard_and_relse:
1627 sock_put(sk);
1628 goto discard_it;
1630 do_time_wait:
1631 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1632 tcp_tw_put((struct tcp_tw_bucket *) sk);
1633 goto discard_it;
1636 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1637 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1638 tcp_tw_put((struct tcp_tw_bucket *) sk);
1639 goto discard_it;
1641 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1642 skb, th, skb->len)) {
1643 case TCP_TW_SYN: {
1644 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1645 ntohs(th->dest),
1646 tcp_v4_iif(skb));
1647 if (sk2) {
1648 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1649 tcp_tw_put((struct tcp_tw_bucket *)sk);
1650 sk = sk2;
1651 goto process;
1653 /* Fall through to ACK */
1655 case TCP_TW_ACK:
1656 tcp_v4_timewait_ack(sk, skb);
1657 break;
1658 case TCP_TW_RST:
1659 goto no_tcp_socket;
1660 case TCP_TW_SUCCESS:;
1662 goto discard_it;
1665 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1667 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1668 struct inet_sock *inet = inet_sk(sk);
1670 sin->sin_family = AF_INET;
1671 sin->sin_addr.s_addr = inet->daddr;
1672 sin->sin_port = inet->dport;
1675 /* VJ's idea. Save the last timestamp seen from this destination
1676 * and hold it at least for the normal timewait interval, to use for
1677 * duplicate segment detection in subsequent connections before they
1678 * enter the synchronized state.
1681 int tcp_v4_remember_stamp(struct sock *sk)
1683 struct inet_sock *inet = inet_sk(sk);
1684 struct tcp_sock *tp = tcp_sk(sk);
1685 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1686 struct inet_peer *peer = NULL;
1687 int release_it = 0;
1689 if (!rt || rt->rt_dst != inet->daddr) {
1690 peer = inet_getpeer(inet->daddr, 1);
1691 release_it = 1;
1692 } else {
1693 if (!rt->peer)
1694 rt_bind_peer(rt, 1);
1695 peer = rt->peer;
1698 if (peer) {
1699 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1700 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1701 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1702 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1703 peer->tcp_ts = tp->rx_opt.ts_recent;
1705 if (release_it)
1706 inet_putpeer(peer);
1707 return 1;
1710 return 0;
1713 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1715 struct inet_peer *peer = NULL;
1717 peer = inet_getpeer(tw->tw_daddr, 1);
1719 if (peer) {
1720 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1721 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1722 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1723 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1724 peer->tcp_ts = tw->tw_ts_recent;
1726 inet_putpeer(peer);
1727 return 1;
1730 return 0;
1733 struct tcp_func ipv4_specific = {
1734 .queue_xmit = ip_queue_xmit,
1735 .send_check = tcp_v4_send_check,
1736 .rebuild_header = inet_sk_rebuild_header,
1737 .conn_request = tcp_v4_conn_request,
1738 .syn_recv_sock = tcp_v4_syn_recv_sock,
1739 .remember_stamp = tcp_v4_remember_stamp,
1740 .net_header_len = sizeof(struct iphdr),
1741 .setsockopt = ip_setsockopt,
1742 .getsockopt = ip_getsockopt,
1743 .addr2sockaddr = v4_addr2sockaddr,
1744 .sockaddr_len = sizeof(struct sockaddr_in),
1747 /* NOTE: A lot of things are set to zero explicitly by the call to
1748 * sk_alloc(), so they need not be done here.
1750 static int tcp_v4_init_sock(struct sock *sk)
1752 struct tcp_sock *tp = tcp_sk(sk);
1754 skb_queue_head_init(&tp->out_of_order_queue);
1755 tcp_init_xmit_timers(sk);
1756 tcp_prequeue_init(tp);
1758 tp->rto = TCP_TIMEOUT_INIT;
1759 tp->mdev = TCP_TIMEOUT_INIT;
1761 /* So many TCP implementations out there (incorrectly) count the
1762 * initial SYN frame in their delayed-ACK and congestion control
1763 * algorithms that we must have the following bandaid to talk
1764 * efficiently to them. -DaveM
1766 tp->snd_cwnd = 2;
1768 /* See draft-stevens-tcpca-spec-01 for discussion of the
1769 * initialization of these values.
1771 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1772 tp->snd_cwnd_clamp = ~0;
1773 tp->mss_cache = 536;
1775 tp->reordering = sysctl_tcp_reordering;
1776 tp->ca_ops = &tcp_init_congestion_ops;
1778 sk->sk_state = TCP_CLOSE;
1780 sk->sk_write_space = sk_stream_write_space;
1781 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1783 tp->af_specific = &ipv4_specific;
1785 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1786 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1788 atomic_inc(&tcp_sockets_allocated);
1790 return 0;
1793 int tcp_v4_destroy_sock(struct sock *sk)
1795 struct tcp_sock *tp = tcp_sk(sk);
1797 tcp_clear_xmit_timers(sk);
1799 tcp_cleanup_congestion_control(tp);
1801 /* Clean up the write buffer. */
1802 sk_stream_writequeue_purge(sk);
1804 /* Cleans up our, hopefully empty, out_of_order_queue. */
1805 __skb_queue_purge(&tp->out_of_order_queue);
1807 /* Clean the prequeue; it really should be empty already. */
1808 __skb_queue_purge(&tp->ucopy.prequeue);
1810 /* Clean up a referenced TCP bind bucket. */
1811 if (inet_sk(sk)->bind_hash)
1812 inet_put_port(&tcp_hashinfo, sk);
1815 * If sendmsg cached page exists, toss it.
1817 if (sk->sk_sndmsg_page) {
1818 __free_page(sk->sk_sndmsg_page);
1819 sk->sk_sndmsg_page = NULL;
1822 atomic_dec(&tcp_sockets_allocated);
1824 return 0;
1827 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1829 #ifdef CONFIG_PROC_FS
1830 /* Proc filesystem TCP sock list dumping. */
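/* The iterator walks the listening hash first, descending into each
 * listener's SYN queue, then the established half and finally the
 * TIME_WAIT half of the ehash table; its position is tracked in
 * struct tcp_iter_state.
 */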
1832 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1834 return hlist_empty(head) ? NULL :
1835 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1838 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1840 return tw->tw_node.next ?
1841 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1844 static void *listening_get_next(struct seq_file *seq, void *cur)
1846 struct tcp_sock *tp;
1847 struct hlist_node *node;
1848 struct sock *sk = cur;
1849 struct tcp_iter_state* st = seq->private;
1851 if (!sk) {
1852 st->bucket = 0;
1853 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1854 goto get_sk;
1857 ++st->num;
1859 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1860 struct request_sock *req = cur;
1862 tp = tcp_sk(st->syn_wait_sk);
1863 req = req->dl_next;
1864 while (1) {
1865 while (req) {
1866 if (req->rsk_ops->family == st->family) {
1867 cur = req;
1868 goto out;
1870 req = req->dl_next;
1872 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1873 break;
1874 get_req:
1875 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1877 sk = sk_next(st->syn_wait_sk);
1878 st->state = TCP_SEQ_STATE_LISTENING;
1879 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1880 } else {
1881 tp = tcp_sk(sk);
1882 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1883 if (reqsk_queue_len(&tp->accept_queue))
1884 goto start_req;
1885 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1886 sk = sk_next(sk);
1888 get_sk:
1889 sk_for_each_from(sk, node) {
1890 if (sk->sk_family == st->family) {
1891 cur = sk;
1892 goto out;
1894 tp = tcp_sk(sk);
1895 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1896 if (reqsk_queue_len(&tp->accept_queue)) {
1897 start_req:
1898 st->uid = sock_i_uid(sk);
1899 st->syn_wait_sk = sk;
1900 st->state = TCP_SEQ_STATE_OPENREQ;
1901 st->sbucket = 0;
1902 goto get_req;
1904 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1906 if (++st->bucket < INET_LHTABLE_SIZE) {
1907 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1908 goto get_sk;
1910 cur = NULL;
1911 out:
1912 return cur;
1915 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1917 void *rc = listening_get_next(seq, NULL);
1919 while (rc && *pos) {
1920 rc = listening_get_next(seq, rc);
1921 --*pos;
1923 return rc;
1926 static void *established_get_first(struct seq_file *seq)
1928 struct tcp_iter_state* st = seq->private;
1929 void *rc = NULL;
1931 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1932 struct sock *sk;
1933 struct hlist_node *node;
1934 struct tcp_tw_bucket *tw;
1936 /* We can reschedule _before_ having picked the target: */
1937 cond_resched_softirq();
1939 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1940 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1941 if (sk->sk_family != st->family) {
1942 continue;
1944 rc = sk;
1945 goto out;
1947 st->state = TCP_SEQ_STATE_TIME_WAIT;
1948 tw_for_each(tw, node,
1949 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1950 if (tw->tw_family != st->family) {
1951 continue;
1953 rc = tw;
1954 goto out;
1956 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1957 st->state = TCP_SEQ_STATE_ESTABLISHED;
1959 out:
1960 return rc;
1963 static void *established_get_next(struct seq_file *seq, void *cur)
1965 struct sock *sk = cur;
1966 struct tcp_tw_bucket *tw;
1967 struct hlist_node *node;
1968 struct tcp_iter_state* st = seq->private;
1970 ++st->num;
1972 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1973 tw = cur;
1974 tw = tw_next(tw);
1975 get_tw:
1976 while (tw && tw->tw_family != st->family) {
1977 tw = tw_next(tw);
1979 if (tw) {
1980 cur = tw;
1981 goto out;
1983 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1984 st->state = TCP_SEQ_STATE_ESTABLISHED;
1986 /* We can reschedule between buckets: */
1987 cond_resched_softirq();
1989 if (++st->bucket < tcp_hashinfo.ehash_size) {
1990 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1991 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1992 } else {
1993 cur = NULL;
1994 goto out;
1996 } else
1997 sk = sk_next(sk);
1999 sk_for_each_from(sk, node) {
2000 if (sk->sk_family == st->family)
2001 goto found;
2004 st->state = TCP_SEQ_STATE_TIME_WAIT;
2005 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2006 goto get_tw;
2007 found:
2008 cur = sk;
2009 out:
2010 return cur;
2013 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2015 void *rc = established_get_first(seq);
2017 while (rc && pos) {
2018 rc = established_get_next(seq, rc);
2019 --pos;
2021 return rc;
2024 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2026 void *rc;
2027 struct tcp_iter_state* st = seq->private;
2029 inet_listen_lock(&tcp_hashinfo);
2030 st->state = TCP_SEQ_STATE_LISTENING;
2031 rc = listening_get_idx(seq, &pos);
2033 if (!rc) {
2034 inet_listen_unlock(&tcp_hashinfo);
2035 local_bh_disable();
2036 st->state = TCP_SEQ_STATE_ESTABLISHED;
2037 rc = established_get_idx(seq, pos);
2040 return rc;
2043 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2045 struct tcp_iter_state* st = seq->private;
2046 st->state = TCP_SEQ_STATE_LISTENING;
2047 st->num = 0;
2048 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2051 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2053 void *rc = NULL;
2054 struct tcp_iter_state* st;
2056 if (v == SEQ_START_TOKEN) {
2057 rc = tcp_get_idx(seq, 0);
2058 goto out;
2060 st = seq->private;
2062 switch (st->state) {
2063 case TCP_SEQ_STATE_OPENREQ:
2064 case TCP_SEQ_STATE_LISTENING:
2065 rc = listening_get_next(seq, v);
2066 if (!rc) {
2067 inet_listen_unlock(&tcp_hashinfo);
2068 local_bh_disable();
2069 st->state = TCP_SEQ_STATE_ESTABLISHED;
2070 rc = established_get_first(seq);
2072 break;
2073 case TCP_SEQ_STATE_ESTABLISHED:
2074 case TCP_SEQ_STATE_TIME_WAIT:
2075 rc = established_get_next(seq, v);
2076 break;
2078 out:
2079 ++*pos;
2080 return rc;
2083 static void tcp_seq_stop(struct seq_file *seq, void *v)
2085 struct tcp_iter_state* st = seq->private;
2087 switch (st->state) {
2088 case TCP_SEQ_STATE_OPENREQ:
2089 if (v) {
2090 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2091 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2093 case TCP_SEQ_STATE_LISTENING:
2094 if (v != SEQ_START_TOKEN)
2095 inet_listen_unlock(&tcp_hashinfo);
2096 break;
2097 case TCP_SEQ_STATE_TIME_WAIT:
2098 case TCP_SEQ_STATE_ESTABLISHED:
2099 if (v)
2100 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2101 local_bh_enable();
2102 break;
2106 static int tcp_seq_open(struct inode *inode, struct file *file)
2108 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2109 struct seq_file *seq;
2110 struct tcp_iter_state *s;
2111 int rc;
2113 if (unlikely(afinfo == NULL))
2114 return -EINVAL;
2116 s = kmalloc(sizeof(*s), GFP_KERNEL);
2117 if (!s)
2118 return -ENOMEM;
2119 memset(s, 0, sizeof(*s));
2120 s->family = afinfo->family;
2121 s->seq_ops.start = tcp_seq_start;
2122 s->seq_ops.next = tcp_seq_next;
2123 s->seq_ops.show = afinfo->seq_show;
2124 s->seq_ops.stop = tcp_seq_stop;
2126 rc = seq_open(file, &s->seq_ops);
2127 if (rc)
2128 goto out_kfree;
2129 seq = file->private_data;
2130 seq->private = s;
2131 out:
2132 return rc;
2133 out_kfree:
2134 kfree(s);
2135 goto out;
2138 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2140 int rc = 0;
2141 struct proc_dir_entry *p;
2143 if (!afinfo)
2144 return -EINVAL;
2145 afinfo->seq_fops->owner = afinfo->owner;
2146 afinfo->seq_fops->open = tcp_seq_open;
2147 afinfo->seq_fops->read = seq_read;
2148 afinfo->seq_fops->llseek = seq_lseek;
2149 afinfo->seq_fops->release = seq_release_private;
2151 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2152 if (p)
2153 p->data = afinfo;
2154 else
2155 rc = -ENOMEM;
2156 return rc;
2159 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2161 if (!afinfo)
2162 return;
2163 proc_net_remove(afinfo->name);
2164 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2167 static void get_openreq4(struct sock *sk, struct request_sock *req,
2168 char *tmpbuf, int i, int uid)
2170 const struct inet_request_sock *ireq = inet_rsk(req);
2171 int ttd = req->expires - jiffies;
2173 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2174 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2176 ireq->loc_addr,
2177 ntohs(inet_sk(sk)->sport),
2178 ireq->rmt_addr,
2179 ntohs(ireq->rmt_port),
2180 TCP_SYN_RECV,
2181 0, 0, /* could print option size, but that is af dependent. */
2182 1, /* timers active (only the expire timer) */
2183 jiffies_to_clock_t(ttd),
2184 req->retrans,
2185 uid,
2186 0, /* non standard timer */
2187 0, /* open_requests have no inode */
2188 atomic_read(&sk->sk_refcnt),
2189 req);
2192 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2194 int timer_active;
2195 unsigned long timer_expires;
2196 struct tcp_sock *tp = tcp_sk(sp);
2197 struct inet_sock *inet = inet_sk(sp);
2198 unsigned int dest = inet->daddr;
2199 unsigned int src = inet->rcv_saddr;
2200 __u16 destp = ntohs(inet->dport);
2201 __u16 srcp = ntohs(inet->sport);
2203 if (tp->pending == TCP_TIME_RETRANS) {
2204 timer_active = 1;
2205 timer_expires = tp->timeout;
2206 } else if (tp->pending == TCP_TIME_PROBE0) {
2207 timer_active = 4;
2208 timer_expires = tp->timeout;
2209 } else if (timer_pending(&sp->sk_timer)) {
2210 timer_active = 2;
2211 timer_expires = sp->sk_timer.expires;
2212 } else {
2213 timer_active = 0;
2214 timer_expires = jiffies;
2217 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2218 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2219 i, src, srcp, dest, destp, sp->sk_state,
2220 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2221 timer_active,
2222 jiffies_to_clock_t(timer_expires - jiffies),
2223 tp->retransmits,
2224 sock_i_uid(sp),
2225 tp->probes_out,
2226 sock_i_ino(sp),
2227 atomic_read(&sp->sk_refcnt), sp,
2228 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2229 tp->snd_cwnd,
2230 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2233 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2235 unsigned int dest, src;
2236 __u16 destp, srcp;
2237 int ttd = tw->tw_ttd - jiffies;
2239 if (ttd < 0)
2240 ttd = 0;
2242 dest = tw->tw_daddr;
2243 src = tw->tw_rcv_saddr;
2244 destp = ntohs(tw->tw_dport);
2245 srcp = ntohs(tw->tw_sport);
2247 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2248 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2249 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2250 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2251 atomic_read(&tw->tw_refcnt), tw);
2254 #define TMPSZ 150
2256 static int tcp4_seq_show(struct seq_file *seq, void *v)
2258 struct tcp_iter_state* st;
2259 char tmpbuf[TMPSZ + 1];
2261 if (v == SEQ_START_TOKEN) {
2262 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2263 " sl local_address rem_address st tx_queue "
2264 "rx_queue tr tm->when retrnsmt uid timeout "
2265 "inode");
2266 goto out;
2268 st = seq->private;
2270 switch (st->state) {
2271 case TCP_SEQ_STATE_LISTENING:
2272 case TCP_SEQ_STATE_ESTABLISHED:
2273 get_tcp4_sock(v, tmpbuf, st->num);
2274 break;
2275 case TCP_SEQ_STATE_OPENREQ:
2276 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2277 break;
2278 case TCP_SEQ_STATE_TIME_WAIT:
2279 get_timewait4_sock(v, tmpbuf, st->num);
2280 break;
2282 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2283 out:
2284 return 0;
2287 static struct file_operations tcp4_seq_fops;
2288 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2289 .owner = THIS_MODULE,
2290 .name = "tcp",
2291 .family = AF_INET,
2292 .seq_show = tcp4_seq_show,
2293 .seq_fops = &tcp4_seq_fops,
2296 int __init tcp4_proc_init(void)
2298 return tcp_proc_register(&tcp4_seq_afinfo);
2301 void tcp4_proc_exit(void)
2303 tcp_proc_unregister(&tcp4_seq_afinfo);
2305 #endif /* CONFIG_PROC_FS */
2307 struct proto tcp_prot = {
2308 .name = "TCP",
2309 .owner = THIS_MODULE,
2310 .close = tcp_close,
2311 .connect = tcp_v4_connect,
2312 .disconnect = tcp_disconnect,
2313 .accept = tcp_accept,
2314 .ioctl = tcp_ioctl,
2315 .init = tcp_v4_init_sock,
2316 .destroy = tcp_v4_destroy_sock,
2317 .shutdown = tcp_shutdown,
2318 .setsockopt = tcp_setsockopt,
2319 .getsockopt = tcp_getsockopt,
2320 .sendmsg = tcp_sendmsg,
2321 .recvmsg = tcp_recvmsg,
2322 .backlog_rcv = tcp_v4_do_rcv,
2323 .hash = tcp_v4_hash,
2324 .unhash = tcp_unhash,
2325 .get_port = tcp_v4_get_port,
2326 .enter_memory_pressure = tcp_enter_memory_pressure,
2327 .sockets_allocated = &tcp_sockets_allocated,
2328 .memory_allocated = &tcp_memory_allocated,
2329 .memory_pressure = &tcp_memory_pressure,
2330 .sysctl_mem = sysctl_tcp_mem,
2331 .sysctl_wmem = sysctl_tcp_wmem,
2332 .sysctl_rmem = sysctl_tcp_rmem,
2333 .max_header = MAX_TCP_HEADER,
2334 .obj_size = sizeof(struct tcp_sock),
2335 .rsk_prot = &tcp_request_sock_ops,
2340 void __init tcp_v4_init(struct net_proto_family *ops)
2342 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2343 if (err < 0)
2344 panic("Failed to create the TCP control socket.\n");
2345 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2346 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2348 /* Unhash it so that IP input processing does not even
2349 * see it; we do not wish this socket to see incoming
2350 * packets.
2352 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2355 EXPORT_SYMBOL(ipv4_specific);
2356 EXPORT_SYMBOL(inet_bind_bucket_create);
2357 EXPORT_SYMBOL(tcp_hashinfo);
2358 EXPORT_SYMBOL(tcp_prot);
2359 EXPORT_SYMBOL(tcp_unhash);
2360 EXPORT_SYMBOL(tcp_v4_conn_request);
2361 EXPORT_SYMBOL(tcp_v4_connect);
2362 EXPORT_SYMBOL(tcp_v4_do_rcv);
2363 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2364 EXPORT_SYMBOL(tcp_v4_send_check);
2365 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2367 #ifdef CONFIG_PROC_FS
2368 EXPORT_SYMBOL(tcp_proc_register);
2369 EXPORT_SYMBOL(tcp_proc_unregister);
2370 #endif
2371 EXPORT_SYMBOL(sysctl_local_port_range);
2372 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2373 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);