Linux 2.4.0-test7-pre6
[davej-history.git] / net / ipv4 / tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.212 2000/08/18 17:10:04 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after a year in a coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/cache.h>
55 #include <linux/init.h>
57 #include <net/icmp.h>
58 #include <net/tcp.h>
59 #include <net/ipv6.h>
60 #include <net/inet_common.h>
62 #include <linux/inet.h>
63 #include <linux/stddef.h>
64 #include <linux/ipsec.h>
66 extern int sysctl_ip_dynaddr;
68 /* Check TCP sequence numbers in ICMP packets. */
69 #define ICMP_MIN_LENGTH 8
71 /* Socket used for sending RSTs */
72 static struct inode tcp_inode;
73 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
75 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
76 struct sk_buff *skb);
78 /* This is for sockets with full identity only. Sockets here will always
79 * be without wildcards and will have the following invariant:
80 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
82 * First half of the table is for sockets not in TIME_WAIT, second half
83 * is for TIME_WAIT sockets only.
85 struct tcp_ehash_bucket *tcp_ehash = NULL;
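/* A lookup therefore probes &tcp_ehash[hash] for live sockets first and,
 * failing that, &tcp_ehash[hash + tcp_ehash_size] for a TIME_WAIT bucket
 * that hashes to the same slot; see __tcp_v4_lookup_established() below.
 */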
87 /* Ok, let's try this, I give up, we do need a local binding
88 * TCP hash as well as the others for fast bind/connect.
90 struct tcp_bind_hashbucket *tcp_bhash = NULL;
92 int tcp_bhash_size = 0;
93 int tcp_ehash_size = 0;
95 /* All sockets in TCP_LISTEN state will be in here. This is the only table
96 * where wildcard'd TCP sockets can exist. Hash function here is just local
97 * port number.
99 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE] = { NULL, };
100 char __tcp_clean_cacheline_pad[(SMP_CACHE_BYTES -
101 (((sizeof(void *) * (TCP_LHTABLE_SIZE + 2)) +
102 (sizeof(int) * 2)) % SMP_CACHE_BYTES))] = { 0, };
104 rwlock_t tcp_lhash_lock = RW_LOCK_UNLOCKED;
105 atomic_t tcp_lhash_users = ATOMIC_INIT(0);
106 DECLARE_WAIT_QUEUE_HEAD(tcp_lhash_wait);
108 spinlock_t tcp_portalloc_lock = SPIN_LOCK_UNLOCKED;
111 * This array holds the first and last local port number.
112 * For high-usage systems, use sysctl to change this to
113 * 32768-61000
115 int sysctl_local_port_range[2] = { 1024, 4999 };
116 int tcp_port_rover = (1024 - 1);
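/* For example (assuming the standard procfs name for this sysctl), the
 * range can be widened at run time on a busy box with:
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * tcp_v4_get_port() below walks this interval with tcp_port_rover when the
 * caller asks for any free port, so a wider range simply gives the rover
 * more candidates before the search is declared exhausted.
 */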
118 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
119 __u32 faddr, __u16 fport)
121 int h = ((laddr ^ lport) ^ (faddr ^ fport));
122 h ^= h>>16;
123 h ^= h>>8;
124 return h & (tcp_ehash_size - 1);
127 static __inline__ int tcp_sk_hashfn(struct sock *sk)
129 __u32 laddr = sk->rcv_saddr;
130 __u16 lport = sk->num;
131 __u32 faddr = sk->daddr;
132 __u16 fport = sk->dport;
134 return tcp_hashfn(laddr, lport, faddr, fport);
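/* A small user-space sketch of the same fold (example values only):
 * because tcp_ehash_size is a power of two, the final mask picks the
 * bucket, and the two shifts fold the high bits of the XORed addresses
 * and ports down into the bits that survive that mask.
 *
 *	static int demo_ehashfn(unsigned int laddr, unsigned short lport,
 *				unsigned int faddr, unsigned short fport,
 *				int ehash_size)
 *	{
 *		int h = ((laddr ^ lport) ^ (faddr ^ fport));
 *		h ^= h>>16;
 *		h ^= h>>8;
 *		return h & (ehash_size - 1);
 *	}
 *
 * e.g. demo_ehashfn(0x0a000001, 80, 0x0a000002, 1025, 512) picks one of
 * 512 buckets for 10.0.0.1:80 <-> 10.0.0.2:1025.
 */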
137 /* Allocate and initialize a new TCP local port bind bucket.
138 * The bindhash mutex for snum's hash chain must be held here.
140 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
141 unsigned short snum)
143 struct tcp_bind_bucket *tb;
145 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
146 if(tb != NULL) {
147 tb->port = snum;
148 tb->fastreuse = 0;
149 tb->owners = NULL;
150 if((tb->next = head->chain) != NULL)
151 tb->next->pprev = &tb->next;
152 head->chain = tb;
153 tb->pprev = &head->chain;
155 return tb;
158 /* Caller must disable local BH processing. */
159 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
161 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
162 struct tcp_bind_bucket *tb;
164 spin_lock(&head->lock);
165 tb = (struct tcp_bind_bucket *)sk->prev;
166 if ((child->bind_next = tb->owners) != NULL)
167 tb->owners->bind_pprev = &child->bind_next;
168 tb->owners = child;
169 child->bind_pprev = &tb->owners;
170 child->prev = (struct sock *) tb;
171 spin_unlock(&head->lock);
174 __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
176 local_bh_disable();
177 __tcp_inherit_port(sk, child);
178 local_bh_enable();
181 /* Obtain a reference to a local port for the given sock;
182 * if snum is zero it means select any available local port.
184 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
186 struct tcp_bind_hashbucket *head;
187 struct tcp_bind_bucket *tb;
188 int ret;
190 local_bh_disable();
191 if (snum == 0) {
192 int low = sysctl_local_port_range[0];
193 int high = sysctl_local_port_range[1];
194 int remaining = (high - low) + 1;
195 int rover;
197 spin_lock(&tcp_portalloc_lock);
198 rover = tcp_port_rover;
199 do { rover++;
200 if ((rover < low) || (rover > high))
201 rover = low;
202 head = &tcp_bhash[tcp_bhashfn(rover)];
203 spin_lock(&head->lock);
204 for (tb = head->chain; tb; tb = tb->next)
205 if (tb->port == rover)
206 goto next;
207 break;
208 next:
209 spin_unlock(&head->lock);
210 } while (--remaining > 0);
211 tcp_port_rover = rover;
212 spin_unlock(&tcp_portalloc_lock);
214 /* Exhausted local port range during search? */
215 ret = 1;
216 if (remaining <= 0)
217 goto fail;
219 /* OK, here is the one we will use. HEAD is
220 * non-NULL and we hold its mutex.
222 snum = rover;
223 tb = NULL;
224 } else {
225 head = &tcp_bhash[tcp_bhashfn(snum)];
226 spin_lock(&head->lock);
227 for (tb = head->chain; tb != NULL; tb = tb->next)
228 if (tb->port == snum)
229 break;
231 if (tb != NULL && tb->owners != NULL) {
232 if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
233 goto success;
234 } else {
235 struct sock *sk2 = tb->owners;
236 int sk_reuse = sk->reuse;
238 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
239 if (sk != sk2 &&
240 sk->bound_dev_if == sk2->bound_dev_if) {
241 if (!sk_reuse ||
242 !sk2->reuse ||
243 sk2->state == TCP_LISTEN) {
244 if (!sk2->rcv_saddr ||
245 !sk->rcv_saddr ||
246 (sk2->rcv_saddr == sk->rcv_saddr))
247 break;
251 /* If we found a conflict, fail. */
252 ret = 1;
253 if (sk2 != NULL)
254 goto fail_unlock;
257 ret = 1;
258 if (tb == NULL &&
259 (tb = tcp_bucket_create(head, snum)) == NULL)
260 goto fail_unlock;
261 if (tb->owners == NULL) {
262 if (sk->reuse && sk->state != TCP_LISTEN)
263 tb->fastreuse = 1;
264 else
265 tb->fastreuse = 0;
266 } else if (tb->fastreuse &&
267 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
268 tb->fastreuse = 0;
269 success:
270 sk->num = snum;
271 if (sk->prev == NULL) {
272 if ((sk->bind_next = tb->owners) != NULL)
273 tb->owners->bind_pprev = &sk->bind_next;
274 tb->owners = sk;
275 sk->bind_pprev = &tb->owners;
276 sk->prev = (struct sock *) tb;
277 } else {
278 BUG_TRAP(sk->prev == (struct sock *) tb);
280 ret = 0;
282 fail_unlock:
283 spin_unlock(&head->lock);
284 fail:
285 local_bh_enable();
286 return ret;
289 /* Get rid of any references to a local port held by the
290 * given sock.
292 __inline__ void __tcp_put_port(struct sock *sk)
294 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
295 struct tcp_bind_bucket *tb;
297 spin_lock(&head->lock);
298 tb = (struct tcp_bind_bucket *) sk->prev;
299 if (sk->bind_next)
300 sk->bind_next->bind_pprev = sk->bind_pprev;
301 *(sk->bind_pprev) = sk->bind_next;
302 sk->prev = NULL;
303 if (tb->owners == NULL) {
304 if (tb->next)
305 tb->next->pprev = tb->pprev;
306 *(tb->pprev) = tb->next;
307 kmem_cache_free(tcp_bucket_cachep, tb);
309 spin_unlock(&head->lock);
312 void tcp_put_port(struct sock *sk)
314 local_bh_disable();
315 __tcp_put_port(sk);
316 local_bh_enable();
319 /* This lock without TASK_EXCLUSIVE is good on UP and it can be very bad on SMP.
320 * Look, when several writers sleep and a reader wakes them up, all but one
321 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
322 * this, _but_ remember, it adds useless work on UP machines (wake up on each
323 * exclusive lock release). It should really be ifdefed.
326 void tcp_listen_wlock(void)
328 write_lock(&tcp_lhash_lock);
330 if (atomic_read(&tcp_lhash_users)) {
331 DECLARE_WAITQUEUE(wait, current);
333 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
334 for (;;) {
335 set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE);
336 if (atomic_read(&tcp_lhash_users) == 0)
337 break;
338 write_unlock_bh(&tcp_lhash_lock);
339 schedule();
340 write_lock_bh(&tcp_lhash_lock);
343 __set_current_state(TASK_RUNNING);
344 remove_wait_queue(&tcp_lhash_wait, &wait);
348 static __inline__ void __tcp_v4_hash(struct sock *sk)
350 struct sock **skp;
351 rwlock_t *lock;
353 BUG_TRAP(sk->pprev==NULL);
354 if(sk->state == TCP_LISTEN) {
355 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
356 lock = &tcp_lhash_lock;
357 tcp_listen_wlock();
358 } else {
359 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
360 lock = &tcp_ehash[sk->hashent].lock;
361 write_lock(lock);
363 if((sk->next = *skp) != NULL)
364 (*skp)->pprev = &sk->next;
365 *skp = sk;
366 sk->pprev = skp;
367 sock_prot_inc_use(sk->prot);
368 write_unlock(lock);
369 if (sk->state == TCP_LISTEN)
370 wake_up(&tcp_lhash_wait);
373 static void tcp_v4_hash(struct sock *sk)
375 if (sk->state != TCP_CLOSE) {
376 local_bh_disable();
377 __tcp_v4_hash(sk);
378 local_bh_enable();
382 void tcp_unhash(struct sock *sk)
384 rwlock_t *lock;
386 if (sk->state == TCP_LISTEN) {
387 local_bh_disable();
388 tcp_listen_wlock();
389 lock = &tcp_lhash_lock;
390 } else {
391 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
392 lock = &head->lock;
393 write_lock_bh(&head->lock);
396 if(sk->pprev) {
397 if(sk->next)
398 sk->next->pprev = sk->pprev;
399 *sk->pprev = sk->next;
400 sk->pprev = NULL;
401 sock_prot_dec_use(sk->prot);
403 write_unlock_bh(lock);
404 if (sk->state == TCP_LISTEN)
405 wake_up(&tcp_lhash_wait);
408 /* Don't inline this cruft. There are some nice properties to
409 * exploit here. The BSD API does not allow a listening TCP
410 * to specify the remote port or the remote address for the
411 * connection. So always assume those are both wildcarded
412 * during the search since they can never be otherwise.
414 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
416 struct sock *result = NULL;
417 int score, hiscore;
419 hiscore=0;
420 for(; sk; sk = sk->next) {
421 if(sk->num == hnum) {
422 __u32 rcv_saddr = sk->rcv_saddr;
424 score = 1;
425 if(rcv_saddr) {
426 if (rcv_saddr != daddr)
427 continue;
428 score++;
430 if (sk->bound_dev_if) {
431 if (sk->bound_dev_if != dif)
432 continue;
433 score++;
435 if (score == 3)
436 return sk;
437 if (score > hiscore) {
438 hiscore = score;
439 result = sk;
443 return result;
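/* Worked example of the scoring above (addresses and ifindex are
 * illustrative values only): for a segment to 10.0.0.1 port 80 arriving
 * on ifindex 2, a listener bound to 10.0.0.1 and to that device scores 3
 * and is returned immediately; a wildcard listener bound only to the
 * port scores 1 and is kept merely as the fallback result.
 */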
446 /* Optimize the common listener case. */
447 __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
449 struct sock *sk;
451 read_lock(&tcp_lhash_lock);
452 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
453 if (sk) {
454 if (sk->num == hnum &&
455 sk->next == NULL &&
456 (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
457 !sk->bound_dev_if)
458 goto sherry_cache;
459 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
461 if (sk) {
462 sherry_cache:
463 sock_hold(sk);
465 read_unlock(&tcp_lhash_lock);
466 return sk;
469 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
470 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
472 * Local BH must be disabled here.
475 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
476 u32 daddr, u16 hnum, int dif)
478 struct tcp_ehash_bucket *head;
479 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
480 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
481 struct sock *sk;
482 int hash;
484 /* Optimize here for direct hit, only listening connections can
485 * have wildcards anyway.
487 hash = tcp_hashfn(daddr, hnum, saddr, sport);
488 head = &tcp_ehash[hash];
489 read_lock(&head->lock);
490 for(sk = head->chain; sk; sk = sk->next) {
491 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
492 goto hit; /* You sunk my battleship! */
495 /* Must check for a TIME_WAIT'er before going to listener hash. */
496 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
497 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
498 goto hit;
499 read_unlock(&head->lock);
501 return NULL;
503 hit:
504 sock_hold(sk);
505 read_unlock(&head->lock);
506 return sk;
509 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
510 u32 daddr, u16 hnum, int dif)
512 struct sock *sk;
514 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
516 if (sk)
517 return sk;
519 return tcp_v4_lookup_listener(daddr, hnum, dif);
522 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
524 struct sock *sk;
526 local_bh_disable();
527 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
528 local_bh_enable();
530 return sk;
533 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
535 return secure_tcp_sequence_number(skb->nh.iph->daddr,
536 skb->nh.iph->saddr,
537 skb->h.th->dest,
538 skb->h.th->source);
541 static int tcp_v4_check_established(struct sock *sk)
543 u32 daddr = sk->rcv_saddr;
544 u32 saddr = sk->daddr;
545 int dif = sk->bound_dev_if;
546 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
547 __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
548 int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
549 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
550 struct sock *sk2, **skp;
551 struct tcp_tw_bucket *tw;
553 write_lock_bh(&head->lock);
555 /* Check TIME-WAIT sockets first. */
556 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
557 skp = &sk2->next) {
558 tw = (struct tcp_tw_bucket*)sk2;
560 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
561 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
563 /* With PAWS, it is safe from the viewpoint
564 of data integrity. Even without PAWS it
565 is safe provided sequence spaces do not
566 overlap, i.e. at data rates <= 80Mbit/sec.
568 Actually, the idea is close to VJ's, except
569 that the timestamp cache is held not per host
570 but per port pair, and the TW bucket is used
571 as the state holder.
573 If the TW bucket has already been destroyed we
574 fall back to VJ's scheme and use the initial
575 timestamp retrieved from the peer table.
577 if (tw->ts_recent_stamp) {
578 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
579 tp->write_seq = 1;
580 tp->ts_recent = tw->ts_recent;
581 tp->ts_recent_stamp = tw->ts_recent_stamp;
582 sock_hold(sk2);
583 skp = &head->chain;
584 goto unique;
585 } else
586 goto not_unique;
589 tw = NULL;
591 /* And established part... */
592 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
593 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
594 goto not_unique;
597 unique:
598 BUG_TRAP(sk->pprev==NULL);
599 if ((sk->next = *skp) != NULL)
600 (*skp)->pprev = &sk->next;
602 *skp = sk;
603 sk->pprev = skp;
604 sk->hashent = hash;
605 sock_prot_inc_use(sk->prot);
606 write_unlock_bh(&head->lock);
608 if (tw) {
609 /* Silly. Should hash-dance instead... */
610 local_bh_disable();
611 tcp_tw_deschedule(tw);
612 tcp_timewait_kill(tw);
613 NET_INC_STATS_BH(TimeWaitRecycled);
614 local_bh_enable();
616 tcp_tw_put(tw);
619 return 0;
621 not_unique:
622 write_unlock_bh(&head->lock);
623 return -EADDRNOTAVAIL;
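/* On the reuse path above, write_seq starts at tw->snd_nxt + 65535 + 2,
 * i.e. comfortably past anything the old incarnation could have sent into
 * an unscaled 64K window, so new segments cannot be mistaken for old
 * duplicates; the copied ts_recent covers the rest via PAWS.
 */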
626 /* Hash SYN-SENT socket to established hash table after
627 * checking that it is unique. Note that without the kernel lock
628 * we MUST make these two operations atomic.
630 * Optimization: if it is bound and the tcp_bind_bucket has only one
631 * owner (us), we need not scan the established bucket.
634 int tcp_v4_hash_connecting(struct sock *sk)
636 unsigned short snum = sk->num;
637 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
638 struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
640 spin_lock_bh(&head->lock);
641 if (tb->owners == sk && sk->bind_next == NULL) {
642 __tcp_v4_hash(sk);
643 spin_unlock_bh(&head->lock);
644 return 0;
645 } else {
646 spin_unlock_bh(&head->lock);
648 /* No definite answer... Walk to established hash table */
649 return tcp_v4_check_established(sk);
653 /* This will initiate an outgoing connection. */
654 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
656 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
657 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
658 struct sk_buff *buff;
659 struct rtable *rt;
660 u32 daddr, nexthop;
661 int tmp;
662 int err;
664 if (addr_len < sizeof(struct sockaddr_in))
665 return(-EINVAL);
667 if (usin->sin_family != AF_INET)
668 return(-EAFNOSUPPORT);
670 nexthop = daddr = usin->sin_addr.s_addr;
671 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
672 if (daddr == 0)
673 return -EINVAL;
674 nexthop = sk->protinfo.af_inet.opt->faddr;
677 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
678 RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
679 if (tmp < 0)
680 return tmp;
682 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
683 ip_rt_put(rt);
684 return -ENETUNREACH;
687 __sk_dst_set(sk, &rt->u.dst);
689 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
690 daddr = rt->rt_dst;
692 err = -ENOBUFS;
693 buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);
695 if (buff == NULL)
696 goto failure;
698 if (!sk->saddr)
699 sk->saddr = rt->rt_src;
700 sk->rcv_saddr = sk->saddr;
702 if (tp->ts_recent_stamp && sk->daddr != daddr) {
703 /* Reset inherited state */
704 tp->ts_recent = 0;
705 tp->ts_recent_stamp = 0;
706 tp->write_seq = 0;
709 if (sysctl_tcp_tw_recycle &&
710 !tp->ts_recent_stamp &&
711 rt->rt_dst == daddr) {
712 struct inet_peer *peer = rt_get_peer(rt);
714 /* VJ's idea. We save the last timestamp seen from
715 * the destination in the peer table when entering TIME-WAIT state,
716 * and initialize ts_recent from it when trying a new connection.
719 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
720 tp->ts_recent_stamp = peer->tcp_ts_stamp;
721 tp->ts_recent = peer->tcp_ts;
725 sk->dport = usin->sin_port;
726 sk->daddr = daddr;
728 if (!tp->write_seq)
729 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
730 sk->sport, usin->sin_port);
732 tp->ext_header_len = 0;
733 if (sk->protinfo.af_inet.opt)
734 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
736 tp->mss_clamp = 536;
738 err = tcp_connect(sk, buff);
739 if (err == 0)
740 return 0;
742 failure:
743 __sk_dst_reset(sk);
744 sk->dport = 0;
745 return err;
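/* What reaches this function from user space is an ordinary connect() on
 * a SOCK_STREAM socket; a minimal caller sketch (example address and port,
 * error handling trimmed, needs <sys/socket.h>, <netinet/in.h>,
 * <arpa/inet.h> and <string.h>):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin;
 *
 *	memset(&sin, 0, sizeof(sin));
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(80);
 *	sin.sin_addr.s_addr = inet_addr("10.0.0.1");
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * The addr_len and sin_family checks at the top of this function are the
 * kernel-side counterpart of that call.
 */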
748 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
750 return ((struct rtable*)skb->dst)->rt_iif;
753 static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
755 unsigned h = raddr ^ rport;
756 h ^= h>>16;
757 h ^= h>>8;
758 return h&(TCP_SYNQ_HSIZE-1);
761 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
762 struct iphdr *iph,
763 struct tcphdr *th,
764 struct open_request ***prevp)
766 struct tcp_listen_opt *lopt = tp->listen_opt;
767 struct open_request *req, **prev;
768 __u16 rport = th->source;
769 __u32 raddr = iph->saddr;
771 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
772 (req = *prev) != NULL;
773 prev = &req->dl_next) {
774 if (req->rmt_port == rport &&
775 req->af.v4_req.rmt_addr == raddr &&
776 req->af.v4_req.loc_addr == iph->daddr &&
777 TCP_INET_FAMILY(req->class->family)) {
778 BUG_TRAP(req->sk == NULL);
779 *prevp = prev;
780 return req;
784 return NULL;
787 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
789 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
790 struct tcp_listen_opt *lopt = tp->listen_opt;
791 unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
793 req->expires = jiffies + TCP_TIMEOUT_INIT;
794 req->retrans = 0;
795 req->sk = NULL;
796 req->index = h;
797 req->dl_next = lopt->syn_table[h];
799 write_lock(&tp->syn_wait_lock);
800 lopt->syn_table[h] = req;
801 write_unlock(&tp->syn_wait_lock);
803 tcp_synq_added(sk);
808 * This routine does path mtu discovery as defined in RFC1191.
810 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
812 struct dst_entry *dst;
813 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
815 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
816 * sent out by Linux are always < 576 bytes, so they should go through
817 * unfragmented).
819 if (sk->state == TCP_LISTEN)
820 return;
822 /* We don't check in the dst entry whether pmtu discovery is forbidden
823 * on this route. We just assume that no packet-too-big packets
824 * are sent back when pmtu discovery is not active.
825 * There is a small race when the user changes this flag in the
826 * route, but I think that's acceptable.
828 if ((dst = __sk_dst_check(sk, 0)) == NULL)
829 return;
831 ip_rt_update_pmtu(dst, mtu);
833 /* Something is about to go wrong... Remember the soft error
834 * in case this connection is not able to recover.
836 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
837 sk->err_soft = EMSGSIZE;
839 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
840 tp->pmtu_cookie > dst->pmtu) {
841 tcp_sync_mss(sk, dst->pmtu);
843 /* Resend the TCP packet because it's
844 * clear that the old packet has been
845 * dropped. This is the new "fast" path mtu
846 * discovery.
848 tcp_simple_retransmit(sk);
849 } /* else let the usual retransmit timer handle it */
853 * This routine is called by the ICMP module when it gets some
854 * sort of error condition. If err < 0 then the socket should
855 * be closed and the error returned to the user. If err > 0
856 * it's just the icmp type << 8 | icmp code. After adjustment
857 * header points to the first 8 bytes of the tcp header. We need
858 * to find the appropriate port.
860 * The locking strategy used here is very "optimistic". When
861 * someone else accesses the socket the ICMP is just dropped
862 * and for some paths there is no check at all.
863 * A more general error queue to queue errors for later handling
864 * is probably better.
868 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
870 struct iphdr *iph = (struct iphdr*)dp;
871 struct tcphdr *th;
872 struct tcp_opt *tp;
873 int type = skb->h.icmph->type;
874 int code = skb->h.icmph->code;
875 #if ICMP_MIN_LENGTH < 14
876 int no_flags = 0;
877 #else
878 #define no_flags 0
879 #endif
880 struct sock *sk;
881 __u32 seq;
882 int err;
884 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
885 ICMP_INC_STATS_BH(IcmpInErrors);
886 return;
888 #if ICMP_MIN_LENGTH < 14
889 if (len < (iph->ihl << 2) + 14)
890 no_flags = 1;
891 #endif
893 th = (struct tcphdr*)(dp+(iph->ihl<<2));
895 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
896 if (sk == NULL) {
897 ICMP_INC_STATS_BH(IcmpInErrors);
898 return;
900 if (sk->state == TCP_TIME_WAIT) {
901 tcp_tw_put((struct tcp_tw_bucket*)sk);
902 return;
905 bh_lock_sock(sk);
906 /* If too many ICMPs get dropped on busy
907 * servers this needs to be solved differently.
909 if (sk->lock.users != 0)
910 NET_INC_STATS_BH(LockDroppedIcmps);
912 if (sk->state == TCP_CLOSE)
913 goto out;
915 tp = &sk->tp_pinfo.af_tcp;
916 seq = ntohl(th->seq);
917 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
918 NET_INC_STATS(OutOfWindowIcmps);
919 goto out;
922 switch (type) {
923 case ICMP_SOURCE_QUENCH:
924 /* This is deprecated, but if someone generated it,
925 * we have no reason to ignore it.
927 if (sk->lock.users == 0)
928 tcp_enter_cwr(tp);
929 goto out;
930 case ICMP_PARAMETERPROB:
931 err = EPROTO;
932 break;
933 case ICMP_DEST_UNREACH:
934 if (code > NR_ICMP_UNREACH)
935 goto out;
937 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
938 if (sk->lock.users == 0)
939 do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
940 goto out;
943 err = icmp_err_convert[code].errno;
944 break;
945 case ICMP_TIME_EXCEEDED:
946 err = EHOSTUNREACH;
947 break;
948 default:
949 goto out;
952 switch (sk->state) {
953 struct open_request *req, **prev;
954 case TCP_LISTEN:
955 if (sk->lock.users != 0)
956 goto out;
958 /* The final ACK of the handshake should already be
959 * handled in the new socket context, not here.
960 * Strictly speaking - an ICMP error for the final
961 * ACK should set the opening flag, but that is too
962 * complicated right now.
964 if (!no_flags && !th->syn && !th->ack)
965 goto out;
967 req = tcp_v4_search_req(tp, iph, th, &prev);
968 if (!req)
969 goto out;
971 /* ICMPs are not backlogged, hence we cannot get
972 an established socket here.
974 BUG_TRAP(req->sk == NULL);
976 if (seq != req->snt_isn) {
977 NET_INC_STATS_BH(OutOfWindowIcmps);
978 goto out;
982 * Still in SYN_RECV, just remove it silently.
983 * There is no good way to pass the error to the newly
984 * created socket, and POSIX does not want network
985 * errors returned from accept().
987 tcp_synq_drop(sk, req, prev);
988 goto out;
990 case TCP_SYN_SENT:
991 case TCP_SYN_RECV: /* Cannot happen.
992 It can happen, e.g., if SYNs crossed.
994 if (!no_flags && !th->syn)
995 goto out;
996 if (sk->lock.users == 0) {
997 TCP_INC_STATS_BH(TcpAttemptFails);
998 sk->err = err;
1000 sk->error_report(sk);
1002 tcp_done(sk);
1003 } else {
1004 sk->err_soft = err;
1006 goto out;
1009 /* If we've already connected we will keep trying
1010 * until we time out, or the user gives up.
1012 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
1013 * to be considered hard errors (well, FRAG_FAILED too,
1014 * but it is obsoleted by pmtu discovery).
1016 * Note that in the modern internet, where routing is unreliable
1017 * and broken firewalls sit in every dark corner sending random
1018 * errors as ordered by their masters, even these two messages have finally
1019 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
1021 * Now we are in compliance with RFCs.
1022 * --ANK (980905)
1025 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1026 sk->err = err;
1027 sk->error_report(sk);
1028 } else { /* Only an error on timeout */
1029 sk->err_soft = err;
1032 out:
1033 bh_unlock_sock(sk);
1034 sock_put(sk);
1037 /* This routine computes an IPv4 TCP checksum. */
1038 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1039 struct sk_buff *skb)
1041 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1042 csum_partial((char *)th, th->doff<<2, skb->csum));
1046 * This routine will send an RST to the other tcp.
1048 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1049 * for a reset?
1050 * Answer: if a packet caused the RST, it is not for a socket
1051 * existing in our system; if it is matched to a socket,
1052 * it is just a duplicate segment or a bug in the other side's TCP.
1053 * So we build the reply based only on the parameters
1054 * that arrived with the segment.
1055 * Exception: precedence violation. We do not implement it in any case.
1058 static void tcp_v4_send_reset(struct sk_buff *skb)
1060 struct tcphdr *th = skb->h.th;
1061 struct tcphdr rth;
1062 struct ip_reply_arg arg;
1064 /* Never send a reset in response to a reset. */
1065 if (th->rst)
1066 return;
1068 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1069 return;
1071 /* Swap the send and the receive. */
1072 memset(&rth, 0, sizeof(struct tcphdr));
1073 rth.dest = th->source;
1074 rth.source = th->dest;
1075 rth.doff = sizeof(struct tcphdr)/4;
1076 rth.rst = 1;
1078 if (th->ack) {
1079 rth.seq = th->ack_seq;
1080 } else {
1081 rth.ack = 1;
1082 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1083 + skb->len - (th->doff<<2));
1086 memset(&arg, 0, sizeof arg);
1087 arg.iov[0].iov_base = (unsigned char *)&rth;
1088 arg.iov[0].iov_len = sizeof rth;
1089 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1090 skb->nh.iph->saddr, /*XXX*/
1091 sizeof(struct tcphdr),
1092 IPPROTO_TCP,
1093 0);
1094 arg.n_iov = 1;
1095 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1097 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1099 TCP_INC_STATS_BH(TcpOutSegs);
1100 TCP_INC_STATS_BH(TcpOutRsts);
1103 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1104 outside socket context, is certainly ugly. What can I do?
1107 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1109 struct tcphdr *th = skb->h.th;
1110 struct {
1111 struct tcphdr th;
1112 u32 tsopt[3];
1113 } rep;
1114 struct ip_reply_arg arg;
1116 memset(&rep.th, 0, sizeof(struct tcphdr));
1117 memset(&arg, 0, sizeof arg);
1119 arg.iov[0].iov_base = (unsigned char *)&rep;
1120 arg.iov[0].iov_len = sizeof(rep.th);
1121 arg.n_iov = 1;
1122 if (ts) {
1123 rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
1124 (TCPOPT_NOP << 16) |
1125 (TCPOPT_TIMESTAMP << 8) |
1126 TCPOLEN_TIMESTAMP);
1127 rep.tsopt[1] = htonl(tcp_time_stamp);
1128 rep.tsopt[2] = htonl(ts);
1129 arg.iov[0].iov_len = sizeof(rep);
1132 /* Swap the send and the receive. */
1133 rep.th.dest = th->source;
1134 rep.th.source = th->dest;
1135 rep.th.doff = arg.iov[0].iov_len/4;
1136 rep.th.seq = htonl(seq);
1137 rep.th.ack_seq = htonl(ack);
1138 rep.th.ack = 1;
1139 rep.th.window = htons(win);
1141 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1142 skb->nh.iph->saddr, /*XXX*/
1143 arg.iov[0].iov_len,
1144 IPPROTO_TCP,
1146 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1148 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1150 TCP_INC_STATS_BH(TcpOutSegs);
1153 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1155 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1157 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1158 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1160 tcp_tw_put(tw);
1163 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1165 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1166 req->ts_recent);
1169 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1171 struct rtable *rt;
1172 struct ip_options *opt;
1174 opt = req->af.v4_req.opt;
1175 if(ip_route_output(&rt, ((opt && opt->srr) ?
1176 opt->faddr :
1177 req->af.v4_req.rmt_addr),
1178 req->af.v4_req.loc_addr,
1179 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1180 sk->bound_dev_if)) {
1181 IP_INC_STATS_BH(IpOutNoRoutes);
1182 return NULL;
1184 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1185 ip_rt_put(rt);
1186 IP_INC_STATS_BH(IpOutNoRoutes);
1187 return NULL;
1189 return &rt->u.dst;
1193 * Send a SYN-ACK after having received a SYN.
1194 * This still operates on an open_request only, not on a big
1195 * socket.
1197 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1198 struct dst_entry *dst)
1200 int err = -1;
1201 struct sk_buff * skb;
1203 /* First, grab a route. */
1204 if (dst == NULL &&
1205 (dst = tcp_v4_route_req(sk, req)) == NULL)
1206 goto out;
1208 skb = tcp_make_synack(sk, dst, req);
1210 if (skb) {
1211 struct tcphdr *th = skb->h.th;
1213 th->check = tcp_v4_check(th, skb->len,
1214 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1215 csum_partial((char *)th, skb->len, skb->csum));
1217 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1218 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1219 if (err == NET_XMIT_CN)
1220 err = 0;
1223 out:
1224 dst_release(dst);
1225 return err;
1229 * IPv4 open_request destructor.
1231 static void tcp_v4_or_free(struct open_request *req)
1233 if (req->af.v4_req.opt)
1234 kfree(req->af.v4_req.opt);
1237 static inline void syn_flood_warning(struct sk_buff *skb)
1239 static unsigned long warntime;
1241 if (jiffies - warntime > HZ*60) {
1242 warntime = jiffies;
1243 printk(KERN_INFO
1244 "possible SYN flooding on port %d. Sending cookies.\n",
1245 ntohs(skb->h.th->dest));
1250 * Save and compile IPv4 options into the open_request if needed.
1252 static inline struct ip_options *
1253 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1255 struct ip_options *opt = &(IPCB(skb)->opt);
1256 struct ip_options *dopt = NULL;
1258 if (opt && opt->optlen) {
1259 int opt_size = optlength(opt);
1260 dopt = kmalloc(opt_size, GFP_ATOMIC);
1261 if (dopt) {
1262 if (ip_options_echo(dopt, skb)) {
1263 kfree(dopt);
1264 dopt = NULL;
1268 return dopt;
1272 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1273 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1274 * It would be better to replace it with a global counter for all sockets
1275 * but then some measure against one socket starving all other sockets
1276 * would be needed.
1278 * It was 128 by default. Experiments with real servers show that
1279 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1280 * of the problems. This value is adjusted to 128 for very small machines
1281 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1282 * Increasing it further requires changing the hash table size.
1284 int sysctl_max_syn_backlog = 256;
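/* Assuming the usual procfs name for this sysctl, the limit can be raised
 * on busy servers with, e.g.:
 *
 *	echo 1024 > /proc/sys/net/ipv4/tcp_max_syn_backlog
 *
 * (going far beyond 1024 also calls for a larger SYN queue hash, as the
 * comment above notes).
 */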
1286 struct or_calltable or_ipv4 = {
1287 PF_INET,
1288 tcp_v4_send_synack,
1289 tcp_v4_or_send_ack,
1290 tcp_v4_or_free,
1291 tcp_v4_send_reset
1294 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1296 struct tcp_opt tp;
1297 struct open_request *req;
1298 __u32 saddr = skb->nh.iph->saddr;
1299 __u32 daddr = skb->nh.iph->daddr;
1300 __u32 isn = TCP_SKB_CB(skb)->when;
1301 struct dst_entry *dst = NULL;
1302 #ifdef CONFIG_SYN_COOKIES
1303 int want_cookie = 0;
1304 #else
1305 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1306 #endif
1308 /* Never answer SYNs sent to broadcast or multicast */
1309 if (((struct rtable *)skb->dst)->rt_flags &
1310 (RTCF_BROADCAST|RTCF_MULTICAST))
1311 goto drop;
1313 /* TW buckets are converted to open requests without
1314 * limitation; they conserve resources and the peer is
1315 * evidently a real one.
1317 if (tcp_synq_is_full(sk) && !isn) {
1318 #ifdef CONFIG_SYN_COOKIES
1319 if (sysctl_tcp_syncookies) {
1320 want_cookie = 1;
1321 } else
1322 #endif
1323 goto drop;
1326 /* The accept backlog is full. If we have already queued enough
1327 * warm entries in the syn queue, drop the request. It is better than
1328 * clogging the syn queue with openreqs with exponentially increasing
1329 * timeout.
1331 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1332 goto drop;
1334 req = tcp_openreq_alloc();
1335 if (req == NULL)
1336 goto drop;
1338 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1339 tp.mss_clamp = 536;
1340 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1342 tcp_parse_options(skb, &tp);
1344 if (want_cookie) {
1345 tp.sack_ok = 0;
1346 tp.wscale_ok = 0;
1347 tp.snd_wscale = 0;
1348 tp.tstamp_ok = 0;
1349 tp.saw_tstamp = 0;
1352 if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1353 /* Some OSes (unknown ones, but I see them on a web server which
1354 * contains information interesting only to Windows
1355 * users) do not send their stamp in the SYN. It is the easy case.
1356 * We simply do not advertise TS support.
1358 tp.saw_tstamp = 0;
1359 tp.tstamp_ok = 0;
1362 tcp_openreq_init(req, &tp, skb);
1364 req->af.v4_req.loc_addr = daddr;
1365 req->af.v4_req.rmt_addr = saddr;
1366 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1367 req->class = &or_ipv4;
1368 if (!want_cookie)
1369 TCP_ECN_create_request(req, skb->h.th);
1371 if (want_cookie) {
1372 #ifdef CONFIG_SYN_COOKIES
1373 syn_flood_warning(skb);
1374 #endif
1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1376 } else if (isn == 0) {
1377 struct inet_peer *peer = NULL;
1379 /* VJ's idea. We save the last timestamp seen
1380 * from the destination in the peer table when entering
1381 * TIME-WAIT state, and check against it before
1382 * accepting a new connection request.
1384 * If "isn" is not zero, this request hit an alive
1385 * timewait bucket, so all the necessary checks
1386 * are made in the function processing the timewait state.
1388 if (tp.saw_tstamp &&
1389 sysctl_tcp_tw_recycle &&
1390 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1391 (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1392 peer->v4daddr == saddr) {
1393 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1394 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1395 NET_INC_STATS_BH(PAWSPassiveRejected);
1396 dst_release(dst);
1397 goto drop_and_free;
1400 /* Kill the following clause, if you dislike this way. */
1401 else if (!sysctl_tcp_syncookies &&
1402 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1403 < (sysctl_max_syn_backlog>>2)) &&
1404 (!peer || !peer->tcp_ts_stamp) &&
1405 (!dst || !dst->rtt)) {
1406 /* Without syncookies the last quarter of the
1407 * backlog is filled with destinations proven to be alive.
1408 * It means that we continue to communicate
1409 * with destinations already remembered
1410 * at the moment of the synflood.
1412 NETDEBUG(if (net_ratelimit()) \
1413 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1414 NIPQUAD(saddr), ntohs(skb->h.th->source)));
1415 TCP_INC_STATS_BH(TcpAttemptFails);
1416 dst_release(dst);
1417 goto drop_and_free;
1420 isn = tcp_v4_init_sequence(sk, skb);
1422 req->snt_isn = isn;
1424 if (tcp_v4_send_synack(sk, req, dst))
1425 goto drop_and_free;
1427 if (want_cookie) {
1428 tcp_openreq_free(req);
1429 } else {
1430 tcp_v4_synq_add(sk, req);
1432 return 0;
1434 drop_and_free:
1435 tcp_openreq_free(req);
1436 drop:
1437 TCP_INC_STATS_BH(TcpAttemptFails);
1438 return 0;
1443 * The three way handshake has completed - we got a valid synack -
1444 * now create the new socket.
1446 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1447 struct open_request *req,
1448 struct dst_entry *dst)
1450 struct tcp_opt *newtp;
1451 struct sock *newsk;
1453 if (tcp_acceptq_is_full(sk))
1454 goto exit_overflow;
1456 if (dst == NULL &&
1457 (dst = tcp_v4_route_req(sk, req)) == NULL)
1458 goto exit;
1460 newsk = tcp_create_openreq_child(sk, req, skb);
1461 if (!newsk)
1462 goto exit;
1464 newsk->dst_cache = dst;
1466 newtp = &(newsk->tp_pinfo.af_tcp);
1467 newsk->daddr = req->af.v4_req.rmt_addr;
1468 newsk->saddr = req->af.v4_req.loc_addr;
1469 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1470 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1471 req->af.v4_req.opt = NULL;
1472 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1473 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1474 newtp->ext_header_len = 0;
1475 if (newsk->protinfo.af_inet.opt)
1476 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1478 tcp_sync_mss(newsk, dst->pmtu);
1479 newtp->advmss = dst->advmss;
1480 tcp_initialize_rcv_mss(newsk);
1482 __tcp_v4_hash(newsk);
1483 __tcp_inherit_port(sk, newsk);
1485 return newsk;
1487 exit_overflow:
1488 NET_INC_STATS_BH(ListenOverflows);
1489 exit:
1490 NET_INC_STATS_BH(ListenDrops);
1491 dst_release(dst);
1492 return NULL;
1495 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1497 struct open_request *req, **prev;
1498 struct tcphdr *th = skb->h.th;
1499 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1500 struct sock *nsk;
1502 /* Find possible connection requests. */
1503 req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);
1504 if (req)
1505 return tcp_check_req(sk, skb, req, prev);
1507 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1508 th->source,
1509 skb->nh.iph->daddr,
1510 ntohs(th->dest),
1511 tcp_v4_iif(skb));
1513 if (nsk) {
1514 if (nsk->state != TCP_TIME_WAIT) {
1515 bh_lock_sock(nsk);
1516 return nsk;
1518 tcp_tw_put((struct tcp_tw_bucket*)nsk);
1519 return NULL;
1522 #ifdef CONFIG_SYN_COOKIES
1523 if (!th->rst && !th->syn && th->ack)
1524 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1525 #endif
1526 return sk;
1529 static int tcp_v4_checksum_init(struct sk_buff *skb)
1531 if (skb->ip_summed == CHECKSUM_HW) {
1532 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1533 skb->nh.iph->daddr,skb->csum)) {
1534 NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1535 return -1;
1537 skb->ip_summed = CHECKSUM_UNNECESSARY;
1538 } else {
1539 if (skb->len <= 76) {
1540 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1541 skb->nh.iph->daddr,
1542 csum_partial((char *)skb->h.th, skb->len, 0)))
1543 return -1;
1544 skb->ip_summed = CHECKSUM_UNNECESSARY;
1545 } else {
1546 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1547 skb->nh.iph->daddr,0);
1550 return 0;
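/* For reference, a portable user-space sketch of the checksum being
 * verified here: the RFC 793 ones'-complement sum over the IPv4
 * pseudo-header plus the TCP segment. This is illustrative only and is
 * not the kernel's csum_partial()/tcp_v4_check() implementation; the
 * addresses are passed as plain 32-bit numbers (e.g. 0x0a000001 for
 * 10.0.0.1).
 *
 *	static unsigned short tcp_csum_sketch(unsigned int saddr,
 *					      unsigned int daddr,
 *					      const unsigned char *seg,
 *					      unsigned int len)
 *	{
 *		unsigned long sum = 0;
 *		unsigned int i;
 *
 *		sum += (saddr >> 16) + (saddr & 0xffff);
 *		sum += (daddr >> 16) + (daddr & 0xffff);
 *		sum += 6 + len;			// IPPROTO_TCP + TCP length
 *		for (i = 0; i + 1 < len; i += 2)
 *			sum += (seg[i] << 8) | seg[i + 1];
 *		if (len & 1)
 *			sum += seg[len - 1] << 8;
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (unsigned short)~sum;
 *	}
 *
 * A correct segment sums to zero when its own check field is included,
 * which is exactly the test tcp_v4_check() makes above.
 */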
1554 /* The socket must have its spinlock held when we get
1555 * here.
1557 * We have a potential double-lock case here, so even when
1558 * doing backlog processing we use the BH locking scheme.
1559 * This is because we cannot sleep with the original spinlock
1560 * held.
1562 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1564 #ifdef CONFIG_FILTER
1565 struct sk_filter *filter = sk->filter;
1566 if (filter && sk_filter(skb, filter))
1567 goto discard;
1568 #endif /* CONFIG_FILTER */
1570 IP_INC_STATS_BH(IpInDelivers);
1572 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1573 TCP_CHECK_TIMER(sk);
1574 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1575 goto reset;
1576 TCP_CHECK_TIMER(sk);
1577 return 0;
1580 if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1581 goto csum_err;
1583 if (sk->state == TCP_LISTEN) {
1584 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1585 if (!nsk)
1586 goto discard;
1588 if (nsk != sk) {
1589 if (tcp_child_process(sk, nsk, skb))
1590 goto reset;
1591 return 0;
1595 TCP_CHECK_TIMER(sk);
1596 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1597 goto reset;
1598 TCP_CHECK_TIMER(sk);
1599 return 0;
1601 reset:
1602 tcp_v4_send_reset(skb);
1603 discard:
1604 kfree_skb(skb);
1605 /* Be careful here. If this function gets more complicated and
1606 * gcc suffers from register pressure on the x86, sk (in %ebx)
1607 * might be destroyed here. This current version compiles correctly,
1608 * but you have been warned.
1610 return 0;
1612 csum_err:
1613 TCP_INC_STATS_BH(TcpInErrs);
1614 goto discard;
1618 * From tcp_input.c
1621 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1623 struct tcphdr *th;
1624 struct sock *sk;
1625 int ret;
1627 if (skb->pkt_type!=PACKET_HOST)
1628 goto discard_it;
1630 th = skb->h.th;
1632 /* Pull up the IP header. */
1633 __skb_pull(skb, skb->h.raw - skb->data);
1635 /* Count it even if it's bad */
1636 TCP_INC_STATS_BH(TcpInSegs);
1638 /* An explanation is required here, I think.
1639 * Packet length and doff are validated by header prediction,
1640 * provided the case of th->doff==0 is eliminated.
1641 * So, we defer the checks. */
1642 if (th->doff < sizeof(struct tcphdr)/4 ||
1643 (skb->ip_summed != CHECKSUM_UNNECESSARY &&
1644 tcp_v4_checksum_init(skb) < 0))
1645 goto bad_packet;
1647 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1648 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1649 len - th->doff*4);
1650 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1651 TCP_SKB_CB(skb)->when = 0;
1652 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1653 TCP_SKB_CB(skb)->sacked = 0;
1654 skb->used = 0;
1656 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1657 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1659 if (!sk)
1660 goto no_tcp_socket;
1662 process:
1663 if(!ipsec_sk_policy(sk,skb))
1664 goto discard_and_relse;
1666 if (sk->state == TCP_TIME_WAIT)
1667 goto do_time_wait;
1669 bh_lock_sock(sk);
1670 ret = 0;
1671 if (!sk->lock.users) {
1672 if (!tcp_prequeue(sk, skb))
1673 ret = tcp_v4_do_rcv(sk, skb);
1674 } else
1675 sk_add_backlog(sk, skb);
1676 bh_unlock_sock(sk);
1678 sock_put(sk);
1680 return ret;
1682 no_tcp_socket:
1683 if (len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1684 bad_packet:
1685 TCP_INC_STATS_BH(TcpInErrs);
1686 } else {
1687 tcp_v4_send_reset(skb);
1690 discard_it:
1691 /* Discard frame. */
1692 kfree_skb(skb);
1693 return 0;
1695 discard_and_relse:
1696 sock_put(sk);
1697 goto discard_it;
1699 do_time_wait:
1700 if (len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1701 TCP_INC_STATS_BH(TcpInErrs);
1702 goto discard_and_relse;
1704 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1705 skb, th, skb->len)) {
1706 case TCP_TW_SYN:
1708 struct sock *sk2;
1710 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1711 if (sk2 != NULL) {
1712 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1713 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1714 tcp_tw_put((struct tcp_tw_bucket *)sk);
1715 sk = sk2;
1716 goto process;
1718 /* Fall through to ACK */
1720 case TCP_TW_ACK:
1721 tcp_v4_timewait_ack(sk, skb);
1722 break;
1723 case TCP_TW_RST:
1724 goto no_tcp_socket;
1725 case TCP_TW_SUCCESS:
1727 goto discard_it;
1730 /* With per-bucket locks this operation is not atomic, so
1731 * this version is not worse.
1733 static void __tcp_v4_rehash(struct sock *sk)
1735 sk->prot->unhash(sk);
1736 sk->prot->hash(sk);
1739 int tcp_v4_rebuild_header(struct sock *sk)
1741 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1742 __u32 new_saddr;
1743 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT &&
1744 !(sk->userlocks & SOCK_BINDADDR_LOCK);
1746 if (rt == NULL) {
1747 int err;
1749 u32 daddr = sk->daddr;
1751 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1752 daddr = sk->protinfo.af_inet.opt->faddr;
1754 err = ip_route_output(&rt, daddr, sk->saddr,
1755 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1756 sk->bound_dev_if);
1757 if (err) {
1758 sk->err_soft=-err;
1759 sk->error_report(sk);
1760 return -1;
1762 __sk_dst_set(sk, &rt->u.dst);
1765 /* Force route checking if want_rewrite. */
1766 if (want_rewrite) {
1767 int tmp;
1768 struct rtable *new_rt;
1769 __u32 old_saddr = rt->rt_src;
1771 /* Query new route using another rt buffer */
1772 tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
1773 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1774 sk->bound_dev_if);
1776 /* Only useful if different source addrs */
1777 if (tmp == 0) {
1781 if (new_rt->rt_src != old_saddr ) {
1782 __sk_dst_set(sk, &new_rt->u.dst);
1783 rt = new_rt;
1784 goto do_rewrite;
1786 dst_release(&new_rt->u.dst);
1790 return 0;
1792 do_rewrite:
1793 new_saddr = rt->rt_src;
1795 /* Ouch!, this should not happen. */
1796 if (!sk->saddr || !sk->rcv_saddr) {
1797 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
1798 "saddr=%08X rcv_saddr=%08X\n",
1799 ntohl(sk->saddr),
1800 ntohl(sk->rcv_saddr));
1801 return -1;
1804 if (new_saddr != sk->saddr) {
1805 if (sysctl_ip_dynaddr > 1) {
1806 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1807 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1808 NIPQUAD(sk->saddr),
1809 NIPQUAD(new_saddr));
1812 sk->saddr = new_saddr;
1813 sk->rcv_saddr = new_saddr;
1815 /* XXX The one ugly spot where we need to
1816 * XXX really change the socket's identity after
1817 * XXX it has entered the hashes. -DaveM
1819 * Besides that, it does not check for connection
1820 * uniqueness. Wait for trouble.
1822 __tcp_v4_rehash(sk);
1825 return 0;
1828 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1830 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1832 sin->sin_family = AF_INET;
1833 sin->sin_addr.s_addr = sk->daddr;
1834 sin->sin_port = sk->dport;
1837 /* VJ's idea. Save the last timestamp seen from this destination
1838 * and hold it at least for the normal timewait interval, to use for duplicate
1839 * segment detection in subsequent connections before they enter the synchronized
1840 * state.
1843 int tcp_v4_remember_stamp(struct sock *sk)
1845 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1846 struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1847 struct inet_peer *peer = NULL;
1848 int release_it = 0;
1850 if (rt == NULL || rt->rt_dst != sk->daddr) {
1851 peer = inet_getpeer(sk->daddr, 1);
1852 release_it = 1;
1853 } else {
1854 if (rt->peer == NULL)
1855 rt_bind_peer(rt, 1);
1856 peer = rt->peer;
1859 if (peer) {
1860 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1861 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1862 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1863 peer->tcp_ts_stamp = tp->ts_recent_stamp;
1864 peer->tcp_ts = tp->ts_recent;
1866 if (release_it)
1867 inet_putpeer(peer);
1868 return 1;
1871 return 0;
1874 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1876 struct inet_peer *peer = NULL;
1878 peer = inet_getpeer(tw->daddr, 1);
1880 if (peer) {
1881 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1882 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1883 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1884 peer->tcp_ts_stamp = tw->ts_recent_stamp;
1885 peer->tcp_ts = tw->ts_recent;
1887 inet_putpeer(peer);
1888 return 1;
1891 return 0;
1894 struct tcp_func ipv4_specific = {
1895 ip_queue_xmit,
1896 tcp_v4_send_check,
1897 tcp_v4_rebuild_header,
1898 tcp_v4_conn_request,
1899 tcp_v4_syn_recv_sock,
1900 tcp_v4_hash_connecting,
1901 tcp_v4_remember_stamp,
1902 sizeof(struct iphdr),
1904 ip_setsockopt,
1905 ip_getsockopt,
1906 v4_addr2sockaddr,
1907 sizeof(struct sockaddr_in)
1910 /* NOTE: A lot of things are set to zero explicitly by the call to
1911 * sk_alloc(), so they need not be done here.
1913 static int tcp_v4_init_sock(struct sock *sk)
1915 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1917 skb_queue_head_init(&tp->out_of_order_queue);
1918 tcp_init_xmit_timers(sk);
1919 tcp_prequeue_init(tp);
1921 tp->rto = TCP_TIMEOUT_INIT;
1922 tp->mdev = TCP_TIMEOUT_INIT;
1924 /* So many TCP implementations out there (incorrectly) count the
1925 * initial SYN frame in their delayed-ACK and congestion control
1926 * algorithms that we must have the following bandaid to talk
1927 * efficiently to them. -DaveM
1929 tp->snd_cwnd = 2;
1931 /* See draft-stevens-tcpca-spec-01 for discussion of the
1932 * initialization of these values.
1934 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1935 tp->snd_cwnd_clamp = ~0;
1936 tp->mss_cache = 536;
1938 tp->reordering = sysctl_tcp_reordering;
1940 sk->state = TCP_CLOSE;
1942 sk->write_space = tcp_write_space;
1944 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1946 sk->sndbuf = sysctl_tcp_wmem[1];
1947 sk->rcvbuf = sysctl_tcp_rmem[1];
1949 atomic_inc(&tcp_sockets_allocated);
1951 return 0;
1954 static int tcp_v4_destroy_sock(struct sock *sk)
1956 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1958 tcp_clear_xmit_timers(sk);
1960 /* Clean up the write buffer. */
1961 tcp_writequeue_purge(sk);
1963 /* Clean up our, hopefully empty, out_of_order_queue. */
1964 __skb_queue_purge(&tp->out_of_order_queue);
1966 /* Clean the prequeue; it really must be empty. */
1967 __skb_queue_purge(&tp->ucopy.prequeue);
1969 /* Clean up a referenced TCP bind bucket. */
1970 if(sk->prev != NULL)
1971 tcp_put_port(sk);
1973 atomic_dec(&tcp_sockets_allocated);
1975 return 0;
1978 /* Proc filesystem TCP sock list dumping. */
1979 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
1981 int ttd = req->expires - jiffies;
1983 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1984 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
1986 req->af.v4_req.loc_addr,
1987 ntohs(sk->sport),
1988 req->af.v4_req.rmt_addr,
1989 ntohs(req->rmt_port),
1990 TCP_SYN_RECV,
1991 0,0, /* could print option size, but that is af dependent. */
1992 1, /* timers active (only the expire timer) */
1993 ttd,
1994 req->retrans,
1995 uid,
1996 0, /* non standard timer */
1997 0, /* open_requests have no inode */
1998 atomic_read(&sk->refcnt),
2003 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2005 unsigned int dest, src;
2006 __u16 destp, srcp;
2007 int timer_active;
2008 unsigned long timer_expires;
2009 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2011 dest = sp->daddr;
2012 src = sp->rcv_saddr;
2013 destp = ntohs(sp->dport);
2014 srcp = ntohs(sp->sport);
2015 if (tp->pending == TCP_TIME_RETRANS) {
2016 timer_active = 1;
2017 timer_expires = tp->timeout;
2018 } else if (tp->pending == TCP_TIME_PROBE0) {
2019 timer_active = 4;
2020 timer_expires = tp->timeout;
2021 } else if (timer_pending(&sp->timer)) {
2022 timer_active = 2;
2023 timer_expires = sp->timer.expires;
2024 } else {
2025 timer_active = 0;
2026 timer_expires = jiffies;
2029 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2030 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2031 i, src, srcp, dest, destp, sp->state,
2032 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2033 timer_active, timer_expires-jiffies,
2034 tp->retransmits,
2035 sock_i_uid(sp),
2036 tp->probes_out,
2037 sock_i_ino(sp),
2038 atomic_read(&sp->refcnt), sp,
2039 tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong, sp->sndbuf
2043 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2045 unsigned int dest, src;
2046 __u16 destp, srcp;
2047 int ttd = tw->ttd - jiffies;
2049 if (ttd < 0)
2050 ttd = 0;
2052 dest = tw->daddr;
2053 src = tw->rcv_saddr;
2054 destp = ntohs(tw->dport);
2055 srcp = ntohs(tw->sport);
2057 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2058 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2059 i, src, srcp, dest, destp, tw->substate, 0, 0,
2060 3, ttd, 0, 0, 0, 0,
2061 atomic_read(&tw->refcnt), tw);
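/* The three helpers above each emit one /proc/net/tcp line; a line for an
 * established socket looks roughly like this (illustrative values only,
 * trailing tcp_opt fields elided):
 *
 *	0: 0100007F:0016 0100007F:8F3C 01 00000000:00000000 00:00000000 00000000     0        0 12345 2 c1a2b3c4 ...
 *
 * i.e. hex local and remote address:port, hex TCP state, tx/rx queue
 * sizes, timer type and expiry, retransmits, uid, probes, inode, refcount
 * and socket pointer, followed by the rto/ato/quick/pingpong/sndbuf
 * values printed by get_tcp_sock().
 */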
2064 #define TMPSZ 150
2066 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2068 int len = 0, num = 0, i;
2069 off_t begin, pos = 0;
2070 char tmpbuf[TMPSZ+1];
2072 if (offset < TMPSZ)
2073 len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2074 " sl local_address rem_address st tx_queue "
2075 "rx_queue tr tm->when retrnsmt uid timeout inode");
2077 pos = TMPSZ;
2079 /* First, walk listening socket table. */
2080 tcp_listen_lock();
2081 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2082 struct sock *sk = tcp_listening_hash[i];
2083 struct tcp_listen_opt *lopt;
2084 int k;
2086 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2087 struct open_request *req;
2088 int uid;
2089 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2091 if (!TCP_INET_FAMILY(sk->family))
2092 goto skip_listen;
2094 pos += TMPSZ;
2095 if (pos >= offset) {
2096 get_tcp_sock(sk, tmpbuf, num);
2097 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2098 if (len >= length) {
2099 tcp_listen_unlock();
2100 goto out_no_bh;
2104 skip_listen:
2105 uid = sock_i_uid(sk);
2106 read_lock_bh(&tp->syn_wait_lock);
2107 lopt = tp->listen_opt;
2108 if (lopt && lopt->qlen != 0) {
2109 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2110 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2111 if (!TCP_INET_FAMILY(req->class->family))
2112 continue;
2114 pos += TMPSZ;
2115 if (pos < offset)
2116 continue;
2117 get_openreq(sk, req, tmpbuf, num, uid);
2118 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2119 if(len >= length) {
2120 read_unlock_bh(&tp->syn_wait_lock);
2121 tcp_listen_unlock();
2122 goto out_no_bh;
2127 read_unlock_bh(&tp->syn_wait_lock);
2129 /* Completed requests are in normal socket hash table */
2132 tcp_listen_unlock();
2134 local_bh_disable();
2136 /* Next, walk established hash chain. */
2137 for (i = 0; i < tcp_ehash_size; i++) {
2138 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2139 struct sock *sk;
2140 struct tcp_tw_bucket *tw;
2142 read_lock(&head->lock);
2143 for(sk = head->chain; sk; sk = sk->next, num++) {
2144 if (!TCP_INET_FAMILY(sk->family))
2145 continue;
2146 pos += TMPSZ;
2147 if (pos < offset)
2148 continue;
2149 get_tcp_sock(sk, tmpbuf, num);
2150 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2151 if(len >= length) {
2152 read_unlock(&head->lock);
2153 goto out;
2156 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2157 tw != NULL;
2158 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2159 if (!TCP_INET_FAMILY(tw->family))
2160 continue;
2161 pos += TMPSZ;
2162 if (pos < offset)
2163 continue;
2164 get_timewait_sock(tw, tmpbuf, num);
2165 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2166 if(len >= length) {
2167 read_unlock(&head->lock);
2168 goto out;
2171 read_unlock(&head->lock);
2174 out:
2175 local_bh_enable();
2176 out_no_bh:
2178 begin = len - (pos - offset);
2179 *start = buffer + begin;
2180 len -= begin;
2181 if(len > length)
2182 len = length;
2183 if (len < 0)
2184 len = 0;
2185 return len;
2188 struct proto tcp_prot = {
2189 name: "TCP",
2190 close: tcp_close,
2191 connect: tcp_v4_connect,
2192 disconnect: tcp_disconnect,
2193 accept: tcp_accept,
2194 ioctl: tcp_ioctl,
2195 init: tcp_v4_init_sock,
2196 destroy: tcp_v4_destroy_sock,
2197 shutdown: tcp_shutdown,
2198 setsockopt: tcp_setsockopt,
2199 getsockopt: tcp_getsockopt,
2200 sendmsg: tcp_sendmsg,
2201 recvmsg: tcp_recvmsg,
2202 backlog_rcv: tcp_v4_do_rcv,
2203 hash: tcp_v4_hash,
2204 unhash: tcp_unhash,
2205 get_port: tcp_v4_get_port,
2210 void __init tcp_v4_init(struct net_proto_family *ops)
2212 int err;
2214 tcp_inode.i_mode = S_IFSOCK;
2215 tcp_inode.i_sock = 1;
2216 tcp_inode.i_uid = 0;
2217 tcp_inode.i_gid = 0;
2218 init_waitqueue_head(&tcp_inode.i_wait);
2219 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2221 tcp_socket->inode = &tcp_inode;
2222 tcp_socket->state = SS_UNCONNECTED;
2223 tcp_socket->type=SOCK_RAW;
2225 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2226 panic("Failed to create the TCP control socket.\n");
2227 tcp_socket->sk->allocation=GFP_ATOMIC;
2228 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2230 /* Unhash it so that IP input processing does not even
2231 * see it; we do not wish this socket to see incoming
2232 * packets.
2234 tcp_socket->sk->prot->unhash(tcp_socket->sk);