1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.189 1999/09/07 02:31:33 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/init.h>
55 #include <linux/ipsec.h>
57 #include <net/icmp.h>
58 #include <net/tcp.h>
59 #include <net/ipv6.h>
60 #include <net/inet_common.h>
62 #include <asm/segment.h>
64 #include <linux/inet.h>
65 #include <linux/stddef.h>
67 extern int sysctl_tcp_timestamps;
68 extern int sysctl_tcp_window_scaling;
69 extern int sysctl_tcp_sack;
70 extern int sysctl_tcp_syncookies;
71 extern int sysctl_tcp_tw_recycle;
72 extern int sysctl_ip_dynaddr;
73 extern __u32 sysctl_wmem_max;
74 extern __u32 sysctl_rmem_max;
76 /* Check TCP sequence numbers in ICMP packets. */
77 #define ICMP_MIN_LENGTH 8
79 /* Socket used for sending RSTs */
80 struct inode tcp_inode;
81 struct socket *tcp_socket=&tcp_inode.u.socket_i;
83 static void tcp_v4_send_reset(struct sk_buff *skb);
85 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
86 struct sk_buff *skb);
88 /* This is for sockets with full identity only. Sockets here will always
89 * be without wildcards and will have the following invariant:
90 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
92 * First half of the table is for sockets not in TIME_WAIT, second half
93 * is for TIME_WAIT sockets only.
95 struct tcp_ehash_bucket *tcp_ehash = NULL;
97 /* Ok, let's try this, I give up, we do need a local binding
98 * TCP hash as well as the others for fast bind/connect.
100 struct tcp_bind_hashbucket *tcp_bhash = NULL;
102 int tcp_bhash_size = 0;
103 int tcp_ehash_size = 0;
105 /* All sockets in TCP_LISTEN state will be in here. This is the only table
106 * where wildcard'd TCP sockets can exist. Hash function here is just local
107 * port number.
109 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE] = { NULL, };
110 char __tcp_clean_cacheline_pad[(SMP_CACHE_BYTES -
111 (((sizeof(void *) * (TCP_LHTABLE_SIZE + 2)) +
112 (sizeof(int) * 2)) % SMP_CACHE_BYTES))] = { 0, };
114 rwlock_t tcp_lhash_lock = RW_LOCK_UNLOCKED;
115 atomic_t tcp_lhash_users = ATOMIC_INIT(0);
116 DECLARE_WAIT_QUEUE_HEAD(tcp_lhash_wait);
118 spinlock_t tcp_portalloc_lock = SPIN_LOCK_UNLOCKED;
121 * This array holds the first and last local port number.
122 * For high-usage systems, use sysctl to change this to
123 * 32768-61000
125 int sysctl_local_port_range[2] = { 1024, 4999 };
126 int tcp_port_rover = (1024 - 1);
128 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
129 __u32 faddr, __u16 fport)
131 int h = ((laddr ^ lport) ^ (faddr ^ fport));
132 h ^= h>>16;
133 h ^= h>>8;
134 return h & (tcp_ehash_size - 1);
137 static __inline__ int tcp_sk_hashfn(struct sock *sk)
139 __u32 laddr = sk->rcv_saddr;
140 __u16 lport = sk->num;
141 __u32 faddr = sk->daddr;
142 __u16 fport = sk->dport;
144 return tcp_hashfn(laddr, lport, faddr, fport);
147 /* Allocate and initialize a new TCP local port bind bucket.
148 * The bindhash mutex for snum's hash chain must be held here.
150 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
151 unsigned short snum)
153 struct tcp_bind_bucket *tb;
155 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
156 if(tb != NULL) {
157 tb->port = snum;
158 tb->fastreuse = 0;
159 tb->owners = NULL;
160 if((tb->next = head->chain) != NULL)
161 tb->next->pprev = &tb->next;
162 head->chain = tb;
163 tb->pprev = &head->chain;
165 return tb;
168 /* Caller must disable local BH processing. */
169 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
171 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
172 struct tcp_bind_bucket *tb;
174 spin_lock(&head->lock);
175 tb = (struct tcp_bind_bucket *)sk->prev;
176 if ((child->bind_next = tb->owners) != NULL)
177 tb->owners->bind_pprev = &child->bind_next;
178 tb->owners = child;
179 child->bind_pprev = &tb->owners;
180 child->prev = (struct sock *) tb;
181 spin_unlock(&head->lock);
184 __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
186 local_bh_disable();
187 __tcp_inherit_port(sk, child);
188 local_bh_enable();
191 /* Obtain a reference to a local port for the given sock,
192 * if snum is zero it means select any available local port.
194 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
196 struct tcp_bind_hashbucket *head;
197 struct tcp_bind_bucket *tb;
198 int ret;
200 local_bh_disable();
201 if (snum == 0) {
202 int low = sysctl_local_port_range[0];
203 int high = sysctl_local_port_range[1];
204 int remaining = (high - low) + 1;
205 int rover;
207 spin_lock(&tcp_portalloc_lock);
208 rover = tcp_port_rover;
209 do { rover++;
210 if ((rover < low) || (rover > high))
211 rover = low;
212 head = &tcp_bhash[tcp_bhashfn(rover)];
213 spin_lock(&head->lock);
214 for (tb = head->chain; tb; tb = tb->next)
215 if (tb->port == rover)
216 goto next;
217 break;
218 next:
219 spin_unlock(&head->lock);
220 } while (--remaining > 0);
221 tcp_port_rover = rover;
222 spin_unlock(&tcp_portalloc_lock);
224 /* Exhausted local port range during search? */
225 ret = 1;
226 if (remaining <= 0)
227 goto fail;
229 /* OK, here is the one we will use. HEAD is
230 * non-NULL and we hold its mutex.
232 snum = rover;
233 tb = NULL;
234 } else {
235 head = &tcp_bhash[tcp_bhashfn(snum)];
236 spin_lock(&head->lock);
237 for (tb = head->chain; tb != NULL; tb = tb->next)
238 if (tb->port == snum)
239 break;
241 if (tb != NULL && tb->owners != NULL) {
242 if (tb->fastreuse != 0 && sk->reuse != 0) {
243 goto success;
244 } else {
245 struct sock *sk2 = tb->owners;
246 int sk_reuse = sk->reuse;
248 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
249 if (sk->bound_dev_if == sk2->bound_dev_if) {
250 if (!sk_reuse ||
251 !sk2->reuse ||
252 sk2->state == TCP_LISTEN) {
253 if (!sk2->rcv_saddr ||
254 !sk->rcv_saddr ||
255 (sk2->rcv_saddr == sk->rcv_saddr))
256 break;
260 /* If we found a conflict, fail. */
261 ret = 1;
262 if (sk2 != NULL)
263 goto fail_unlock;
266 ret = 1;
267 if (tb == NULL &&
268 (tb = tcp_bucket_create(head, snum)) == NULL)
269 goto fail_unlock;
270 if (tb->owners == NULL) {
271 if (sk->reuse && sk->state != TCP_LISTEN)
272 tb->fastreuse = 1;
273 else
274 tb->fastreuse = 0;
275 } else if (tb->fastreuse &&
276 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
277 tb->fastreuse = 0;
278 success:
279 sk->num = snum;
280 if ((sk->bind_next = tb->owners) != NULL)
281 tb->owners->bind_pprev = &sk->bind_next;
282 tb->owners = sk;
283 sk->bind_pprev = &tb->owners;
284 sk->prev = (struct sock *) tb;
285 ret = 0;
287 fail_unlock:
288 spin_unlock(&head->lock);
289 fail:
290 local_bh_enable();
291 return ret;
294 /* Get rid of any references to a local port held by the
295 * given sock.
297 __inline__ void __tcp_put_port(struct sock *sk)
299 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
300 struct tcp_bind_bucket *tb;
302 spin_lock(&head->lock);
303 tb = (struct tcp_bind_bucket *) sk->prev;
304 if (sk->bind_next)
305 sk->bind_next->bind_pprev = sk->bind_pprev;
306 *(sk->bind_pprev) = sk->bind_next;
307 sk->prev = NULL;
308 if (tb->owners == NULL) {
309 if (tb->next)
310 tb->next->pprev = tb->pprev;
311 *(tb->pprev) = tb->next;
312 kmem_cache_free(tcp_bucket_cachep, tb);
314 spin_unlock(&head->lock);
317 void tcp_put_port(struct sock *sk)
319 local_bh_disable();
320 __tcp_put_port(sk);
321 local_bh_enable();
324 #ifdef CONFIG_TCP_TW_RECYCLE
326 Very stupid pseudo-"algorithm". If the approach proves successful
327 (and it will!), we have to make it more reasonable.
328 Now it eats lots of CPU, when we are tough on ports.
330 Apparently, it should be hash table indexed by daddr/dport.
332 How does it work? We allow the time-wait state to be truncated, if:
333 1. PAWS works on it.
334 2. timewait bucket did not receive data for timeout:
335 - initially timeout := 2*RTO, so that if our ACK to the peer's
336 first transmitted FIN is lost, we will see the first retransmit.
337 - if we receive anything, the timeout is increased exponentially
338 to follow normal TCP backoff pattern.
339 It is important that minimal RTO (HZ/5) > minimal timestamp
340 step (1ms).
341 3. When creating a new socket, we inherit the sequence number
342 and ts_recent of the time-wait bucket, increasing them a bit.
344 These two conditions guarantee that data will not be corrupted
345 either by retransmitted or by delayed segments. They do not guarantee
346 that the peer will leave LAST-ACK/CLOSING state gracefully; it will be
347 reset sometimes, namely when more than two of our ACKs to its FINs are lost.
348 This reset is harmless and even good.
351 int tcp_v4_tw_recycle(struct sock *sk, u32 daddr, u16 dport)
353 static int tw_rover;
355 struct tcp_tw_bucket *tw;
356 struct tcp_bind_hashbucket *head;
357 struct tcp_bind_bucket *tb;
359 int low = sysctl_local_port_range[0];
360 int high = sysctl_local_port_range[1];
361 unsigned long now = jiffies;
362 int i, rover;
364 rover = tw_rover;
366 local_bh_disable();
367 for (i=0; i<tcp_bhash_size; i++, rover++) {
368 rover &= (tcp_bhash_size-1);
369 head = &tcp_bhash[rover];
371 spin_lock(&head->lock);
372 for (tb = head->chain; tb; tb = tb->next) {
373 tw = (struct tcp_tw_bucket*)tb->owners;
375 if (tw->state != TCP_TIME_WAIT ||
376 tw->dport != dport ||
377 tw->daddr != daddr ||
378 tw->rcv_saddr != sk->rcv_saddr ||
379 tb->port < low ||
380 tb->port >= high ||
381 !TCP_INET_FAMILY(tw->family) ||
382 tw->ts_recent_stamp == 0 ||
383 (long)(now - tw->ttd) <= 0)
384 continue;
385 tw_rover = rover;
386 goto hit;
388 spin_unlock(&head->lock);
390 local_bh_enable();
391 tw_rover = rover;
392 return -EAGAIN;
394 hit:
395 sk->num = tw->num;
396 if ((sk->bind_next = tb->owners) != NULL)
397 tb->owners->bind_pprev = &sk->bind_next;
398 tb->owners = sk;
399 sk->bind_pprev = &tb->owners;
400 sk->prev = (struct sock *) tb;
401 spin_unlock_bh(&head->lock);
402 return 0;
404 #endif
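/* Take the listening-hash write lock. Lockless readers announce
 * themselves in tcp_lhash_users, so sleep on tcp_lhash_wait (dropping
 * and re-taking the lock) until they have drained before returning
 * with the write lock held.
 */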
407 void tcp_listen_wlock(void)
409 write_lock(&tcp_lhash_lock);
411 if (atomic_read(&tcp_lhash_users)) {
412 DECLARE_WAITQUEUE(wait, current);
414 add_wait_queue(&tcp_lhash_wait, &wait);
415 for (;;) {
416 set_current_state(TASK_UNINTERRUPTIBLE);
417 if (atomic_read(&tcp_lhash_users) == 0)
418 break;
419 write_unlock_bh(&tcp_lhash_lock);
420 schedule();
421 write_lock_bh(&tcp_lhash_lock);
424 __set_current_state(TASK_RUNNING);
425 remove_wait_queue(&tcp_lhash_wait, &wait);
429 static __inline__ void __tcp_v4_hash(struct sock *sk)
431 struct sock **skp;
432 rwlock_t *lock;
434 BUG_TRAP(sk->pprev==NULL);
435 if(sk->state == TCP_LISTEN) {
436 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
437 lock = &tcp_lhash_lock;
438 tcp_listen_wlock();
439 } else {
440 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
441 lock = &tcp_ehash[sk->hashent].lock;
442 write_lock(lock);
444 if((sk->next = *skp) != NULL)
445 (*skp)->pprev = &sk->next;
446 *skp = sk;
447 sk->pprev = skp;
448 sk->prot->inuse++;
449 if(sk->prot->highestinuse < sk->prot->inuse)
450 sk->prot->highestinuse = sk->prot->inuse;
451 write_unlock(lock);
454 static void tcp_v4_hash(struct sock *sk)
456 if (sk->state != TCP_CLOSE) {
457 local_bh_disable();
458 __tcp_v4_hash(sk);
459 local_bh_enable();
463 void tcp_unhash(struct sock *sk)
465 rwlock_t *lock;
467 if (sk->state == TCP_LISTEN) {
468 local_bh_disable();
469 tcp_listen_wlock();
470 lock = &tcp_lhash_lock;
471 } else {
472 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
473 lock = &head->lock;
474 write_lock_bh(&head->lock);
477 if(sk->pprev) {
478 if(sk->next)
479 sk->next->pprev = sk->pprev;
480 *sk->pprev = sk->next;
481 sk->pprev = NULL;
482 sk->prot->inuse--;
484 write_unlock_bh(lock);
487 /* Don't inline this cruft. There are some nice properties to
488 * exploit here. The BSD API does not allow a listening TCP
489 * to specify the remote port nor the remote address for the
490 * connection. So always assume those are both wildcarded
491 * during the search since they can never be otherwise.
493 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
495 struct sock *result = NULL;
496 int score, hiscore;
498 hiscore=0;
499 for(; sk; sk = sk->next) {
500 if(sk->num == hnum) {
501 __u32 rcv_saddr = sk->rcv_saddr;
503 score = 1;
504 if(rcv_saddr) {
505 if (rcv_saddr != daddr)
506 continue;
507 score++;
509 if (sk->bound_dev_if) {
510 if (sk->bound_dev_if != dif)
511 continue;
512 score++;
514 if (score == 3)
515 return sk;
516 if (score > hiscore) {
517 hiscore = score;
518 result = sk;
522 return result;
525 /* Optimize the common listener case. */
526 __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
528 struct sock *sk;
530 read_lock(&tcp_lhash_lock);
531 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
532 if (sk) {
533 if (sk->num == hnum && sk->next == NULL)
534 goto sherry_cache;
535 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
537 if (sk) {
538 sherry_cache:
539 sock_hold(sk);
541 read_unlock(&tcp_lhash_lock);
542 return sk;
545 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
546 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
548 * Local BH must be disabled here.
550 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
551 u32 daddr, u16 hnum, int dif)
553 struct tcp_ehash_bucket *head;
554 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
555 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
556 struct sock *sk;
557 int hash;
559 /* Optimize here for direct hit, only listening connections can
560 * have wildcards anyway.
562 hash = tcp_hashfn(daddr, hnum, saddr, sport);
563 head = &tcp_ehash[hash];
564 read_lock(&head->lock);
565 for(sk = head->chain; sk; sk = sk->next) {
566 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
567 goto hit; /* You sunk my battleship! */
570 /* Must check for a TIME_WAIT'er before going to listener hash. */
571 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
572 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
573 goto hit;
574 read_unlock(&head->lock);
576 return tcp_v4_lookup_listener(daddr, hnum, dif);
578 hit:
579 sock_hold(sk);
580 read_unlock(&head->lock);
581 return sk;
584 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
586 struct sock *sk;
588 local_bh_disable();
589 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
590 local_bh_enable();
592 return sk;
595 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
597 return secure_tcp_sequence_number(sk->saddr, sk->daddr,
598 skb->h.th->dest,
599 skb->h.th->source);
602 static int tcp_v4_check_established(struct sock *sk)
604 u32 daddr = sk->rcv_saddr;
605 u32 saddr = sk->daddr;
606 int dif = sk->bound_dev_if;
607 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
608 __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
609 int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
610 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
611 struct sock *sk2, **skp;
612 #ifdef CONFIG_TCP_TW_RECYCLE
613 struct tcp_tw_bucket *tw;
614 #endif
616 write_lock_bh(&head->lock);
618 /* Check TIME-WAIT sockets first. */
619 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
620 skp = &sk2->next) {
621 #ifdef CONFIG_TCP_TW_RECYCLE
622 tw = (struct tcp_tw_bucket*)sk2;
623 #endif
625 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
626 #ifdef CONFIG_TCP_TW_RECYCLE
627 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
629 /* With PAWS, it is safe from the viewpoint
630 of data integrity. Even without PAWS it
631 is safe provided sequence spaces do not
632 overlap i.e. at data rates <= 80Mbit/sec.
634 Actually, the idea is close to VJ's (rfc1332)
635 one, only the timestamp cache is held not per host
636 but per port pair, and the TW bucket is used
637 as the state holder.
639 if (sysctl_tcp_tw_recycle && tw->ts_recent_stamp) {
640 if ((tp->write_seq = tw->snd_nxt + 2) == 0)
641 tp->write_seq = 1;
642 tp->ts_recent = tw->ts_recent;
643 tp->ts_recent_stamp = tw->ts_recent_stamp;
644 sock_hold(sk2);
645 skp = &head->chain;
646 goto unique;
647 } else
648 #endif
649 goto not_unique;
652 #ifdef CONFIG_TCP_TW_RECYCLE
653 tw = NULL;
654 #endif
656 /* And established part... */
657 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
658 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
659 goto not_unique;
662 #ifdef CONFIG_TCP_TW_RECYCLE
663 unique:
664 #endif
665 BUG_TRAP(sk->pprev==NULL);
666 if ((sk->next = *skp) != NULL)
667 (*skp)->pprev = &sk->next;
669 *skp = sk;
670 sk->pprev = skp;
671 sk->prot->inuse++;
672 if(sk->prot->highestinuse < sk->prot->inuse)
673 sk->prot->highestinuse = sk->prot->inuse;
674 write_unlock_bh(&head->lock);
676 #ifdef CONFIG_TCP_TW_RECYCLE
677 if (tw) {
678 /* Silly. Should hash-dance instead... */
679 local_bh_disable();
680 tcp_tw_deschedule(tw);
681 tcp_timewait_kill(tw);
682 local_bh_enable();
684 tcp_tw_put(tw);
686 #endif
687 return 0;
689 not_unique:
690 write_unlock_bh(&head->lock);
691 return -EADDRNOTAVAIL;
694 /* Hash SYN-SENT socket to established hash table after
695 * checking that it is unique. Note that without the kernel lock
696 * we MUST perform these two operations atomically.
698 * Optimization: if it is bound and the tcp_bind_bucket has us as
699 * its only owner, we need not scan the established bucket.
702 int tcp_v4_hash_connecting(struct sock *sk)
704 unsigned short snum = sk->num;
705 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
706 struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
708 spin_lock_bh(&head->lock);
709 if (tb->owners == sk && sk->bind_next == NULL) {
710 __tcp_v4_hash(sk);
711 spin_unlock_bh(&head->lock);
712 return 0;
713 } else {
714 spin_unlock_bh(&head->lock);
716 /* No definite answer... Walk to established hash table */
717 return tcp_v4_check_established(sk);
721 /* This will initiate an outgoing connection. */
722 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
724 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
725 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
726 struct sk_buff *buff;
727 struct rtable *rt;
728 u32 daddr, nexthop;
729 int tmp;
730 int err;
732 if (sk->state != TCP_CLOSE)
733 return(-EISCONN);
735 if (addr_len < sizeof(struct sockaddr_in))
736 return(-EINVAL);
738 if (usin->sin_family != AF_INET)
739 return(-EAFNOSUPPORT);
741 nexthop = daddr = usin->sin_addr.s_addr;
742 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
743 if (daddr == 0)
744 return -EINVAL;
745 nexthop = sk->protinfo.af_inet.opt->faddr;
748 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
749 RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
750 if (tmp < 0)
751 return tmp;
753 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
754 ip_rt_put(rt);
755 return -ENETUNREACH;
758 __sk_dst_set(sk, &rt->u.dst);
760 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
761 daddr = rt->rt_dst;
763 err = -ENOBUFS;
764 buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
765 0, GFP_KERNEL);
767 if (buff == NULL)
768 goto failure;
770 if (!sk->saddr)
771 sk->saddr = rt->rt_src;
772 sk->rcv_saddr = sk->saddr;
774 if (!sk->num) {
775 if (sk->prot->get_port(sk, 0)
776 #ifdef CONFIG_TCP_TW_RECYCLE
777 && (!sysctl_tcp_tw_recycle ||
778 tcp_v4_tw_recycle(sk, daddr, usin->sin_port))
779 #endif
781 kfree_skb(buff);
782 err = -EAGAIN;
783 goto failure;
785 sk->sport = htons(sk->num);
787 #ifdef CONFIG_TCP_TW_RECYCLE
788 else if (tp->ts_recent_stamp && sk->daddr != daddr) {
789 /* Reset inherited state */
790 tp->ts_recent = 0;
791 tp->ts_recent_stamp = 0;
792 tp->write_seq = 0;
794 #endif
796 sk->dport = usin->sin_port;
797 sk->daddr = daddr;
799 if (!tp->write_seq)
800 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
801 sk->sport, usin->sin_port);
803 tp->ext_header_len = 0;
804 if (sk->protinfo.af_inet.opt)
805 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
807 tp->mss_clamp = 536;
809 err = tcp_connect(sk, buff);
810 if (err == 0)
811 return 0;
813 failure:
814 __sk_dst_reset(sk);
815 sk->dport = 0;
816 return err;
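/* sendmsg() entry point for TCP. Validate the flags and any address
 * supplied in msg_name against the already-connected peer, then hand
 * the actual work to tcp_do_sendmsg().
 */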
819 static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
821 int retval = -EINVAL;
823 lock_sock(sk);
825 /* Do sanity checking for sendmsg/sendto/send. */
826 if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
827 goto out;
828 if (msg->msg_name) {
829 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
831 if (msg->msg_namelen < sizeof(*addr))
832 goto out;
833 if (addr->sin_family && addr->sin_family != AF_INET)
834 goto out;
835 retval = -ENOTCONN;
836 if(sk->state == TCP_CLOSE)
837 goto out;
838 retval = -EISCONN;
839 if (addr->sin_port != sk->dport)
840 goto out;
841 if (addr->sin_addr.s_addr != sk->daddr)
842 goto out;
844 retval = tcp_do_sendmsg(sk, msg);
846 out:
847 release_sock(sk);
848 return retval;
853 * Do a linear search in the socket open_request list.
854 * This should be replaced with a global hash table.
856 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
857 struct iphdr *iph,
858 struct tcphdr *th,
859 struct open_request **prevp)
861 struct open_request *req, *prev;
862 __u16 rport = th->source;
864 /* Assumption: the socket is not in use,
865 * as we checked the user count in tcp_rcv and we're
866 * running from a soft interrupt.
868 prev = (struct open_request *) (&tp->syn_wait_queue);
869 for (req = prev->dl_next; req; req = req->dl_next) {
870 if (req->af.v4_req.rmt_addr == iph->saddr &&
871 req->af.v4_req.loc_addr == iph->daddr &&
872 req->rmt_port == rport &&
873 TCP_INET_FAMILY(req->class->family)) {
874 if (req->sk) {
875 /* Weird case: connection was established
876 and then killed by RST before user accepted
877 it. This connection is dead, but we cannot
878 kill openreq to avoid blocking in accept().
880 accept() will collect this garbage,
881 but such reqs must be ignored when talking
882 to the network.
884 bh_lock_sock(req->sk);
885 BUG_TRAP(req->sk->lock.users==0);
886 if (req->sk->state == TCP_CLOSE) {
887 bh_unlock_sock(req->sk);
888 prev = req;
889 continue;
892 *prevp = prev;
893 return req;
895 prev = req;
897 return NULL;
902 * This routine does path mtu discovery as defined in RFC1191.
904 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
906 struct dst_entry *dst;
907 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
909 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
910 * sent out by Linux are always <576 bytes, so they should go through
911 * unfragmented).
913 if (sk->state == TCP_LISTEN)
914 return;
916 /* We don't check in the dst entry whether pmtu discovery is forbidden
917 * on this route. We just assume that no packet-too-big packets
918 * are sent back when pmtu discovery is not active.
919 * There is a small race when the user changes this flag in the
920 * route, but I think that's acceptable.
922 if ((dst = __sk_dst_check(sk, 0)) == NULL)
923 return;
925 ip_rt_update_pmtu(dst, mtu);
927 /* Something is about to go wrong... Remember the soft error
928 * in case this connection is not able to recover.
930 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
931 sk->err_soft = EMSGSIZE;
933 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
934 tp->pmtu_cookie > dst->pmtu) {
935 tcp_sync_mss(sk, dst->pmtu);
937 /* Resend the TCP packet because it's
938 * clear that the old packet has been
939 * dropped. This is the new "fast" path mtu
940 * discovery.
942 tcp_simple_retransmit(sk);
943 } /* else let the usual retransmit timer handle it */
947 * This routine is called by the ICMP module when it gets some
948 * sort of error condition. If err < 0 then the socket should
949 * be closed and the error returned to the user. If err > 0
950 * it's just the icmp type << 8 | icmp code. After adjustment
951 * header points to the first 8 bytes of the tcp header. We need
952 * to find the appropriate port.
954 * The locking strategy used here is very "optimistic". When
955 * someone else accesses the socket the ICMP is just dropped
956 * and for some paths there is no check at all.
957 * A more general error queue to queue errors for later handling
958 * is probably better.
962 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
964 struct iphdr *iph = (struct iphdr*)dp;
965 struct tcphdr *th;
966 struct tcp_opt *tp;
967 int type = skb->h.icmph->type;
968 int code = skb->h.icmph->code;
969 #if ICMP_MIN_LENGTH < 14
970 int no_flags = 0;
971 #else
972 #define no_flags 0
973 #endif
974 struct sock *sk;
975 __u32 seq;
976 int err;
978 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
979 icmp_statistics.IcmpInErrors++;
980 return;
982 #if ICMP_MIN_LENGTH < 14
983 if (len < (iph->ihl << 2) + 14)
984 no_flags = 1;
985 #endif
987 th = (struct tcphdr*)(dp+(iph->ihl<<2));
989 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
990 if (sk == NULL) {
991 icmp_statistics.IcmpInErrors++;
992 return;
994 if (sk->state == TCP_TIME_WAIT) {
995 tcp_tw_put((struct tcp_tw_bucket*)sk);
996 return;
999 bh_lock_sock(sk);
1000 /* If too many ICMPs get dropped on busy
1001 * servers this needs to be solved differently.
1003 if (sk->lock.users != 0)
1004 net_statistics.LockDroppedIcmps++;
1006 tp = &sk->tp_pinfo.af_tcp;
1007 seq = ntohl(th->seq);
1008 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
1009 net_statistics.OutOfWindowIcmps++;
1010 goto out;
1013 switch (type) {
1014 case ICMP_SOURCE_QUENCH:
1015 #ifndef OLD_SOURCE_QUENCH /* This is deprecated */
1016 if (sk->lock.users == 0) {
1017 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
1018 tp->snd_cwnd = tp->snd_ssthresh;
1019 tp->snd_cwnd_cnt = 0;
1020 tp->high_seq = tp->snd_nxt;
1022 #endif
1023 goto out;
1024 case ICMP_PARAMETERPROB:
1025 err = EPROTO;
1026 break;
1027 case ICMP_DEST_UNREACH:
1028 if (code > NR_ICMP_UNREACH)
1029 goto out;
1031 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1032 if (sk->lock.users == 0)
1033 do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
1034 goto out;
1037 err = icmp_err_convert[code].errno;
1038 break;
1039 case ICMP_TIME_EXCEEDED:
1040 err = EHOSTUNREACH;
1041 break;
1042 default:
1043 goto out;
1046 switch (sk->state) {
1047 struct open_request *req, *prev;
1048 case TCP_LISTEN:
1049 if (sk->lock.users != 0)
1050 goto out;
1052 /* The final ACK of the handshake should be already
1053 * handled in the new socket context, not here.
1054 * Strictly speaking - an ICMP error for the final
1055 * ACK should set the opening flag, but that is too
1056 * complicated right now.
1058 if (!no_flags && !th->syn && !th->ack)
1059 goto out;
1061 req = tcp_v4_search_req(tp, iph, th, &prev);
1062 if (!req)
1063 goto out;
1065 if (req->sk) {
1066 struct sock *nsk = req->sk;
1069 * Already in ESTABLISHED and a big socket is created,
1070 * set error code there.
1071 * The error will _not_ be reported in the accept(),
1072 * but only with the next operation on the socket after
1073 * accept.
1075 sock_hold(nsk);
1076 bh_unlock_sock(sk);
1077 sock_put(sk);
1078 sk = nsk;
1080 BUG_TRAP(sk->lock.users == 0);
1081 tp = &sk->tp_pinfo.af_tcp;
1082 if (!between(seq, tp->snd_una, tp->snd_nxt)) {
1083 net_statistics.OutOfWindowIcmps++;
1084 goto out;
1086 } else {
1087 if (seq != req->snt_isn) {
1088 net_statistics.OutOfWindowIcmps++;
1089 goto out;
1093 * Still in SYN_RECV, just remove it silently.
1094 * There is no good way to pass the error to the newly
1095 * created socket, and POSIX does not want network
1096 * errors returned from accept().
1098 tp->syn_backlog--;
1099 tcp_synq_unlink(tp, req, prev);
1100 tcp_dec_slow_timer(TCP_SLT_SYNACK);
1101 req->class->destructor(req);
1102 tcp_openreq_free(req);
1103 goto out;
1105 break;
1106 case TCP_SYN_SENT:
1107 case TCP_SYN_RECV: /* Cannot happen.
1108 It can, for example, if SYNs crossed.
1110 if (!no_flags && !th->syn)
1111 goto out;
1112 if (sk->lock.users == 0) {
1113 tcp_statistics.TcpAttemptFails++;
1114 sk->err = err;
1115 /* Wake people up to see the error (see connect in sock.c) */
1116 sk->error_report(sk);
1118 tcp_set_state(sk, TCP_CLOSE);
1119 tcp_done(sk);
1120 } else {
1121 sk->err_soft = err;
1123 goto out;
1126 /* If we've already connected we will keep trying
1127 * until we time out, or the user gives up.
1129 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
1130 * to be considered hard errors (well, FRAG_FAILED too,
1131 * but it is obsoleted by pmtu discovery).
1133 * Note that in the modern internet, where routing is unreliable
1134 * and broken firewalls sit in every dark corner sending random
1135 * errors ordered by their masters, even these two messages finally lose
1136 * their original sense (even Linux sends invalid PORT_UNREACHs).
1138 * Now we are in compliance with RFCs.
1139 * --ANK (980905)
1142 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1143 sk->err = err;
1144 sk->error_report(sk);
1145 } else { /* Only an error on timeout */
1146 sk->err_soft = err;
1149 out:
1150 bh_unlock_sock(sk);
1151 sock_put(sk);
1154 /* This routine computes an IPv4 TCP checksum. */
1155 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1156 struct sk_buff *skb)
1158 th->check = 0;
1159 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1160 csum_partial((char *)th, th->doff<<2, skb->csum));
1164 * This routine will send an RST to the other tcp.
1166 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1167 * for reset.
1168 * Answer: if a packet caused a RST, it is not for a socket
1169 * existing in our system; if it is matched to a socket,
1170 * it is just a duplicate segment or a bug in the other side's TCP.
1171 * So we build the reply based only on the parameters
1172 * that arrived with the segment.
1173 * Exception: precedence violation. We do not implement it in any case.
1176 static void tcp_v4_send_reset(struct sk_buff *skb)
1178 struct tcphdr *th = skb->h.th;
1179 struct tcphdr rth;
1180 struct ip_reply_arg arg;
1182 /* Never send a reset in response to a reset. */
1183 if (th->rst)
1184 return;
1186 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1187 return;
1189 /* Swap the send and the receive. */
1190 memset(&rth, 0, sizeof(struct tcphdr));
1191 rth.dest = th->source;
1192 rth.source = th->dest;
1193 rth.doff = sizeof(struct tcphdr)/4;
1194 rth.rst = 1;
1196 if (th->ack) {
1197 rth.seq = th->ack_seq;
1198 } else {
1199 rth.ack = 1;
1200 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1201 + skb->len - (th->doff<<2));
1204 memset(&arg, 0, sizeof arg);
1205 arg.iov[0].iov_base = (unsigned char *)&rth;
1206 arg.iov[0].iov_len = sizeof rth;
1207 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1208 skb->nh.iph->saddr, /*XXX*/
1209 sizeof(struct tcphdr),
1210 IPPROTO_TCP,
1211 0);
1212 arg.n_iov = 1;
1213 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1215 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1217 tcp_statistics.TcpOutSegs++;
1218 tcp_statistics.TcpOutRsts++;
1221 /* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
1222 outside socket context, is certainly ugly. What can I do?
1225 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1227 struct tcphdr *th = skb->h.th;
1228 struct {
1229 struct tcphdr th;
1230 u32 tsopt[3];
1231 } rep;
1232 struct ip_reply_arg arg;
1234 memset(&rep.th, 0, sizeof(struct tcphdr));
1235 memset(&arg, 0, sizeof arg);
1237 arg.iov[0].iov_base = (unsigned char *)&rep;
1238 arg.iov[0].iov_len = sizeof(rep.th);
1239 arg.n_iov = 1;
1240 if (ts) {
1241 rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
1242 (TCPOPT_NOP << 16) |
1243 (TCPOPT_TIMESTAMP << 8) |
1244 TCPOLEN_TIMESTAMP);
1245 rep.tsopt[1] = htonl(tcp_time_stamp);
1246 rep.tsopt[2] = htonl(ts);
1247 arg.iov[0].iov_len = sizeof(rep);
1250 /* Swap the send and the receive. */
1251 rep.th.dest = th->source;
1252 rep.th.source = th->dest;
1253 rep.th.doff = arg.iov[0].iov_len/4;
1254 rep.th.seq = htonl(seq);
1255 rep.th.ack_seq = htonl(ack);
1256 rep.th.ack = 1;
1257 rep.th.window = htons(win);
1259 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1260 skb->nh.iph->saddr, /*XXX*/
1261 arg.iov[0].iov_len,
1262 IPPROTO_TCP,
1264 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1266 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1268 tcp_statistics.TcpOutSegs++;
1271 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1273 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1275 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, 0, tw->ts_recent);
1277 tcp_tw_put(tw);
1280 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1282 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent);
1286 * Send a SYN-ACK after having received an ACK.
1287 * This still operates on an open_request only, not on a big
1288 * socket.
1290 static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
1292 struct rtable *rt;
1293 struct ip_options *opt;
1294 struct sk_buff * skb;
1296 /* First, grab a route. */
1297 opt = req->af.v4_req.opt;
1298 if(ip_route_output(&rt, ((opt && opt->srr) ?
1299 opt->faddr :
1300 req->af.v4_req.rmt_addr),
1301 req->af.v4_req.loc_addr,
1302 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1303 sk->bound_dev_if)) {
1304 ip_statistics.IpOutNoRoutes++;
1305 return;
1307 if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1308 ip_rt_put(rt);
1309 ip_statistics.IpOutNoRoutes++;
1310 return;
1313 skb = tcp_make_synack(sk, &rt->u.dst, req);
1315 if (skb) {
1316 struct tcphdr *th = skb->h.th;
1318 th->check = tcp_v4_check(th, skb->len,
1319 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1320 csum_partial((char *)th, skb->len, skb->csum));
1322 ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1323 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1325 ip_rt_put(rt);
1329 * IPv4 open_request destructor.
1331 static void tcp_v4_or_free(struct open_request *req)
1333 if(!req->sk && req->af.v4_req.opt)
1334 kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
1337 static inline void syn_flood_warning(struct sk_buff *skb)
1339 static unsigned long warntime;
1341 if (jiffies - warntime > HZ*60) {
1342 warntime = jiffies;
1343 printk(KERN_INFO
1344 "possible SYN flooding on port %d. Sending cookies.\n",
1345 ntohs(skb->h.th->dest));
1350 * Save and compile IPv4 options into the open_request if needed.
1352 static inline struct ip_options *
1353 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1355 struct ip_options *opt = &(IPCB(skb)->opt);
1356 struct ip_options *dopt = NULL;
1358 if (opt && opt->optlen) {
1359 int opt_size = optlength(opt);
1360 dopt = kmalloc(opt_size, GFP_ATOMIC);
1361 if (dopt) {
1362 if (ip_options_echo(dopt, skb)) {
1363 kfree_s(dopt, opt_size);
1364 dopt = NULL;
1368 return dopt;
1372 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1373 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1374 * It would be better to replace it with a global counter for all sockets,
1375 * but then some measure against one socket starving all other sockets
1376 * would be needed.
1378 int sysctl_max_syn_backlog = 128;
1380 struct or_calltable or_ipv4 = {
1381 PF_INET,
1382 tcp_v4_send_synack,
1383 tcp_v4_or_send_ack,
1384 tcp_v4_or_free,
1385 tcp_v4_send_reset
1388 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1389 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
1391 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1393 struct tcp_opt tp;
1394 struct open_request *req;
1395 struct tcphdr *th = skb->h.th;
1396 __u32 saddr = skb->nh.iph->saddr;
1397 __u32 daddr = skb->nh.iph->daddr;
1398 __u32 isn = TCP_SKB_CB(skb)->when;
1399 #ifdef CONFIG_SYN_COOKIES
1400 int want_cookie = 0;
1401 #else
1402 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1403 #endif
1405 /* Never answer to SYNs sent to broadcast or multicast */
1406 if (((struct rtable *)skb->dst)->rt_flags &
1407 (RTCF_BROADCAST|RTCF_MULTICAST))
1408 goto drop;
1410 /* XXX: Check against a global syn pool counter. */
1411 if (BACKLOG(sk) > BACKLOGMAX(sk)) {
1412 #ifdef CONFIG_SYN_COOKIES
1413 if (sysctl_tcp_syncookies && !isn) {
1414 syn_flood_warning(skb);
1415 want_cookie = 1;
1416 } else
1417 #endif
1418 goto drop;
1419 } else {
1420 if (isn == 0)
1421 isn = tcp_v4_init_sequence(sk, skb);
1422 BACKLOG(sk)++;
1425 req = tcp_openreq_alloc();
1426 if (req == NULL) {
1427 goto dropbacklog;
1430 req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
1432 req->rcv_isn = TCP_SKB_CB(skb)->seq;
1433 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1435 tp.mss_clamp = 536;
1436 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1438 tcp_parse_options(NULL, th, &tp, want_cookie);
1440 req->mss = tp.mss_clamp;
1441 req->ts_recent = tp.saw_tstamp ? tp.rcv_tsval : 0;
1442 req->tstamp_ok = tp.tstamp_ok;
1443 req->sack_ok = tp.sack_ok;
1444 req->snd_wscale = tp.snd_wscale;
1445 req->wscale_ok = tp.wscale_ok;
1446 req->rmt_port = th->source;
1447 req->af.v4_req.loc_addr = daddr;
1448 req->af.v4_req.rmt_addr = saddr;
1450 /* Note that we ignore the isn passed from the TIME_WAIT
1451 * state here. That's the price we pay for cookies.
1453 * RED-PEN. The price is high... Then we cannot kill TIME-WAIT
1454 * and should reject the connection attempt; duplicates with a random
1455 * sequence number can corrupt data. Right?
1456 * I disabled sending a cookie for a request matching a timewait
1457 * bucket.
1459 if (want_cookie)
1460 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1462 req->snt_isn = isn;
1464 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1466 req->class = &or_ipv4;
1467 req->retrans = 0;
1468 req->sk = NULL;
1470 tcp_v4_send_synack(sk, req);
1472 if (want_cookie) {
1473 if (req->af.v4_req.opt)
1474 kfree(req->af.v4_req.opt);
1475 tcp_v4_or_free(req);
1476 tcp_openreq_free(req);
1477 } else {
1478 req->expires = jiffies + TCP_TIMEOUT_INIT;
1479 tcp_inc_slow_timer(TCP_SLT_SYNACK);
1480 tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
1483 return 0;
1485 dropbacklog:
1486 if (!want_cookie)
1487 BACKLOG(sk)--;
1488 drop:
1489 tcp_statistics.TcpAttemptFails++;
1490 return 0;
1495 * The three way handshake has completed - we got a valid synack -
1496 * now create the new socket.
1498 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1499 struct open_request *req,
1500 struct dst_entry *dst)
1502 struct ip_options *opt = req->af.v4_req.opt;
1503 struct tcp_opt *newtp;
1504 struct sock *newsk;
1506 if (sk->ack_backlog > sk->max_ack_backlog)
1507 goto exit; /* head drop */
1508 if (dst == NULL) {
1509 struct rtable *rt;
1511 if (ip_route_output(&rt,
1512 opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
1513 req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos|RTO_CONN, 0))
1514 return NULL;
1515 dst = &rt->u.dst;
1518 newsk = tcp_create_openreq_child(sk, req, skb);
1519 if (!newsk)
1520 goto exit;
1522 sk->tp_pinfo.af_tcp.syn_backlog--;
1523 sk->ack_backlog++;
1525 newsk->dst_cache = dst;
1527 newtp = &(newsk->tp_pinfo.af_tcp);
1528 newsk->daddr = req->af.v4_req.rmt_addr;
1529 newsk->saddr = req->af.v4_req.loc_addr;
1530 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1531 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1532 newsk->protinfo.af_inet.mc_index = ((struct rtable*)skb->dst)->rt_iif;
1533 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1534 newtp->ext_header_len = 0;
1535 if (newsk->protinfo.af_inet.opt)
1536 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1538 tcp_sync_mss(newsk, dst->pmtu);
1539 tcp_initialize_rcv_mss(newsk);
1541 if (newsk->rcvbuf < (3 * (dst->advmss+40+MAX_HEADER+15)))
1542 newsk->rcvbuf = min ((3 * (dst->advmss+40+MAX_HEADER+15)), sysctl_rmem_max);
1543 if (newsk->sndbuf < (3 * (newtp->mss_clamp+40+MAX_HEADER+15)))
1544 newsk->sndbuf = min ((3 * (newtp->mss_clamp+40+MAX_HEADER+15)), sysctl_wmem_max);
1546 bh_lock_sock(newsk);
1548 __tcp_v4_hash(newsk);
1549 __tcp_inherit_port(sk, newsk);
1551 return newsk;
1553 exit:
1554 dst_release(dst);
1555 return NULL;
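/* Handle a segment arriving on a listening socket: look for a matching
 * open_request and let tcp_check_req() process it; otherwise, if SYN
 * cookies are compiled in, give cookie_v4_check() a chance to recover
 * a connection from a valid cookie ACK.
 */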
1559 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1561 struct open_request *req, *prev;
1562 struct tcphdr *th = skb->h.th;
1563 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1565 /* Find possible connection requests. */
1566 req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);
1567 if (req)
1568 return tcp_check_req(sk, skb, req, prev);
1570 #ifdef CONFIG_SYN_COOKIES
1571 if (!th->rst && (th->syn || th->ack))
1572 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1573 #endif
1574 return sk;
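/* Verify the TCP checksum of an incoming segment. Returns 0 if the
 * checksum is correct (or was already verified by hardware), 1 if it
 * is bad. Note the intentional fall-through from CHECKSUM_NONE into
 * CHECKSUM_HW once the software checksum has been computed.
 */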
1577 static int tcp_csum_verify(struct sk_buff *skb)
1579 switch (skb->ip_summed) {
1580 case CHECKSUM_NONE:
1581 skb->csum = csum_partial((char *)skb->h.th, skb->len, 0);
1582 case CHECKSUM_HW:
1583 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
1584 NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
1585 "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
1586 "len=%d/%d\n",
1587 NIPQUAD(skb->nh.iph->saddr),
1588 ntohs(skb->h.th->source),
1589 NIPQUAD(skb->nh.iph->daddr),
1590 ntohs(skb->h.th->dest),
1591 skb->len,
1592 ntohs(skb->nh.iph->tot_len)));
1593 return 1;
1595 skb->ip_summed = CHECKSUM_UNNECESSARY;
1596 default:
1597 /* CHECKSUM_UNNECESSARY */
1599 return 0;
1603 /* The socket must have its spinlock held when we get
1604 * here.
1606 * We have a potential double-lock case here, so even when
1607 * doing backlog processing we use the BH locking scheme.
1608 * This is because we cannot sleep with the original spinlock
1609 * held.
1611 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1613 #ifdef CONFIG_FILTER
1614 struct sk_filter *filter = sk->filter;
1615 if (filter && sk_filter(skb, filter))
1616 goto discard;
1617 #endif /* CONFIG_FILTER */
1620 * This doesn't check if the socket has enough room for the packet.
1621 * Either process the packet _without_ queueing it and then free it,
1622 * or do the check later.
1624 skb_set_owner_r(skb, sk);
1626 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1627 /* Ready to move deeper ... */
1628 if (tcp_csum_verify(skb))
1629 goto csum_err;
1630 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1631 goto reset;
1632 return 0;
1635 if (tcp_csum_verify(skb))
1636 goto csum_err;
1638 if (sk->state == TCP_LISTEN) {
1639 struct sock *nsk;
1641 nsk = tcp_v4_hnd_req(sk, skb);
1642 if (!nsk)
1643 goto discard;
1646 * Queue it on the new socket if the new socket is active,
1647 * otherwise we just short-circuit this and continue with
1648 * the new socket.
1650 if (nsk != sk) {
1651 int ret;
1652 int state = nsk->state;
1654 skb_orphan(skb);
1656 BUG_TRAP(nsk->lock.users == 0);
1657 skb_set_owner_r(skb, nsk);
1658 ret = tcp_rcv_state_process(nsk, skb, skb->h.th, skb->len);
1660 /* Wake up the parent and send SIGIO if this packet changed
1661 the socket state from SYN-RECV.
1663 It still looks ugly, but it is much better
1664 than the miraculous double wakeup in syn_recv_sock()
1665 and tcp_rcv_state_process().
1667 if (state == TCP_SYN_RECV && nsk->state != state)
1668 sk->data_ready(sk, 0);
1670 bh_unlock_sock(nsk);
1671 if (ret)
1672 goto reset;
1673 return 0;
1677 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1678 goto reset;
1679 return 0;
1681 reset:
1682 tcp_v4_send_reset(skb);
1683 discard:
1684 kfree_skb(skb);
1685 /* Be careful here. If this function gets more complicated and
1686 * gcc suffers from register pressure on the x86, sk (in %ebx)
1687 * might be destroyed here. This current version compiles correctly,
1688 * but you have been warned.
1690 return 0;
1692 csum_err:
1693 tcp_statistics.TcpInErrs++;
1694 goto discard;
1698 * From tcp_input.c
1701 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1703 struct tcphdr *th;
1704 struct sock *sk;
1705 int ret;
1707 if (skb->pkt_type!=PACKET_HOST)
1708 goto discard_it;
1710 th = skb->h.th;
1712 /* Pull up the IP header. */
1713 __skb_pull(skb, skb->h.raw - skb->data);
1715 /* Count it even if it's bad */
1716 tcp_statistics.TcpInSegs++;
1718 if (len < sizeof(struct tcphdr))
1719 goto bad_packet;
1721 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1722 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1723 len - th->doff*4);
1724 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1725 TCP_SKB_CB(skb)->when = 0;
1726 skb->used = 0;
1728 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1729 skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex);
1731 if (!sk)
1732 goto no_tcp_socket;
1734 process:
1735 if(!ipsec_sk_policy(sk,skb))
1736 goto discard_and_relse;
1738 if (sk->state == TCP_TIME_WAIT)
1739 goto do_time_wait;
1741 bh_lock_sock(sk);
1742 ret = 0;
1743 if (!sk->lock.users)
1744 ret = tcp_v4_do_rcv(sk, skb);
1745 else
1746 sk_add_backlog(sk, skb);
1747 bh_unlock_sock(sk);
1749 sock_put(sk);
1751 return ret;
1753 no_tcp_socket:
1754 if (tcp_csum_verify(skb)) {
1755 bad_packet:
1756 tcp_statistics.TcpInErrs++;
1757 } else {
1758 tcp_v4_send_reset(skb);
1761 discard_it:
1762 /* Discard frame. */
1763 kfree_skb(skb);
1764 return 0;
1766 discard_and_relse:
1767 sock_put(sk);
1768 goto discard_it;
1770 do_time_wait:
1771 if (tcp_csum_verify(skb)) {
1772 tcp_statistics.TcpInErrs++;
1773 goto discard_and_relse;
1775 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1776 skb, th, skb->len)) {
1777 case TCP_TW_SYN:
1779 struct sock *sk2;
1781 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex);
1782 if (sk2 != NULL) {
1783 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1784 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1785 tcp_tw_put((struct tcp_tw_bucket *)sk);
1786 sk = sk2;
1787 goto process;
1789 /* Fall through to ACK */
1791 case TCP_TW_ACK:
1792 tcp_v4_timewait_ack(sk, skb);
1793 break;
1794 case TCP_TW_RST:
1795 goto no_tcp_socket;
1796 case TCP_TW_SUCCESS:
1798 goto discard_it;
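/* Move an already-hashed socket to the chain matching its recomputed
 * hash value. Used by tcp_v4_rebuild_header() below when ip_dynaddr
 * rewrites the source address of a SYN_SENT socket.
 */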
1801 static void __tcp_v4_rehash(struct sock *sk)
1803 struct tcp_ehash_bucket *oldhead = &tcp_ehash[sk->hashent];
1804 struct tcp_ehash_bucket *head = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
1805 struct sock **skp = &head->chain;
1807 write_lock_bh(&oldhead->lock);
1808 if(sk->pprev) {
1809 if(sk->next)
1810 sk->next->pprev = sk->pprev;
1811 *sk->pprev = sk->next;
1812 sk->pprev = NULL;
1814 write_unlock(&oldhead->lock);
1815 write_lock(&head->lock);
1816 if((sk->next = *skp) != NULL)
1817 (*skp)->pprev = &sk->next;
1818 *skp = sk;
1819 sk->pprev = skp;
1820 write_unlock_bh(&head->lock);
1823 int tcp_v4_rebuild_header(struct sock *sk)
1825 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1826 __u32 new_saddr;
1827 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
1829 if(rt == NULL)
1830 return 0;
1832 /* Force route checking if want_rewrite.
1833 * The idea is good, the implementation is disgusting.
1834 * Well, if I did a bind on this socket, you cannot randomly overwrite
1835 * its source address. --ANK
1837 if (want_rewrite) {
1838 int tmp;
1839 struct rtable *new_rt;
1840 __u32 old_saddr = rt->rt_src;
1842 /* Query new route using another rt buffer */
1843 tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
1844 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1845 sk->bound_dev_if);
1847 /* Only useful if different source addrs */
1848 if (tmp == 0) {
1850 * Only useful if different source addrs
1852 if (new_rt->rt_src != old_saddr ) {
1853 __sk_dst_set(sk, &new_rt->u.dst);
1854 rt = new_rt;
1855 goto do_rewrite;
1857 dst_release(&new_rt->u.dst);
1860 if (rt->u.dst.obsolete) {
1861 int err;
1862 err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
1863 if (err) {
1864 sk->err_soft=-err;
1865 sk->error_report(sk);
1866 return -1;
1868 __sk_dst_set(sk, &rt->u.dst);
1871 return 0;
1873 do_rewrite:
1874 new_saddr = rt->rt_src;
1876 /* Ouch!, this should not happen. */
1877 if (!sk->saddr || !sk->rcv_saddr) {
1878 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
1879 "saddr=%08lX rcv_saddr=%08lX\n",
1880 ntohl(sk->saddr),
1881 ntohl(sk->rcv_saddr));
1882 return 0;
1885 if (new_saddr != sk->saddr) {
1886 if (sysctl_ip_dynaddr > 1) {
1887 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1888 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1889 NIPQUAD(sk->saddr),
1890 NIPQUAD(new_saddr));
1893 sk->saddr = new_saddr;
1894 sk->rcv_saddr = new_saddr;
1896 /* XXX The only ugly spot where we need to
1897 * XXX really change the socket's identity after
1898 * XXX it has entered the hashes. -DaveM
1900 * Besides that, it does not check for connection
1901 * uniqueness. Wait for trouble.
1903 __tcp_v4_rehash(sk);
1906 return 0;
1909 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1911 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1913 sin->sin_family = AF_INET;
1914 sin->sin_addr.s_addr = sk->daddr;
1915 sin->sin_port = sk->dport;
1918 struct tcp_func ipv4_specific = {
1919 ip_queue_xmit,
1920 tcp_v4_send_check,
1921 tcp_v4_rebuild_header,
1922 tcp_v4_conn_request,
1923 tcp_v4_syn_recv_sock,
1924 tcp_v4_hash_connecting,
1925 sizeof(struct iphdr),
1927 ip_setsockopt,
1928 ip_getsockopt,
1929 v4_addr2sockaddr,
1930 sizeof(struct sockaddr_in)
1933 /* NOTE: A lot of things are set to zero explicitly by the call to
1934 * sk_alloc(), so they need not be done here.
1936 static int tcp_v4_init_sock(struct sock *sk)
1938 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1940 skb_queue_head_init(&tp->out_of_order_queue);
1941 tcp_init_xmit_timers(sk);
1943 tp->rto = TCP_TIMEOUT_INIT;
1944 tp->mdev = TCP_TIMEOUT_INIT;
1946 /* So many TCP implementations out there (incorrectly) count the
1947 * initial SYN frame in their delayed-ACK and congestion control
1948 * algorithms that we must have the following bandaid to talk
1949 * efficiently to them. -DaveM
1951 tp->snd_cwnd = 2;
1953 /* See draft-stevens-tcpca-spec-01 for discussion of the
1954 * initialization of these values.
1956 tp->snd_cwnd_cnt = 0;
1957 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1958 tp->snd_cwnd_clamp = ~0;
1959 tp->mss_cache = 536;
1961 sk->state = TCP_CLOSE;
1962 sk->max_ack_backlog = SOMAXCONN;
1964 sk->write_space = tcp_write_space;
1966 /* Init SYN queue. */
1967 tcp_synq_init(tp);
1969 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1971 return 0;
1974 static int tcp_v4_destroy_sock(struct sock *sk)
1976 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1978 tcp_clear_xmit_timers(sk);
1980 /* Clean up the write buffer. */
1981 __skb_queue_purge(&sk->write_queue);
1983 /* Cleans up our, hopefully empty, out_of_order_queue. */
1984 __skb_queue_purge(&tp->out_of_order_queue);
1986 /* Clean up a referenced TCP bind bucket; this only happens if a
1987 * port is allocated for a socket but it never fully connects.
1989 if(sk->prev != NULL)
1990 tcp_put_port(sk);
1992 return 0;
1995 /* Proc filesystem TCP sock list dumping. */
1996 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i)
1998 sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X"
1999 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2001 (long unsigned int)req->af.v4_req.loc_addr,
2002 ntohs(sk->sport),
2003 (long unsigned int)req->af.v4_req.rmt_addr,
2004 ntohs(req->rmt_port),
2005 TCP_SYN_RECV,
2006 0,0, /* could print option size, but that is af dependent. */
2007 1, /* timers active (only the expire timer) */
2008 (unsigned long)(req->expires - jiffies),
2009 req->retrans,
2010 sk->socket ? sk->socket->inode->i_uid : 0,
2011 0, /* non standard timer */
2012 0, /* open_requests have no inode */
2013 atomic_read(&sk->refcnt),
2018 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2020 unsigned int dest, src;
2021 __u16 destp, srcp;
2022 int timer_active, timer_active1, timer_active2;
2023 unsigned long timer_expires;
2024 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2026 dest = sp->daddr;
2027 src = sp->rcv_saddr;
2028 destp = ntohs(sp->dport);
2029 srcp = ntohs(sp->sport);
2030 timer_active1 = tp->retransmit_timer.prev != NULL;
2031 timer_active2 = sp->timer.prev != NULL;
2032 timer_active = 0;
2033 timer_expires = (unsigned) -1;
2034 if (timer_active1 && tp->retransmit_timer.expires < timer_expires) {
2035 timer_active = 1;
2036 timer_expires = tp->retransmit_timer.expires;
2038 if (timer_active2 && sp->timer.expires < timer_expires) {
2039 timer_active = 2;
2040 timer_expires = sp->timer.expires;
2042 if(timer_active == 0)
2043 timer_expires = jiffies;
2045 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2046 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
2047 i, src, srcp, dest, destp, sp->state,
2048 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2049 timer_active, timer_expires-jiffies,
2050 tp->retransmits,
2051 sp->socket ? sp->socket->inode->i_uid : 0,
2053 sp->socket ? sp->socket->inode->i_ino : 0,
2054 atomic_read(&sp->refcnt), sp);
2057 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2059 unsigned int dest, src;
2060 __u16 destp, srcp;
2061 int slot_dist;
2063 dest = tw->daddr;
2064 src = tw->rcv_saddr;
2065 destp = ntohs(tw->dport);
2066 srcp = ntohs(tw->sport);
2068 slot_dist = tw->death_slot;
2069 if(slot_dist > tcp_tw_death_row_slot)
2070 slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot;
2071 else
2072 slot_dist = tcp_tw_death_row_slot - slot_dist;
2074 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2075 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2076 i, src, srcp, dest, destp, TCP_TIME_WAIT, 0, 0,
2077 3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0,
2078 atomic_read(&tw->refcnt), tw);
2081 int tcp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
2083 int len = 0, num = 0, i;
2084 off_t begin, pos = 0;
2085 char tmpbuf[129];
2087 if (offset < 128)
2088 len += sprintf(buffer, "%-127s\n",
2089 " sl local_address rem_address st tx_queue "
2090 "rx_queue tr tm->when retrnsmt uid timeout inode");
2092 pos = 128;
2094 /* First, walk listening socket table. */
2095 tcp_listen_lock();
2096 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2097 struct sock *sk = tcp_listening_hash[i];
2099 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2100 struct open_request *req;
2101 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2103 if (!TCP_INET_FAMILY(sk->family))
2104 goto skip_listen;
2106 pos += 128;
2107 if (pos >= offset) {
2108 get_tcp_sock(sk, tmpbuf, num);
2109 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2110 if (len >= length) {
2111 tcp_listen_unlock();
2112 goto out_no_bh;
2116 skip_listen:
2117 lock_sock(sk);
2118 for (req = tp->syn_wait_queue; req; req = req->dl_next, num++) {
2119 if (req->sk)
2120 continue;
2121 if (!TCP_INET_FAMILY(req->class->family))
2122 continue;
2124 pos += 128;
2125 if (pos < offset)
2126 continue;
2127 get_openreq(sk, req, tmpbuf, num);
2128 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2129 if(len >= length) {
2130 tcp_listen_unlock();
2131 release_sock(sk);
2132 goto out_no_bh;
2135 release_sock(sk);
2138 tcp_listen_unlock();
2140 local_bh_disable();
2142 /* Next, walk established hash chain. */
2143 for (i = 0; i < tcp_ehash_size; i++) {
2144 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2145 struct sock *sk;
2146 struct tcp_tw_bucket *tw;
2148 read_lock(&head->lock);
2149 for(sk = head->chain; sk; sk = sk->next, num++) {
2150 if (!TCP_INET_FAMILY(sk->family))
2151 continue;
2152 pos += 128;
2153 if (pos < offset)
2154 continue;
2155 get_tcp_sock(sk, tmpbuf, num);
2156 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2157 if(len >= length) {
2158 read_unlock(&head->lock);
2159 goto out;
2162 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2163 tw != NULL;
2164 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2165 if (!TCP_INET_FAMILY(tw->family))
2166 continue;
2167 pos += 128;
2168 if (pos < offset)
2169 continue;
2170 get_timewait_sock(tw, tmpbuf, num);
2171 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2172 if(len >= length) {
2173 read_unlock(&head->lock);
2174 goto out;
2177 read_unlock(&head->lock);
2180 out:
2181 local_bh_enable();
2182 out_no_bh:
2184 begin = len - (pos - offset);
2185 *start = buffer + begin;
2186 len -= begin;
2187 if(len > length)
2188 len = length;
2189 if (len < 0)
2190 len = 0;
2191 return len;
2194 struct proto tcp_prot = {
2195 tcp_close, /* close */
2196 tcp_v4_connect, /* connect */
2197 tcp_disconnect, /* disconnect */
2198 tcp_accept, /* accept */
2199 NULL, /* retransmit */
2200 tcp_write_wakeup, /* write_wakeup */
2201 tcp_read_wakeup, /* read_wakeup */
2202 tcp_poll, /* poll */
2203 tcp_ioctl, /* ioctl */
2204 tcp_v4_init_sock, /* init */
2205 tcp_v4_destroy_sock, /* destroy */
2206 tcp_shutdown, /* shutdown */
2207 tcp_setsockopt, /* setsockopt */
2208 tcp_getsockopt, /* getsockopt */
2209 tcp_v4_sendmsg, /* sendmsg */
2210 tcp_recvmsg, /* recvmsg */
2211 NULL, /* bind */
2212 tcp_v4_do_rcv, /* backlog_rcv */
2213 tcp_v4_hash, /* hash */
2214 tcp_unhash, /* unhash */
2215 tcp_v4_get_port, /* get_port */
2216 128, /* max_header */
2217 0, /* retransmits */
2218 "TCP", /* name */
2219 0, /* inuse */
2220 0 /* highestinuse */
2225 void __init tcp_v4_init(struct net_proto_family *ops)
2227 int err;
2229 tcp_inode.i_mode = S_IFSOCK;
2230 tcp_inode.i_sock = 1;
2231 tcp_inode.i_uid = 0;
2232 tcp_inode.i_gid = 0;
2233 init_waitqueue_head(&tcp_inode.i_wait);
2234 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2236 tcp_socket->inode = &tcp_inode;
2237 tcp_socket->state = SS_UNCONNECTED;
2238 tcp_socket->type=SOCK_RAW;
2240 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2241 panic("Failed to create the TCP control socket.\n");
2242 tcp_socket->sk->allocation=GFP_ATOMIC;
2243 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2245 /* Unhash it so that IP input processing does not even
2246 * see it, we do not wish this socket to see incoming
2247 * packets.
2249 tcp_socket->sk->prot->unhash(tcp_socket->sk);