net/ipv4/tcp_ipv4.c (davej-history.git, Import 2.3.41pre2)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.197 2000/01/21 06:37:28 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/init.h>
56 #include <net/icmp.h>
57 #include <net/tcp.h>
58 #include <net/ipv6.h>
59 #include <net/inet_common.h>
61 #include <linux/inet.h>
62 #include <linux/stddef.h>
63 #include <linux/ipsec.h>
65 extern int sysctl_ip_dynaddr;
67 /* Check TCP sequence numbers in ICMP packets. */
68 #define ICMP_MIN_LENGTH 8
70 /* Socket used for sending RSTs */
71 struct inode tcp_inode;
72 struct socket *tcp_socket=&tcp_inode.u.socket_i;
74 static void tcp_v4_send_reset(struct sk_buff *skb);
76 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
77 struct sk_buff *skb);
79 /* This is for sockets with full identity only. Sockets here will always
80 * be without wildcards and will have the following invariant:
81 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
83 * First half of the table is for sockets not in TIME_WAIT, second half
84 * is for TIME_WAIT sockets only.
86 struct tcp_ehash_bucket *tcp_ehash = NULL;
88 /* Ok, let's try this, I give up, we do need a local binding
89 * TCP hash as well as the others for fast bind/connect.
91 struct tcp_bind_hashbucket *tcp_bhash = NULL;
93 int tcp_bhash_size = 0;
94 int tcp_ehash_size = 0;
96 /* All sockets in TCP_LISTEN state will be in here. This is the only table
97 * where wildcard'd TCP sockets can exist. Hash function here is just local
98 * port number.
100 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE] = { NULL, };
101 char __tcp_clean_cacheline_pad[(SMP_CACHE_BYTES -
102 (((sizeof(void *) * (TCP_LHTABLE_SIZE + 2)) +
103 (sizeof(int) * 2)) % SMP_CACHE_BYTES))] = { 0, };
105 rwlock_t tcp_lhash_lock = RW_LOCK_UNLOCKED;
106 atomic_t tcp_lhash_users = ATOMIC_INIT(0);
107 DECLARE_WAIT_QUEUE_HEAD(tcp_lhash_wait);
109 spinlock_t tcp_portalloc_lock = SPIN_LOCK_UNLOCKED;
112 * This array holds the first and last local port number.
113 * For high-usage systems, use sysctl to change this to
114 * 32768-61000
116 int sysctl_local_port_range[2] = { 1024, 4999 };
117 int tcp_port_rover = (1024 - 1);
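/*
 * sysctl_local_port_range above is exposed to userspace as
 * net.ipv4.ip_local_port_range, i.e. /proc/sys/net/ipv4/ip_local_port_range.
 * A minimal userspace sketch of widening the range as the comment above
 * suggests; illustrative only (and it needs root), not part of this file.
 */
#if 0	/* Illustrative userspace sketch, not kernel code. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "w");

	if (f == NULL) {
		perror("ip_local_port_range");
		return 1;
	}
	/* Widen the ephemeral port range for high-usage systems. */
	fprintf(f, "32768 61000\n");
	fclose(f);
	return 0;
}
#endif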
119 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
120 __u32 faddr, __u16 fport)
122 int h = ((laddr ^ lport) ^ (faddr ^ fport));
123 h ^= h>>16;
124 h ^= h>>8;
125 return h & (tcp_ehash_size - 1);
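/*
 * A standalone sketch of the XOR-fold used by tcp_hashfn() above. It assumes,
 * as the kernel does, that the table size is a power of two so the final mask
 * selects a bucket directly. Names and sample values are illustrative.
 */
#if 0	/* Illustrative userspace sketch of the XOR-fold above. */
#include <stdio.h>

static unsigned int ehash_fold(unsigned int laddr, unsigned short lport,
			       unsigned int faddr, unsigned short fport,
			       unsigned int ehash_size)
{
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;
	h ^= h >> 8;
	return h & (ehash_size - 1);	/* ehash_size must be a power of two */
}

int main(void)
{
	/* 127.0.0.1:12345 -> 127.0.0.1:80 with a 256-bucket table. */
	printf("bucket = %u\n",
	       ehash_fold(0x7f000001, 12345, 0x7f000001, 80, 256));
	return 0;
}
#endif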
128 static __inline__ int tcp_sk_hashfn(struct sock *sk)
130 __u32 laddr = sk->rcv_saddr;
131 __u16 lport = sk->num;
132 __u32 faddr = sk->daddr;
133 __u16 fport = sk->dport;
135 return tcp_hashfn(laddr, lport, faddr, fport);
138 /* Allocate and initialize a new TCP local port bind bucket.
139 * The bindhash mutex for snum's hash chain must be held here.
141 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
142 unsigned short snum)
144 struct tcp_bind_bucket *tb;
146 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
147 if(tb != NULL) {
148 tb->port = snum;
149 tb->fastreuse = 0;
150 tb->owners = NULL;
151 if((tb->next = head->chain) != NULL)
152 tb->next->pprev = &tb->next;
153 head->chain = tb;
154 tb->pprev = &head->chain;
156 return tb;
159 /* Caller must disable local BH processing. */
160 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
162 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
163 struct tcp_bind_bucket *tb;
165 spin_lock(&head->lock);
166 tb = (struct tcp_bind_bucket *)sk->prev;
167 if ((child->bind_next = tb->owners) != NULL)
168 tb->owners->bind_pprev = &child->bind_next;
169 tb->owners = child;
170 child->bind_pprev = &tb->owners;
171 child->prev = (struct sock *) tb;
172 spin_unlock(&head->lock);
175 __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
177 local_bh_disable();
178 __tcp_inherit_port(sk, child);
179 local_bh_enable();
182 /* Obtain a reference to a local port for the given sock,
183 * if snum is zero it means select any available local port.
185 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
187 struct tcp_bind_hashbucket *head;
188 struct tcp_bind_bucket *tb;
189 int ret;
191 local_bh_disable();
192 if (snum == 0) {
193 int low = sysctl_local_port_range[0];
194 int high = sysctl_local_port_range[1];
195 int remaining = (high - low) + 1;
196 int rover;
198 spin_lock(&tcp_portalloc_lock);
199 rover = tcp_port_rover;
200 do { rover++;
201 if ((rover < low) || (rover > high))
202 rover = low;
203 head = &tcp_bhash[tcp_bhashfn(rover)];
204 spin_lock(&head->lock);
205 for (tb = head->chain; tb; tb = tb->next)
206 if (tb->port == rover)
207 goto next;
208 break;
209 next:
210 spin_unlock(&head->lock);
211 } while (--remaining > 0);
212 tcp_port_rover = rover;
213 spin_unlock(&tcp_portalloc_lock);
215 /* Exhausted local port range during search? */
216 ret = 1;
217 if (remaining <= 0)
218 goto fail;
220 /* OK, here is the one we will use. HEAD is
221 * non-NULL and we hold its mutex.
223 snum = rover;
224 tb = NULL;
225 } else {
226 head = &tcp_bhash[tcp_bhashfn(snum)];
227 spin_lock(&head->lock);
228 for (tb = head->chain; tb != NULL; tb = tb->next)
229 if (tb->port == snum)
230 break;
232 if (tb != NULL && tb->owners != NULL) {
233 if (tb->fastreuse != 0 && sk->reuse != 0) {
234 goto success;
235 } else {
236 struct sock *sk2 = tb->owners;
237 int sk_reuse = sk->reuse;
239 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
240 if (sk->bound_dev_if == sk2->bound_dev_if) {
241 if (!sk_reuse ||
242 !sk2->reuse ||
243 sk2->state == TCP_LISTEN) {
244 if (!sk2->rcv_saddr ||
245 !sk->rcv_saddr ||
246 (sk2->rcv_saddr == sk->rcv_saddr))
247 break;
251 /* If we found a conflict, fail. */
252 ret = 1;
253 if (sk2 != NULL)
254 goto fail_unlock;
257 ret = 1;
258 if (tb == NULL &&
259 (tb = tcp_bucket_create(head, snum)) == NULL)
260 goto fail_unlock;
261 if (tb->owners == NULL) {
262 if (sk->reuse && sk->state != TCP_LISTEN)
263 tb->fastreuse = 1;
264 else
265 tb->fastreuse = 0;
266 } else if (tb->fastreuse &&
267 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
268 tb->fastreuse = 0;
269 success:
270 sk->num = snum;
271 if ((sk->bind_next = tb->owners) != NULL)
272 tb->owners->bind_pprev = &sk->bind_next;
273 tb->owners = sk;
274 sk->bind_pprev = &tb->owners;
275 sk->prev = (struct sock *) tb;
276 ret = 0;
278 fail_unlock:
279 spin_unlock(&head->lock);
280 fail:
281 local_bh_enable();
282 return ret;
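/*
 * A simplified userspace model of the rover search in tcp_v4_get_port()
 * above: start just after the last allocation, wrap at the top of the range,
 * and give up once the whole range has been scanned. Locking, bind-bucket
 * sharing and SO_REUSEADDR conflicts are omitted; the names here are
 * illustrative only.
 */
#if 0	/* Illustrative userspace sketch of the port rover above. */
#include <stdio.h>

#define PORT_LOW	1024
#define PORT_HIGH	4999

static unsigned char port_in_use[65536];	/* stand-in for the bind hash */
static int port_rover = PORT_LOW - 1;

/* Returns a free port in [PORT_LOW, PORT_HIGH], or -1 if exhausted. */
static int pick_local_port(void)
{
	int remaining = PORT_HIGH - PORT_LOW + 1;
	int rover = port_rover;

	do {
		rover++;
		if (rover < PORT_LOW || rover > PORT_HIGH)
			rover = PORT_LOW;
		if (!port_in_use[rover]) {
			port_in_use[rover] = 1;
			port_rover = rover;
			return rover;
		}
	} while (--remaining > 0);

	return -1;	/* local port range exhausted */
}

int main(void)
{
	printf("first port: %d\n", pick_local_port());
	printf("second port: %d\n", pick_local_port());
	return 0;
}
#endif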
285 /* Get rid of any references to a local port held by the
286 * given sock.
288 __inline__ void __tcp_put_port(struct sock *sk)
290 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
291 struct tcp_bind_bucket *tb;
293 spin_lock(&head->lock);
294 tb = (struct tcp_bind_bucket *) sk->prev;
295 if (sk->bind_next)
296 sk->bind_next->bind_pprev = sk->bind_pprev;
297 *(sk->bind_pprev) = sk->bind_next;
298 sk->prev = NULL;
299 if (tb->owners == NULL) {
300 if (tb->next)
301 tb->next->pprev = tb->pprev;
302 *(tb->pprev) = tb->next;
303 kmem_cache_free(tcp_bucket_cachep, tb);
305 spin_unlock(&head->lock);
308 void tcp_put_port(struct sock *sk)
310 local_bh_disable();
311 __tcp_put_port(sk);
312 local_bh_enable();
315 /* This lock without TASK_EXCLUSIVE is good on UP and it can be very bad on SMP.
316 * Look, when several writers sleep and a reader wakes them up, all but one
317 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
318 * this, _but_ remember, it adds useless work on UP machines (a wakeup on each
319 * exclusive lock release). It should really be ifdefed.
322 void tcp_listen_wlock(void)
324 write_lock(&tcp_lhash_lock);
326 if (atomic_read(&tcp_lhash_users)) {
327 DECLARE_WAITQUEUE(wait, current);
329 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
330 for (;;) {
331 set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE);
332 if (atomic_read(&tcp_lhash_users) == 0)
333 break;
334 write_unlock_bh(&tcp_lhash_lock);
335 schedule();
336 write_lock_bh(&tcp_lhash_lock);
339 __set_current_state(TASK_RUNNING);
340 remove_wait_queue(&tcp_lhash_wait, &wait);
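/*
 * A userspace analogue of the scheme above, assuming POSIX threads: writers
 * sleep until the reader count (cf. tcp_lhash_users) drops to zero, and
 * sleepers are woken one at a time (pthread_cond_signal), which is the moral
 * equivalent of the exclusive wakeup discussed in the comment. A sketch of
 * the idea only, not how the kernel lock is implemented.
 */
#if 0	/* Illustrative pthread sketch; not part of this file. */
#include <pthread.h>

static pthread_mutex_t lhash_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  lhash_cond  = PTHREAD_COND_INITIALIZER;
static int lhash_users;				/* analogous to tcp_lhash_users */

static void listen_rlock(void)
{
	pthread_mutex_lock(&lhash_mutex);
	lhash_users++;
	pthread_mutex_unlock(&lhash_mutex);
}

static void listen_runlock(void)
{
	pthread_mutex_lock(&lhash_mutex);
	if (--lhash_users == 0)
		pthread_cond_signal(&lhash_cond);	/* wake one waiter, not all */
	pthread_mutex_unlock(&lhash_mutex);
}

/* Writer: returns with lhash_mutex held and no readers active. */
static void listen_wlock(void)
{
	pthread_mutex_lock(&lhash_mutex);
	while (lhash_users != 0)
		pthread_cond_wait(&lhash_cond, &lhash_mutex);
}

static void listen_wunlock(void)
{
	pthread_cond_signal(&lhash_cond);	/* let the next queued writer in */
	pthread_mutex_unlock(&lhash_mutex);
}
#endif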
344 static __inline__ void __tcp_v4_hash(struct sock *sk)
346 struct sock **skp;
347 rwlock_t *lock;
349 BUG_TRAP(sk->pprev==NULL);
350 if(sk->state == TCP_LISTEN) {
351 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
352 lock = &tcp_lhash_lock;
353 tcp_listen_wlock();
354 } else {
355 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
356 lock = &tcp_ehash[sk->hashent].lock;
357 write_lock(lock);
359 if((sk->next = *skp) != NULL)
360 (*skp)->pprev = &sk->next;
361 *skp = sk;
362 sk->pprev = skp;
363 sock_prot_inc_use(sk->prot);
364 write_unlock(lock);
365 if (sk->state == TCP_LISTEN)
366 wake_up(&tcp_lhash_wait);
369 static void tcp_v4_hash(struct sock *sk)
371 if (sk->state != TCP_CLOSE) {
372 local_bh_disable();
373 __tcp_v4_hash(sk);
374 local_bh_enable();
378 void tcp_unhash(struct sock *sk)
380 rwlock_t *lock;
382 if (sk->state == TCP_LISTEN) {
383 local_bh_disable();
384 tcp_listen_wlock();
385 lock = &tcp_lhash_lock;
386 } else {
387 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
388 lock = &head->lock;
389 write_lock_bh(&head->lock);
392 if(sk->pprev) {
393 if(sk->next)
394 sk->next->pprev = sk->pprev;
395 *sk->pprev = sk->next;
396 sk->pprev = NULL;
397 sock_prot_dec_use(sk->prot);
399 write_unlock_bh(lock);
400 if (sk->state == TCP_LISTEN)
401 wake_up(&tcp_lhash_wait);
404 /* Don't inline this cruft. There are some nice properties to
405 * exploit here. The BSD API does not allow a listening TCP
406 * to specify the remote port nor the remote address for the
407 * connection. So always assume those are both wildcarded
408 * during the search since they can never be otherwise.
410 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
412 struct sock *result = NULL;
413 int score, hiscore;
415 hiscore=0;
416 for(; sk; sk = sk->next) {
417 if(sk->num == hnum) {
418 __u32 rcv_saddr = sk->rcv_saddr;
420 score = 1;
421 if(rcv_saddr) {
422 if (rcv_saddr != daddr)
423 continue;
424 score++;
426 if (sk->bound_dev_if) {
427 if (sk->bound_dev_if != dif)
428 continue;
429 score++;
431 if (score == 3)
432 return sk;
433 if (score > hiscore) {
434 hiscore = score;
435 result = sk;
439 return result;
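/*
 * A self-contained sketch of the scoring used by __tcp_v4_lookup_listener()
 * above: an exact local-address match and a matching bound device each add a
 * point, a mismatch disqualifies, and a perfect score (3) wins immediately.
 * The struct and the candidate array are illustrative, not kernel types.
 */
#if 0	/* Illustrative sketch of the listener scoring above. */
struct listener {
	unsigned short port;		/* local port (host order) */
	unsigned int   rcv_saddr;	/* 0 means "any address" */
	int            bound_dev_if;	/* 0 means "any device" */
};

static struct listener *best_listener(struct listener *tbl, int n,
				      unsigned int daddr, unsigned short hnum,
				      int dif)
{
	struct listener *result = NULL;
	int hiscore = 0, i;

	for (i = 0; i < n; i++) {
		struct listener *l = &tbl[i];
		int score;

		if (l->port != hnum)
			continue;
		score = 1;
		if (l->rcv_saddr) {
			if (l->rcv_saddr != daddr)
				continue;
			score++;
		}
		if (l->bound_dev_if) {
			if (l->bound_dev_if != dif)
				continue;
			score++;
		}
		if (score == 3)
			return l;	/* fully specified: best possible */
		if (score > hiscore) {
			hiscore = score;
			result = l;
		}
	}
	return result;
}
#endif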
442 /* Optimize the common listener case. */
443 __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
445 struct sock *sk;
447 read_lock(&tcp_lhash_lock);
448 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
449 if (sk) {
450 if (sk->num == hnum &&
451 sk->next == NULL &&
452 (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
453 !sk->bound_dev_if)
454 goto sherry_cache;
455 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
457 if (sk) {
458 sherry_cache:
459 sock_hold(sk);
461 read_unlock(&tcp_lhash_lock);
462 return sk;
465 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
466 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
468 * Local BH must be disabled here.
471 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
472 u32 daddr, u16 hnum, int dif)
474 struct tcp_ehash_bucket *head;
475 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
476 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
477 struct sock *sk;
478 int hash;
480 /* Optimize here for direct hit, only listening connections can
481 * have wildcards anyway.
483 hash = tcp_hashfn(daddr, hnum, saddr, sport);
484 head = &tcp_ehash[hash];
485 read_lock(&head->lock);
486 for(sk = head->chain; sk; sk = sk->next) {
487 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
488 goto hit; /* You sunk my battleship! */
491 /* Must check for a TIME_WAIT'er before going to listener hash. */
492 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
493 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
494 goto hit;
495 read_unlock(&head->lock);
497 return NULL;
499 hit:
500 sock_hold(sk);
501 read_unlock(&head->lock);
502 return sk;
505 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
506 u32 daddr, u16 hnum, int dif)
508 struct sock *sk;
510 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
512 if (sk)
513 return sk;
515 return tcp_v4_lookup_listener(daddr, hnum, dif);
518 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
520 struct sock *sk;
522 local_bh_disable();
523 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
524 local_bh_enable();
526 return sk;
529 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
531 return secure_tcp_sequence_number(skb->nh.iph->daddr,
532 skb->nh.iph->saddr,
533 skb->h.th->dest,
534 skb->h.th->source);
537 static int tcp_v4_check_established(struct sock *sk)
539 u32 daddr = sk->rcv_saddr;
540 u32 saddr = sk->daddr;
541 int dif = sk->bound_dev_if;
542 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
543 __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
544 int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
545 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
546 struct sock *sk2, **skp;
547 struct tcp_tw_bucket *tw;
549 write_lock_bh(&head->lock);
551 /* Check TIME-WAIT sockets first. */
552 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
553 skp = &sk2->next) {
554 tw = (struct tcp_tw_bucket*)sk2;
556 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
557 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
559 /* With PAWS, it is safe from the viewpoint
560 of data integrity. Even without PAWS it
561 is safe provided sequence spaces do not
562 overlap, i.e. at data rates <= 80 Mbit/sec.
564 Actually, the idea is close to VJ's one,
565 only the timestamp cache is held not per host,
566 but per port pair, and the TW bucket is used
567 as the state holder.
569 If the TW bucket has already been destroyed, we
570 fall back to VJ's scheme and use the initial
571 timestamp retrieved from the peer table.
573 if (tw->substate == TCP_TIME_WAIT &&
574 sysctl_tcp_tw_recycle && tw->ts_recent_stamp) {
575 if ((tp->write_seq = tw->snd_nxt + 2) == 0)
576 tp->write_seq = 1;
577 tp->ts_recent = tw->ts_recent;
578 tp->ts_recent_stamp = tw->ts_recent_stamp;
579 sock_hold(sk2);
580 skp = &head->chain;
581 goto unique;
582 } else
583 goto not_unique;
586 tw = NULL;
588 /* And established part... */
589 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
590 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
591 goto not_unique;
594 unique:
595 BUG_TRAP(sk->pprev==NULL);
596 if ((sk->next = *skp) != NULL)
597 (*skp)->pprev = &sk->next;
599 *skp = sk;
600 sk->pprev = skp;
601 sock_prot_inc_use(sk->prot);
602 write_unlock_bh(&head->lock);
604 if (tw) {
605 /* Silly. Should hash-dance instead... */
606 local_bh_disable();
607 tcp_tw_deschedule(tw);
608 tcp_timewait_kill(tw);
609 NET_INC_STATS_BH(TimeWaitRecycled);
610 local_bh_enable();
612 tcp_tw_put(tw);
615 return 0;
617 not_unique:
618 write_unlock_bh(&head->lock);
619 return -EADDRNOTAVAIL;
622 /* Hash SYN-SENT socket to established hash table after
623 * checking that it is unique. Note that without the kernel lock
624 * we MUST make these two operations atomic.
626 * Optimization: if it is bound and the tcp_bind_bucket has the only
627 * owner (us), we need not scan the established bucket.
630 int tcp_v4_hash_connecting(struct sock *sk)
632 unsigned short snum = sk->num;
633 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
634 struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
636 spin_lock_bh(&head->lock);
637 if (tb->owners == sk && sk->bind_next == NULL) {
638 __tcp_v4_hash(sk);
639 spin_unlock_bh(&head->lock);
640 return 0;
641 } else {
642 spin_unlock_bh(&head->lock);
644 /* No definite answer... Walk the established hash table. */
645 return tcp_v4_check_established(sk);
649 /* This will initiate an outgoing connection. */
650 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
652 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
653 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
654 struct sk_buff *buff;
655 struct rtable *rt;
656 u32 daddr, nexthop;
657 int tmp;
658 int err;
660 if (addr_len < sizeof(struct sockaddr_in))
661 return(-EINVAL);
663 if (usin->sin_family != AF_INET)
664 return(-EAFNOSUPPORT);
666 nexthop = daddr = usin->sin_addr.s_addr;
667 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
668 if (daddr == 0)
669 return -EINVAL;
670 nexthop = sk->protinfo.af_inet.opt->faddr;
673 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
674 RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
675 if (tmp < 0)
676 return tmp;
678 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
679 ip_rt_put(rt);
680 return -ENETUNREACH;
683 __sk_dst_set(sk, &rt->u.dst);
685 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
686 daddr = rt->rt_dst;
688 err = -ENOBUFS;
689 buff = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 0, GFP_KERNEL);
691 if (buff == NULL)
692 goto failure;
694 if (!sk->saddr)
695 sk->saddr = rt->rt_src;
696 sk->rcv_saddr = sk->saddr;
698 if (tp->ts_recent_stamp && sk->daddr != daddr) {
699 /* Reset inherited state */
700 tp->ts_recent = 0;
701 tp->ts_recent_stamp = 0;
702 tp->write_seq = 0;
705 if (sysctl_tcp_tw_recycle &&
706 !tp->ts_recent_stamp &&
707 rt->rt_dst == daddr) {
708 struct inet_peer *peer = rt_get_peer(rt);
710 /* VJ's idea. We save the last timestamp seen from
711 * the destination in the peer table when entering TIME-WAIT state,
712 * and initialize ts_recent from it when trying a new connection.
715 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
716 tp->ts_recent_stamp = peer->tcp_ts_stamp;
717 tp->ts_recent = peer->tcp_ts;
721 sk->dport = usin->sin_port;
722 sk->daddr = daddr;
724 if (!tp->write_seq)
725 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
726 sk->sport, usin->sin_port);
728 tp->ext_header_len = 0;
729 if (sk->protinfo.af_inet.opt)
730 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
732 tp->mss_clamp = 536;
734 err = tcp_connect(sk, buff);
735 if (err == 0)
736 return 0;
738 failure:
739 __sk_dst_reset(sk);
740 sk->dport = 0;
741 return err;
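/*
 * For reference, the userspace sequence that lands in tcp_v4_connect()
 * above: optionally bind() to pick a source address/port (which is what
 * tcp_v4_get_port() and tcp_v4_hash_connecting() later have to reconcile),
 * then connect(). A minimal sketch; the destination address is a made-up
 * TEST-NET example and error handling is compressed.
 */
#if 0	/* Illustrative userspace counterpart of the connect path above. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(80);			/* example destination */
	dst.sin_addr.s_addr = inet_addr("192.0.2.1");	/* TEST-NET address */

	/* This triggers tcp_v4_connect(): route lookup, source address and
	 * port selection, ISN choice, and the first SYN. */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");

	close(fd);
	return 0;
}
#endif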
744 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
746 return ((struct rtable*)skb->dst)->rt_iif;
749 static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
751 unsigned h = raddr ^ rport;
752 h ^= h>>16;
753 h ^= h>>8;
754 return h&(TCP_SYNQ_HSIZE-1);
757 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
758 struct iphdr *iph,
759 struct tcphdr *th,
760 struct open_request ***prevp)
762 struct tcp_listen_opt *lopt = tp->listen_opt;
763 struct open_request *req, **prev;
764 __u16 rport = th->source;
765 __u32 raddr = iph->saddr;
767 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
768 (req = *prev) != NULL;
769 prev = &req->dl_next) {
770 if (req->rmt_port == rport &&
771 req->af.v4_req.rmt_addr == raddr &&
772 req->af.v4_req.loc_addr == iph->daddr &&
773 TCP_INET_FAMILY(req->class->family)) {
774 BUG_TRAP(req->sk == NULL);
775 *prevp = prev;
776 return req;
780 return NULL;
783 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
785 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
786 struct tcp_listen_opt *lopt = tp->listen_opt;
787 unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
789 req->expires = jiffies + TCP_TIMEOUT_INIT;
790 req->retrans = 0;
791 req->sk = NULL;
792 req->index = h;
793 req->dl_next = lopt->syn_table[h];
795 write_lock(&tp->syn_wait_lock);
796 lopt->syn_table[h] = req;
797 write_unlock(&tp->syn_wait_lock);
799 tcp_synq_added(sk);
804 * This routine does path mtu discovery as defined in RFC1191.
806 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
808 struct dst_entry *dst;
809 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
811 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
812 * sent out by Linux are always < 576 bytes, so they should go through
813 * unfragmented).
815 if (sk->state == TCP_LISTEN)
816 return;
818 /* We don't check in the dst entry whether PMTU discovery is forbidden
819 * on this route. We just assume that no packet-too-big packets
820 * are sent back when PMTU discovery is not active.
821 * There is a small race when the user changes this flag in the
822 * route, but I think that's acceptable.
824 if ((dst = __sk_dst_check(sk, 0)) == NULL)
825 return;
827 ip_rt_update_pmtu(dst, mtu);
829 /* Something is about to go wrong... Remember the soft error
830 * in case this connection is not able to recover.
832 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
833 sk->err_soft = EMSGSIZE;
835 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
836 tp->pmtu_cookie > dst->pmtu) {
837 tcp_sync_mss(sk, dst->pmtu);
839 /* Resend the TCP packet because it's
840 * clear that the old packet has been
841 * dropped. This is the new "fast" path mtu
842 * discovery.
844 tcp_simple_retransmit(sk);
845 } /* else let the usual retransmit timer handle it */
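/*
 * The per-socket pmtudisc setting consulted above
 * (sk->protinfo.af_inet.pmtudisc) is controlled from userspace with the
 * IP_MTU_DISCOVER socket option, and the discovered value can be read back
 * with IP_MTU on a connected socket. A hedged sketch using the standard
 * Linux constants from <netinet/in.h>; 'fd' is assumed to be a connected
 * TCP socket.
 */
#if 0	/* Illustrative userspace sketch of the PMTU discovery knobs. */
#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>

static void show_pmtu(int fd)
{
	int val = IP_PMTUDISC_DO;	/* always set DF, rely on PMTU discovery */
	int mtu;
	socklen_t len = sizeof(mtu);

	if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val)) < 0)
		perror("IP_MTU_DISCOVER");

	if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
		printf("current path MTU: %d\n", mtu);
	else
		perror("IP_MTU");
}
#endif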
849 * This routine is called by the ICMP module when it gets some
850 * sort of error condition. If err < 0 then the socket should
851 * be closed and the error returned to the user. If err > 0
852 * it's just the icmp type << 8 | icmp code. After adjustment
853 * header points to the first 8 bytes of the tcp header. We need
854 * to find the appropriate port.
856 * The locking strategy used here is very "optimistic". When
857 * someone else accesses the socket the ICMP is just dropped
858 * and for some paths there is no check at all.
859 * A more general error queue to queue errors for later handling
860 * is probably better.
864 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
866 struct iphdr *iph = (struct iphdr*)dp;
867 struct tcphdr *th;
868 struct tcp_opt *tp;
869 int type = skb->h.icmph->type;
870 int code = skb->h.icmph->code;
871 #if ICMP_MIN_LENGTH < 14
872 int no_flags = 0;
873 #else
874 #define no_flags 0
875 #endif
876 struct sock *sk;
877 __u32 seq;
878 int err;
880 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
881 ICMP_INC_STATS_BH(IcmpInErrors);
882 return;
884 #if ICMP_MIN_LENGTH < 14
885 if (len < (iph->ihl << 2) + 14)
886 no_flags = 1;
887 #endif
889 th = (struct tcphdr*)(dp+(iph->ihl<<2));
891 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
892 if (sk == NULL) {
893 ICMP_INC_STATS_BH(IcmpInErrors);
894 return;
896 if (sk->state == TCP_TIME_WAIT) {
897 tcp_tw_put((struct tcp_tw_bucket*)sk);
898 return;
901 bh_lock_sock(sk);
902 /* If too many ICMPs get dropped on busy
903 * servers this needs to be solved differently.
905 if (sk->lock.users != 0)
906 NET_INC_STATS_BH(LockDroppedIcmps);
908 if (sk->state == TCP_CLOSE)
909 goto out;
911 tp = &sk->tp_pinfo.af_tcp;
912 seq = ntohl(th->seq);
913 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
914 NET_INC_STATS(OutOfWindowIcmps);
915 goto out;
918 switch (type) {
919 case ICMP_SOURCE_QUENCH:
920 /* This is deprecated, but if someone generated it,
921 * we have no reason to ignore it.
923 if (sk->lock.users == 0)
924 tcp_enter_cong_avoid(tp);
925 goto out;
926 case ICMP_PARAMETERPROB:
927 err = EPROTO;
928 break;
929 case ICMP_DEST_UNREACH:
930 if (code > NR_ICMP_UNREACH)
931 goto out;
933 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
934 if (sk->lock.users == 0)
935 do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
936 goto out;
939 err = icmp_err_convert[code].errno;
940 break;
941 case ICMP_TIME_EXCEEDED:
942 err = EHOSTUNREACH;
943 break;
944 default:
945 goto out;
948 switch (sk->state) {
949 struct open_request *req, **prev;
950 case TCP_LISTEN:
951 if (sk->lock.users != 0)
952 goto out;
954 /* The final ACK of the handshake should already be
955 * handled in the new socket context, not here.
956 * Strictly speaking - an ICMP error for the final
957 * ACK should set the opening flag, but that is too
958 * complicated right now.
960 if (!no_flags && !th->syn && !th->ack)
961 goto out;
963 req = tcp_v4_search_req(tp, iph, th, &prev);
964 if (!req)
965 goto out;
967 /* ICMPs are not backlogged, hence we cannot get
968 an established socket here.
970 BUG_TRAP(req->sk == NULL);
972 if (seq != req->snt_isn) {
973 NET_INC_STATS_BH(OutOfWindowIcmps);
974 goto out;
978 * Still in SYN_RECV, just remove it silently.
979 * There is no good way to pass the error to the newly
980 * created socket, and POSIX does not want network
981 * errors returned from accept().
983 tcp_synq_drop(sk, req, prev);
984 goto out;
986 case TCP_SYN_SENT:
987 case TCP_SYN_RECV: /* Cannot happen.
988 It can, e.g., if SYNs crossed.
990 if (!no_flags && !th->syn)
991 goto out;
992 if (sk->lock.users == 0) {
993 TCP_INC_STATS_BH(TcpAttemptFails);
994 sk->err = err;
996 sk->error_report(sk);
998 tcp_done(sk);
999 } else {
1000 sk->err_soft = err;
1002 goto out;
1005 /* If we've already connected we will keep trying
1006 * until we time out, or the user gives up.
1008 * RFC 1122 4.2.3.9 allows us to consider as hard errors
1009 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1010 * but it is obsoleted by PMTU discovery).
1012 * Note that in the modern internet, where routing is unreliable
1013 * and broken firewalls sit in every dark corner, sending random
1014 * errors ordered by their masters, even these two messages finally lose
1015 * their original sense (even Linux sends invalid PORT_UNREACHs).
1017 * Now we are in compliance with RFCs.
1018 * --ANK (980905)
1021 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1022 sk->err = err;
1023 sk->error_report(sk);
1024 } else { /* Only an error on timeout */
1025 sk->err_soft = err;
1028 out:
1029 bh_unlock_sock(sk);
1030 sock_put(sk);
1033 /* This routine computes an IPv4 TCP checksum. */
1034 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1035 struct sk_buff *skb)
1037 th->check = 0;
1038 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1039 csum_partial((char *)th, th->doff<<2, skb->csum));
1043 * This routine will send an RST to the other tcp.
1045 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1046 * for reset.
1047 * Answer: if a packet caused an RST, it is not for a socket
1048 * existing in our system; if it is matched to a socket,
1049 * it is just a duplicate segment or a bug in the other side's TCP.
1050 * So we build the reply based only on the parameters
1051 * that arrived with the segment.
1052 * Exception: precedence violation. We do not implement it in any case.
1055 static void tcp_v4_send_reset(struct sk_buff *skb)
1057 struct tcphdr *th = skb->h.th;
1058 struct tcphdr rth;
1059 struct ip_reply_arg arg;
1061 /* Never send a reset in response to a reset. */
1062 if (th->rst)
1063 return;
1065 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1066 return;
1068 /* Swap the send and the receive. */
1069 memset(&rth, 0, sizeof(struct tcphdr));
1070 rth.dest = th->source;
1071 rth.source = th->dest;
1072 rth.doff = sizeof(struct tcphdr)/4;
1073 rth.rst = 1;
1075 if (th->ack) {
1076 rth.seq = th->ack_seq;
1077 } else {
1078 rth.ack = 1;
1079 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1080 + skb->len - (th->doff<<2));
1083 memset(&arg, 0, sizeof arg);
1084 arg.iov[0].iov_base = (unsigned char *)&rth;
1085 arg.iov[0].iov_len = sizeof rth;
1086 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1087 skb->nh.iph->saddr, /*XXX*/
1088 sizeof(struct tcphdr),
1089 IPPROTO_TCP,
1090 0);
1091 arg.n_iov = 1;
1092 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1094 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1096 TCP_INC_STATS_BH(TcpOutSegs);
1097 TCP_INC_STATS_BH(TcpOutRsts);
1100 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1101 outside of socket context, is certainly ugly. What can I do?
1104 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1106 struct tcphdr *th = skb->h.th;
1107 struct {
1108 struct tcphdr th;
1109 u32 tsopt[3];
1110 } rep;
1111 struct ip_reply_arg arg;
1113 memset(&rep.th, 0, sizeof(struct tcphdr));
1114 memset(&arg, 0, sizeof arg);
1116 arg.iov[0].iov_base = (unsigned char *)&rep;
1117 arg.iov[0].iov_len = sizeof(rep.th);
1118 arg.n_iov = 1;
1119 if (ts) {
1120 rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
1121 (TCPOPT_NOP << 16) |
1122 (TCPOPT_TIMESTAMP << 8) |
1123 TCPOLEN_TIMESTAMP);
1124 rep.tsopt[1] = htonl(tcp_time_stamp);
1125 rep.tsopt[2] = htonl(ts);
1126 arg.iov[0].iov_len = sizeof(rep);
1129 /* Swap the send and the receive. */
1130 rep.th.dest = th->source;
1131 rep.th.source = th->dest;
1132 rep.th.doff = arg.iov[0].iov_len/4;
1133 rep.th.seq = htonl(seq);
1134 rep.th.ack_seq = htonl(ack);
1135 rep.th.ack = 1;
1136 rep.th.window = htons(win);
1138 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1139 skb->nh.iph->saddr, /*XXX*/
1140 arg.iov[0].iov_len,
1141 IPPROTO_TCP,
1143 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1145 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1147 TCP_INC_STATS_BH(TcpOutSegs);
1150 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1152 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1154 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1155 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1157 tcp_tw_put(tw);
1160 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1162 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1163 req->ts_recent);
1166 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1168 struct rtable *rt;
1169 struct ip_options *opt;
1171 opt = req->af.v4_req.opt;
1172 if(ip_route_output(&rt, ((opt && opt->srr) ?
1173 opt->faddr :
1174 req->af.v4_req.rmt_addr),
1175 req->af.v4_req.loc_addr,
1176 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1177 sk->bound_dev_if)) {
1178 IP_INC_STATS_BH(IpOutNoRoutes);
1179 return NULL;
1181 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1182 ip_rt_put(rt);
1183 IP_INC_STATS_BH(IpOutNoRoutes);
1184 return NULL;
1186 return &rt->u.dst;
1190 * Send a SYN-ACK after having received a SYN.
1191 * This still operates on an open_request only, not on a big
1192 * socket.
1194 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1195 struct dst_entry *dst)
1197 int err = -1;
1198 struct sk_buff * skb;
1200 /* First, grab a route. */
1201 if (dst == NULL &&
1202 (dst = tcp_v4_route_req(sk, req)) == NULL)
1203 goto out;
1205 skb = tcp_make_synack(sk, dst, req);
1207 if (skb) {
1208 struct tcphdr *th = skb->h.th;
1210 th->check = tcp_v4_check(th, skb->len,
1211 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1212 csum_partial((char *)th, skb->len, skb->csum));
1214 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1215 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1216 if (err == NET_XMIT_CN)
1217 err = 0;
1220 out:
1221 dst_release(dst);
1222 return err;
1226 * IPv4 open_request destructor.
1228 static void tcp_v4_or_free(struct open_request *req)
1230 if (req->af.v4_req.opt)
1231 kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
1234 static inline void syn_flood_warning(struct sk_buff *skb)
1236 static unsigned long warntime;
1238 if (jiffies - warntime > HZ*60) {
1239 warntime = jiffies;
1240 printk(KERN_INFO
1241 "possible SYN flooding on port %d. Sending cookies.\n",
1242 ntohs(skb->h.th->dest));
1247 * Save and compile IPv4 options into the open_request if needed.
1249 static inline struct ip_options *
1250 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1252 struct ip_options *opt = &(IPCB(skb)->opt);
1253 struct ip_options *dopt = NULL;
1255 if (opt && opt->optlen) {
1256 int opt_size = optlength(opt);
1257 dopt = kmalloc(opt_size, GFP_ATOMIC);
1258 if (dopt) {
1259 if (ip_options_echo(dopt, skb)) {
1260 kfree_s(dopt, opt_size);
1261 dopt = NULL;
1265 return dopt;
1269 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1270 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1271 * It would be better to replace it with a global counter for all sockets,
1272 * but then some measure against one socket starving all other sockets
1273 * would be needed.
1275 * It was 128 by default. Experiments with real servers show that
1276 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1277 * of the problems. This value is adjusted to 128 for very small machines
1278 * (<= 32 MB of memory) and to 1024 on normal or better ones (>= 256 MB).
1279 * Increasing it further requires changing the hash table size.
1281 int sysctl_max_syn_backlog = 256;
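/*
 * Both knobs used by tcp_v4_conn_request() below are exported through
 * procfs: sysctl_max_syn_backlog as /proc/sys/net/ipv4/tcp_max_syn_backlog
 * and sysctl_tcp_syncookies (only when CONFIG_SYN_COOKIES is set) as
 * /proc/sys/net/ipv4/tcp_syncookies. A small sketch that merely reads them;
 * illustrative only.
 */
#if 0	/* Illustrative userspace sketch: reading the SYN-queue knobs. */
#include <stdio.h>

static void show_sysctl(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (f == NULL)
		return;		/* e.g. tcp_syncookies absent without CONFIG_SYN_COOKIES */
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	fclose(f);
}

int main(void)
{
	show_sysctl("/proc/sys/net/ipv4/tcp_max_syn_backlog");
	show_sysctl("/proc/sys/net/ipv4/tcp_syncookies");
	return 0;
}
#endif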
1283 struct or_calltable or_ipv4 = {
1284 PF_INET,
1285 tcp_v4_send_synack,
1286 tcp_v4_or_send_ack,
1287 tcp_v4_or_free,
1288 tcp_v4_send_reset
1291 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1293 struct tcp_opt tp;
1294 struct open_request *req;
1295 struct tcphdr *th = skb->h.th;
1296 __u32 saddr = skb->nh.iph->saddr;
1297 __u32 daddr = skb->nh.iph->daddr;
1298 __u32 isn = TCP_SKB_CB(skb)->when;
1299 struct dst_entry *dst = NULL;
1300 #ifdef CONFIG_SYN_COOKIES
1301 int want_cookie = 0;
1302 #else
1303 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1304 #endif
1306 /* Never answer SYNs sent to broadcast or multicast. */
1307 if (((struct rtable *)skb->dst)->rt_flags &
1308 (RTCF_BROADCAST|RTCF_MULTICAST))
1309 goto drop;
1311 /* TW buckets are converted to open requests without
1312 * limitations; they conserve resources and the peer is
1313 * evidently a real one.
1315 if (tcp_synq_is_full(sk) && !isn) {
1316 #ifdef CONFIG_SYN_COOKIES
1317 if (sysctl_tcp_syncookies) {
1318 want_cookie = 1;
1319 } else
1320 #endif
1321 goto drop;
1324 /* Accept backlog is full. If we have already queued enough
1325 * warm entries in the SYN queue, drop the request. That is better than
1326 * clogging the SYN queue with openreqs with exponentially increasing
1327 * timeouts.
1329 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1330 goto drop;
1332 req = tcp_openreq_alloc();
1333 if (req == NULL)
1334 goto drop;
1336 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1337 tp.mss_clamp = 536;
1338 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1340 tcp_parse_options(NULL, th, &tp, want_cookie);
1342 tcp_openreq_init(req, &tp, skb);
1344 req->af.v4_req.loc_addr = daddr;
1345 req->af.v4_req.rmt_addr = saddr;
1346 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1347 req->class = &or_ipv4;
1349 if (want_cookie) {
1350 #ifdef CONFIG_SYN_COOKIES
1351 syn_flood_warning(skb);
1352 #endif
1353 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1354 } else if (isn == 0) {
1355 struct inet_peer *peer = NULL;
1357 /* VJ's idea. We save the last timestamp seen
1358 * from the destination in the peer table when entering
1359 * TIME-WAIT state, and check against it before
1360 * accepting a new connection request.
1362 * If "isn" is not zero, this request hit an alive
1363 * timewait bucket, so all the necessary checks
1364 * are made in the function processing the timewait state.
1366 if (tp.saw_tstamp &&
1367 sysctl_tcp_tw_recycle &&
1368 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1369 (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1370 peer->v4daddr == saddr) {
1371 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1372 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1373 NETDEBUG(printk(KERN_DEBUG "TW_REC: reject openreq %u/%u %08x/%u\n", peer->tcp_ts, req->ts_recent, saddr, ntohs(skb->h.th->source)));
1374 NET_INC_STATS_BH(PAWSPassiveRejected);
1375 dst_release(dst);
1376 goto drop_and_free;
1379 /* Kill the following clause, if you dislike this way. */
1380 else if (!sysctl_tcp_syncookies &&
1381 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1382 < (sysctl_max_syn_backlog>>2)) &&
1383 (!peer || !peer->tcp_ts_stamp) &&
1384 (!dst || !dst->rtt)) {
1385 /* Without syncookies the last quarter of the
1386 * backlog is filled only with destinations proven to be alive.
1387 * It means that we continue to communicate
1388 * with destinations already remembered
1389 * at the moment of the synflood.
1391 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: drop open request from %08x/%u\n", saddr, ntohs(skb->h.th->source)));
1392 TCP_INC_STATS_BH(TcpAttemptFails);
1393 dst_release(dst);
1394 goto drop_and_free;
1397 isn = tcp_v4_init_sequence(sk, skb);
1399 req->snt_isn = isn;
1401 if (tcp_v4_send_synack(sk, req, dst))
1402 goto drop_and_free;
1404 if (want_cookie) {
1405 tcp_openreq_free(req);
1406 } else {
1407 tcp_v4_synq_add(sk, req);
1409 return 0;
1411 drop_and_free:
1412 tcp_openreq_free(req);
1413 drop:
1414 TCP_INC_STATS_BH(TcpAttemptFails);
1415 return 0;
1420 * The three way handshake has completed - we got a valid synack -
1421 * now create the new socket.
1423 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1424 struct open_request *req,
1425 struct dst_entry *dst)
1427 struct tcp_opt *newtp;
1428 struct sock *newsk;
1430 if (tcp_acceptq_is_full(sk))
1431 goto exit_overflow;
1433 if (dst == NULL &&
1434 (dst = tcp_v4_route_req(sk, req)) == NULL)
1435 goto exit;
1437 newsk = tcp_create_openreq_child(sk, req, skb);
1438 if (!newsk)
1439 goto exit;
1441 newsk->dst_cache = dst;
1443 newtp = &(newsk->tp_pinfo.af_tcp);
1444 newsk->daddr = req->af.v4_req.rmt_addr;
1445 newsk->saddr = req->af.v4_req.loc_addr;
1446 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1447 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1448 req->af.v4_req.opt = NULL;
1449 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1450 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1451 newtp->ext_header_len = 0;
1452 if (newsk->protinfo.af_inet.opt)
1453 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1455 tcp_sync_mss(newsk, dst->pmtu);
1456 tcp_initialize_rcv_mss(newsk);
1457 newtp->advmss = dst->advmss;
1459 tcp_init_buffer_space(newsk);
1461 __tcp_v4_hash(newsk);
1462 __tcp_inherit_port(sk, newsk);
1464 return newsk;
1466 exit_overflow:
1467 NET_INC_STATS_BH(ListenOverflows);
1468 exit:
1469 NET_INC_STATS_BH(ListenDrops);
1470 dst_release(dst);
1471 return NULL;
1474 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1476 struct open_request *req, **prev;
1477 struct tcphdr *th = skb->h.th;
1478 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1480 /* Find possible connection requests. */
1481 req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);
1482 if (req)
1483 return tcp_check_req(sk, skb, req, prev);
1485 if (tp->accept_queue) {
1486 struct sock *nsk;
1488 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1489 th->source,
1490 skb->nh.iph->daddr,
1491 ntohs(th->dest),
1492 tcp_v4_iif(skb));
1494 if (nsk) {
1495 if (nsk->state != TCP_TIME_WAIT) {
1496 bh_lock_sock(nsk);
1497 return nsk;
1499 tcp_tw_put((struct tcp_tw_bucket*)sk);
1500 return NULL;
1504 #ifdef CONFIG_SYN_COOKIES
1505 if (!th->rst && (th->syn || th->ack))
1506 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1507 #endif
1508 return sk;
1511 static int tcp_v4_checksum_init(struct sk_buff *skb)
1513 if (skb->ip_summed == CHECKSUM_HW) {
1514 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1515 skb->nh.iph->daddr,skb->csum)) {
1516 NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1517 return -1;
1519 skb->ip_summed = CHECKSUM_UNNECESSARY;
1520 } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
1521 if (skb->len <= 68) {
1522 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1523 skb->nh.iph->daddr,
1524 csum_partial((char *)skb->h.th, skb->len, 0)))
1525 return -1;
1526 skb->ip_summed = CHECKSUM_UNNECESSARY;
1527 } else {
1528 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1529 skb->nh.iph->daddr,0);
1532 return 0;
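/*
 * What tcp_v4_check()/csum_tcpudp_nofold() compute above is the standard
 * RFC 1071 one's-complement sum over the pseudo-header plus the TCP segment.
 * A portable userspace sketch of the same arithmetic; the kernel versions
 * are optimized per-architecture, this is only meant to show the math. The
 * checksum field inside 'seg' must be zero before calling.
 */
#if 0	/* Illustrative sketch of the TCP checksum arithmetic (RFC 1071). */
#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Sum 16-bit big-endian words in one's complement; 'len' may be odd. */
static uint32_t csum_add(uint32_t sum, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;	/* pad the odd final byte */
	return sum;
}

/* saddr/daddr in network byte order, seg = TCP header + payload. */
static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
			     const void *seg, size_t len)
{
	uint32_t sum = 0;
	uint32_t s = ntohl(saddr), d = ntohl(daddr);

	/* Pseudo-header: src, dst, zero, protocol (6), TCP length. */
	sum += (s >> 16) + (s & 0xffff);
	sum += (d >> 16) + (d & 0xffff);
	sum += IPPROTO_TCP;
	sum += (uint32_t)len;

	sum = csum_add(sum, seg, len);
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
	return htons((uint16_t)~sum);
}
#endif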
1536 /* The socket must have its spinlock held when we get
1537 * here.
1539 * We have a potential double-lock case here, so even when
1540 * doing backlog processing we use the BH locking scheme.
1541 * This is because we cannot sleep with the original spinlock
1542 * held.
1544 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1546 #ifdef CONFIG_FILTER
1547 struct sk_filter *filter = sk->filter;
1548 if (filter && sk_filter(skb, filter))
1549 goto discard;
1550 #endif /* CONFIG_FILTER */
1552 IP_INC_STATS_BH(IpInDelivers);
1554 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1555 TCP_CHECK_TIMER(sk);
1556 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1557 goto reset;
1558 TCP_CHECK_TIMER(sk);
1559 return 0;
1562 if (tcp_checksum_complete(skb))
1563 goto csum_err;
1565 if (sk->state == TCP_LISTEN) {
1566 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1567 if (!nsk)
1568 goto discard;
1570 if (nsk != sk) {
1571 if (tcp_child_process(sk, nsk, skb))
1572 goto reset;
1573 return 0;
1577 TCP_CHECK_TIMER(sk);
1578 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1579 goto reset;
1580 TCP_CHECK_TIMER(sk);
1581 return 0;
1583 reset:
1584 tcp_v4_send_reset(skb);
1585 discard:
1586 kfree_skb(skb);
1587 /* Be careful here. If this function gets more complicated and
1588 * gcc suffers from register pressure on the x86, sk (in %ebx)
1589 * might be destroyed here. This current version compiles correctly,
1590 * but you have been warned.
1592 return 0;
1594 csum_err:
1595 TCP_INC_STATS_BH(TcpInErrs);
1596 goto discard;
1600 * From tcp_input.c
1603 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1605 struct tcphdr *th;
1606 struct sock *sk;
1607 int ret;
1609 if (skb->pkt_type!=PACKET_HOST)
1610 goto discard_it;
1612 th = skb->h.th;
1614 /* Pull up the IP header. */
1615 __skb_pull(skb, skb->h.raw - skb->data);
1617 /* Count it even if it's bad */
1618 TCP_INC_STATS_BH(TcpInSegs);
1620 if (len < sizeof(struct tcphdr))
1621 goto bad_packet;
1623 if (tcp_v4_checksum_init(skb) < 0)
1624 goto bad_packet;
1626 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1627 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1628 len - th->doff*4);
1629 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1630 TCP_SKB_CB(skb)->when = 0;
1631 skb->used = 0;
1633 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1634 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1636 if (!sk)
1637 goto no_tcp_socket;
1639 process:
1640 if(!ipsec_sk_policy(sk,skb))
1641 goto discard_and_relse;
1643 if (sk->state == TCP_TIME_WAIT)
1644 goto do_time_wait;
1646 bh_lock_sock(sk);
1647 ret = 0;
1648 if (!sk->lock.users) {
1649 if (!tcp_prequeue(sk, skb))
1650 ret = tcp_v4_do_rcv(sk, skb);
1651 } else
1652 sk_add_backlog(sk, skb);
1653 bh_unlock_sock(sk);
1655 sock_put(sk);
1657 return ret;
1659 no_tcp_socket:
1660 if (tcp_checksum_complete(skb)) {
1661 bad_packet:
1662 TCP_INC_STATS_BH(TcpInErrs);
1663 } else {
1664 tcp_v4_send_reset(skb);
1667 discard_it:
1668 /* Discard frame. */
1669 kfree_skb(skb);
1670 return 0;
1672 discard_and_relse:
1673 sock_put(sk);
1674 goto discard_it;
1676 do_time_wait:
1677 if (tcp_checksum_complete(skb)) {
1678 TCP_INC_STATS_BH(TcpInErrs);
1679 goto discard_and_relse;
1681 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1682 skb, th, skb->len)) {
1683 case TCP_TW_SYN:
1685 struct sock *sk2;
1687 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1688 if (sk2 != NULL) {
1689 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1690 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1691 tcp_tw_put((struct tcp_tw_bucket *)sk);
1692 sk = sk2;
1693 goto process;
1695 /* Fall through to ACK */
1697 case TCP_TW_ACK:
1698 tcp_v4_timewait_ack(sk, skb);
1699 break;
1700 case TCP_TW_RST:
1701 goto no_tcp_socket;
1702 case TCP_TW_SUCCESS:
1704 goto discard_it;
1707 /* With per-bucket locks this operation is not atomic, so
1708 * this version is no worse.
1710 static void __tcp_v4_rehash(struct sock *sk)
1712 sk->prot->unhash(sk);
1713 sk->prot->hash(sk);
1716 int tcp_v4_rebuild_header(struct sock *sk)
1718 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1719 __u32 new_saddr;
1720 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
1722 if (rt == NULL) {
1723 int err;
1725 u32 daddr = sk->daddr;
1727 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1728 daddr = sk->protinfo.af_inet.opt->faddr;
1730 err = ip_route_output(&rt, daddr, sk->saddr,
1731 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1732 sk->bound_dev_if);
1733 if (err) {
1734 sk->err_soft=-err;
1735 sk->error_report(sk);
1736 return -1;
1738 __sk_dst_set(sk, &rt->u.dst);
1741 /* Force route checking if want_rewrite.
1742 * The idea is good, the implementation is disgusting.
1743 * Well, if I bound this socket, you cannot randomly overwrite
1744 * its source address. --ANK
1746 if (want_rewrite) {
1747 int tmp;
1748 struct rtable *new_rt;
1749 __u32 old_saddr = rt->rt_src;
1751 /* Query new route using another rt buffer */
1752 tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
1753 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1754 sk->bound_dev_if);
1756 /* Only useful if different source addrs */
1757 if (tmp == 0) {
1759 * Only useful if different source addrs
1761 if (new_rt->rt_src != old_saddr ) {
1762 __sk_dst_set(sk, &new_rt->u.dst);
1763 rt = new_rt;
1764 goto do_rewrite;
1766 dst_release(&new_rt->u.dst);
1770 return 0;
1772 do_rewrite:
1773 new_saddr = rt->rt_src;
1775 /* Ouch! This should not happen. */
1776 if (!sk->saddr || !sk->rcv_saddr) {
1777 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
1778 "saddr=%08X rcv_saddr=%08X\n",
1779 ntohl(sk->saddr),
1780 ntohl(sk->rcv_saddr));
1781 return -1;
1784 if (new_saddr != sk->saddr) {
1785 if (sysctl_ip_dynaddr > 1) {
1786 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1787 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1788 NIPQUAD(sk->saddr),
1789 NIPQUAD(new_saddr));
1792 sk->saddr = new_saddr;
1793 sk->rcv_saddr = new_saddr;
1795 /* XXX The only one ugly spot where we need to
1796 * XXX really change the sockets identity after
1797 * XXX it has entered the hashes. -DaveM
1799 * Besides that, it does not check for connection
1800 * uniqueness. Wait for troubles.
1802 __tcp_v4_rehash(sk);
1805 return 0;
1808 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1810 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1812 sin->sin_family = AF_INET;
1813 sin->sin_addr.s_addr = sk->daddr;
1814 sin->sin_port = sk->dport;
1817 /* VJ's idea. Save the last timestamp seen from this destination
1818 * and hold it at least for the normal time-wait interval, to use for duplicate
1819 * segment detection in subsequent connections before they enter the synchronized
1820 * state.
1823 int tcp_v4_remember_stamp(struct sock *sk)
1825 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1826 struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1827 struct inet_peer *peer = NULL;
1828 int release_it = 0;
1830 if (rt == NULL || rt->rt_dst != sk->daddr) {
1831 peer = inet_getpeer(sk->daddr, 1);
1832 release_it = 1;
1833 } else {
1834 if (rt->peer == NULL)
1835 rt_bind_peer(rt, 1);
1836 peer = rt->peer;
1839 if (peer) {
1840 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1841 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1842 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1843 peer->tcp_ts_stamp = tp->ts_recent_stamp;
1844 peer->tcp_ts = tp->ts_recent;
1846 if (release_it)
1847 inet_putpeer(peer);
1848 return 1;
1851 return 0;
1854 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1856 struct inet_peer *peer = NULL;
1858 peer = inet_getpeer(tw->daddr, 1);
1860 if (peer) {
1861 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1862 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1863 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1864 peer->tcp_ts_stamp = tw->ts_recent_stamp;
1865 peer->tcp_ts = tw->ts_recent;
1867 inet_putpeer(peer);
1868 return 1;
1871 return 0;
1874 struct tcp_func ipv4_specific = {
1875 ip_queue_xmit,
1876 tcp_v4_send_check,
1877 tcp_v4_rebuild_header,
1878 tcp_v4_conn_request,
1879 tcp_v4_syn_recv_sock,
1880 tcp_v4_hash_connecting,
1881 tcp_v4_remember_stamp,
1882 sizeof(struct iphdr),
1884 ip_setsockopt,
1885 ip_getsockopt,
1886 v4_addr2sockaddr,
1887 sizeof(struct sockaddr_in)
1890 /* NOTE: A lot of things are set to zero explicitly by the call to
1891 * sk_alloc(), so they need not be done here.
1893 static int tcp_v4_init_sock(struct sock *sk)
1895 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1897 skb_queue_head_init(&tp->out_of_order_queue);
1898 tcp_init_xmit_timers(sk);
1899 tcp_prequeue_init(tp);
1901 tp->rto = TCP_TIMEOUT_INIT;
1902 tp->mdev = TCP_TIMEOUT_INIT;
1904 /* So many TCP implementations out there (incorrectly) count the
1905 * initial SYN frame in their delayed-ACK and congestion control
1906 * algorithms that we must have the following bandaid to talk
1907 * efficiently to them. -DaveM
1909 tp->snd_cwnd = 2;
1911 /* See draft-stevens-tcpca-spec-01 for discussion of the
1912 * initialization of these values.
1914 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1915 tp->snd_cwnd_clamp = ~0;
1916 tp->mss_cache = 536;
1918 sk->state = TCP_CLOSE;
1920 sk->write_space = tcp_write_space;
1922 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1924 return 0;
1927 static int tcp_v4_destroy_sock(struct sock *sk)
1929 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1931 tcp_clear_xmit_timers(sk);
1933 /* Clean up the write buffer. */
1934 __skb_queue_purge(&sk->write_queue);
1936 /* Clean up our, hopefully empty, out_of_order_queue. */
1937 __skb_queue_purge(&tp->out_of_order_queue);
1939 /* Clean the prequeue; it really must be empty. */
1940 __skb_queue_purge(&tp->ucopy.prequeue);
1942 /* Clean up a referenced TCP bind bucket. */
1943 if(sk->prev != NULL)
1944 tcp_put_port(sk);
1946 return 0;
1949 /* Proc filesystem TCP sock list dumping. */
1950 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i)
1952 int ttd = req->expires - jiffies;
1954 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1955 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
1957 req->af.v4_req.loc_addr,
1958 ntohs(sk->sport),
1959 req->af.v4_req.rmt_addr,
1960 ntohs(req->rmt_port),
1961 TCP_SYN_RECV,
1962 0,0, /* could print option size, but that is af dependent. */
1963 1, /* timers active (only the expire timer) */
1964 ttd,
1965 req->retrans,
1966 sk->socket ? sk->socket->inode->i_uid : 0,
1967 0, /* non standard timer */
1968 0, /* open_requests have no inode */
1969 atomic_read(&sk->refcnt),
1974 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
1976 unsigned int dest, src;
1977 __u16 destp, srcp;
1978 int timer_active;
1979 unsigned long timer_expires;
1980 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
1982 dest = sp->daddr;
1983 src = sp->rcv_saddr;
1984 destp = ntohs(sp->dport);
1985 srcp = ntohs(sp->sport);
1986 timer_active = 0;
1987 timer_expires = (unsigned) -1;
1988 if (tp->retransmit_timer.prev != NULL && tp->retransmit_timer.expires < timer_expires) {
1989 timer_active = 1;
1990 timer_expires = tp->retransmit_timer.expires;
1991 } else if (tp->probe_timer.prev != NULL && tp->probe_timer.expires < timer_expires) {
1992 timer_active = 4;
1993 timer_expires = tp->probe_timer.expires;
1995 if (sp->timer.prev != NULL && sp->timer.expires < timer_expires) {
1996 timer_active = 2;
1997 timer_expires = sp->timer.expires;
1999 if(timer_active == 0)
2000 timer_expires = jiffies;
2002 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2003 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p %u %u %u %u",
2004 i, src, srcp, dest, destp, sp->state,
2005 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2006 timer_active, timer_expires-jiffies,
2007 tp->retransmits,
2008 sp->socket ? sp->socket->inode->i_uid : 0,
2009 tp->probes_out,
2010 sp->socket ? sp->socket->inode->i_ino : 0,
2011 atomic_read(&sp->refcnt), sp,
2012 tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong
2016 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2018 unsigned int dest, src;
2019 __u16 destp, srcp;
2020 int ttd = tw->ttd - jiffies;
2022 if (ttd < 0)
2023 ttd = 0;
2025 dest = tw->daddr;
2026 src = tw->rcv_saddr;
2027 destp = ntohs(tw->dport);
2028 srcp = ntohs(tw->sport);
2030 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2031 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2032 i, src, srcp, dest, destp, tw->substate, 0, 0,
2033 3, ttd, 0, 0, 0, 0,
2034 atomic_read(&tw->refcnt), tw);
2037 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2039 int len = 0, num = 0, i;
2040 off_t begin, pos = 0;
2041 char tmpbuf[129];
2043 if (offset < 128)
2044 len += sprintf(buffer, "%-127s\n",
2045 " sl local_address rem_address st tx_queue "
2046 "rx_queue tr tm->when retrnsmt uid timeout inode");
2048 pos = 128;
2050 /* First, walk listening socket table. */
2051 tcp_listen_lock();
2052 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2053 struct sock *sk = tcp_listening_hash[i];
2054 struct tcp_listen_opt *lopt;
2055 int k;
2057 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2058 struct open_request *req;
2059 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2061 if (!TCP_INET_FAMILY(sk->family))
2062 goto skip_listen;
2064 pos += 128;
2065 if (pos >= offset) {
2066 get_tcp_sock(sk, tmpbuf, num);
2067 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2068 if (len >= length) {
2069 tcp_listen_unlock();
2070 goto out_no_bh;
2074 skip_listen:
2075 read_lock_bh(&tp->syn_wait_lock);
2076 lopt = tp->listen_opt;
2077 if (lopt && lopt->qlen != 0) {
2078 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2079 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2080 if (!TCP_INET_FAMILY(req->class->family))
2081 continue;
2083 pos += 128;
2084 if (pos < offset)
2085 continue;
2086 get_openreq(sk, req, tmpbuf, num);
2087 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2088 if(len >= length) {
2089 read_unlock_bh(&tp->syn_wait_lock);
2090 tcp_listen_unlock();
2091 goto out_no_bh;
2096 read_unlock_bh(&tp->syn_wait_lock);
2098 /* Completed requests are in normal socket hash table */
2101 tcp_listen_unlock();
2103 local_bh_disable();
2105 /* Next, walk established hash chain. */
2106 for (i = 0; i < tcp_ehash_size; i++) {
2107 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2108 struct sock *sk;
2109 struct tcp_tw_bucket *tw;
2111 read_lock(&head->lock);
2112 for(sk = head->chain; sk; sk = sk->next, num++) {
2113 if (!TCP_INET_FAMILY(sk->family))
2114 continue;
2115 pos += 128;
2116 if (pos < offset)
2117 continue;
2118 get_tcp_sock(sk, tmpbuf, num);
2119 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2120 if(len >= length) {
2121 read_unlock(&head->lock);
2122 goto out;
2125 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2126 tw != NULL;
2127 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2128 if (!TCP_INET_FAMILY(tw->family))
2129 continue;
2130 pos += 128;
2131 if (pos < offset)
2132 continue;
2133 get_timewait_sock(tw, tmpbuf, num);
2134 len += sprintf(buffer+len, "%-127s\n", tmpbuf);
2135 if(len >= length) {
2136 read_unlock(&head->lock);
2137 goto out;
2140 read_unlock(&head->lock);
2143 out:
2144 local_bh_enable();
2145 out_no_bh:
2147 begin = len - (pos - offset);
2148 *start = buffer + begin;
2149 len -= begin;
2150 if(len > length)
2151 len = length;
2152 if (len < 0)
2153 len = 0;
2154 return len;
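/*
 * The lines emitted by get_tcp_sock()/get_timewait_sock() above print the
 * 32-bit address exactly as it sits in memory (network byte order) with
 * %08X, and the port with %04X after ntohs(). A sketch of decoding one such
 * "local_address" field back on the same machine; the sample string is made
 * up, and on a little-endian box the hex address reads byte-reversed.
 */
#if 0	/* Illustrative sketch: decoding a /proc/net/tcp address field. */
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	const char *field = "0100007F:0016";	/* 127.0.0.1:22 on little-endian */
	unsigned int raw;
	unsigned int port;
	struct in_addr a;

	if (sscanf(field, "%x:%x", &raw, &port) != 2)
		return 1;

	/* %08X printed the raw u32 holding the address, so storing the
	 * parsed value back into a u32 on the same machine recovers the
	 * original network-order bytes. The port was printed in host order. */
	a.s_addr = raw;
	printf("%s:%u\n", inet_ntoa(a), port);
	return 0;
}
#endif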
2157 struct proto tcp_prot = {
2158 tcp_close, /* close */
2159 tcp_v4_connect, /* connect */
2160 tcp_disconnect, /* disconnect */
2161 tcp_accept, /* accept */
2162 tcp_ioctl, /* ioctl */
2163 tcp_v4_init_sock, /* init */
2164 tcp_v4_destroy_sock, /* destroy */
2165 tcp_shutdown, /* shutdown */
2166 tcp_setsockopt, /* setsockopt */
2167 tcp_getsockopt, /* getsockopt */
2168 tcp_sendmsg, /* sendmsg */
2169 tcp_recvmsg, /* recvmsg */
2170 NULL, /* bind */
2171 tcp_v4_do_rcv, /* backlog_rcv */
2172 tcp_v4_hash, /* hash */
2173 tcp_unhash, /* unhash */
2174 tcp_v4_get_port, /* get_port */
2175 "TCP", /* name */
2180 void __init tcp_v4_init(struct net_proto_family *ops)
2182 int err;
2184 tcp_inode.i_mode = S_IFSOCK;
2185 tcp_inode.i_sock = 1;
2186 tcp_inode.i_uid = 0;
2187 tcp_inode.i_gid = 0;
2188 init_waitqueue_head(&tcp_inode.i_wait);
2189 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2191 tcp_socket->inode = &tcp_inode;
2192 tcp_socket->state = SS_UNCONNECTED;
2193 tcp_socket->type=SOCK_RAW;
2195 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2196 panic("Failed to create the TCP control socket.\n");
2197 tcp_socket->sk->allocation=GFP_ATOMIC;
2198 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2200 /* Unhash it so that IP input processing does not even
2201 * see it, we do not wish this socket to see incoming
2202 * packets.
2204 tcp_socket->sk->prot->unhash(tcp_socket->sk);