1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.222 2000/12/08 17:15:53 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/cache.h>
55 #include <linux/init.h>
57 #include <net/icmp.h>
58 #include <net/tcp.h>
59 #include <net/ipv6.h>
60 #include <net/inet_common.h>
62 #include <linux/inet.h>
63 #include <linux/stddef.h>
64 #include <linux/ipsec.h>
66 extern int sysctl_ip_dynaddr;
68 /* Check TCP sequence numbers in ICMP packets. */
69 #define ICMP_MIN_LENGTH 8
71 /* Socket used for sending RSTs */
72 static struct inode tcp_inode;
73 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
75 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
76 struct sk_buff *skb);
79 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
81 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
82 __tcp_ehash: NULL,
83 __tcp_bhash: NULL,
84 __tcp_bhash_size: 0,
85 __tcp_ehash_size: 0,
86 __tcp_listening_hash: { NULL, },
87 __tcp_lhash_lock: RW_LOCK_UNLOCKED,
88 __tcp_lhash_users: ATOMIC_INIT(0),
89 __tcp_lhash_wait:
90 __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
91 __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
95 * This array holds the first and last local port number.
96 * For high-usage systems, use sysctl to change this to
97 * 32768-61000
99 int sysctl_local_port_range[2] = { 1024, 4999 };
100 int tcp_port_rover = (1024 - 1);
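/* Editor's note (not part of the original file): the range above is the one
 * exported through sysctl as ip_local_port_range; assuming the usual procfs
 * mapping, it can be widened at runtime with, for example:
 *
 *     echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * tcp_port_rover remembers the last port handed out by tcp_v4_get_port(), so
 * consecutive autobinds walk the range instead of restarting at its bottom.
 */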
102 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
103 __u32 faddr, __u16 fport)
105 int h = ((laddr ^ lport) ^ (faddr ^ fport));
106 h ^= h>>16;
107 h ^= h>>8;
108 return h & (tcp_ehash_size - 1);
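/* Editor's note: a small worked example of the fold above, assuming
 * tcp_ehash_size == 256 (the mask form requires a power-of-two table size):
 *
 *     h = (laddr ^ lport) ^ (faddr ^ fport);   // 32-bit mix of the 4-tuple
 *     h ^= h >> 16;                            // fold the top half down
 *     h ^= h >> 8;                             // fold once more
 *     return h & 255;                          // bucket index 0..255
 */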
111 static __inline__ int tcp_sk_hashfn(struct sock *sk)
113 __u32 laddr = sk->rcv_saddr;
114 __u16 lport = sk->num;
115 __u32 faddr = sk->daddr;
116 __u16 fport = sk->dport;
118 return tcp_hashfn(laddr, lport, faddr, fport);
121 /* Allocate and initialize a new TCP local port bind bucket.
122 * The bindhash mutex for snum's hash chain must be held here.
124 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
125 unsigned short snum)
127 struct tcp_bind_bucket *tb;
129 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
130 if(tb != NULL) {
131 tb->port = snum;
132 tb->fastreuse = 0;
133 tb->owners = NULL;
134 if((tb->next = head->chain) != NULL)
135 tb->next->pprev = &tb->next;
136 head->chain = tb;
137 tb->pprev = &head->chain;
139 return tb;
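/* Editor's note: tb->pprev points at whatever pointer currently points at tb
 * (either head->chain or the previous bucket's ->next), so a bucket can be
 * unlinked in O(1) without walking the chain, exactly as __tcp_put_port()
 * does further down:
 *
 *     if (tb->next)
 *         tb->next->pprev = tb->pprev;
 *     *tb->pprev = tb->next;
 */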
142 /* Caller must disable local BH processing. */
143 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
145 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
146 struct tcp_bind_bucket *tb;
148 spin_lock(&head->lock);
149 tb = (struct tcp_bind_bucket *)sk->prev;
150 if ((child->bind_next = tb->owners) != NULL)
151 tb->owners->bind_pprev = &child->bind_next;
152 tb->owners = child;
153 child->bind_pprev = &tb->owners;
154 child->prev = (struct sock *) tb;
155 spin_unlock(&head->lock);
158 __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
160 local_bh_disable();
161 __tcp_inherit_port(sk, child);
162 local_bh_enable();
165 /* Obtain a reference to a local port for the given sock,
166 * if snum is zero it means select any available local port.
168 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
170 struct tcp_bind_hashbucket *head;
171 struct tcp_bind_bucket *tb;
172 int ret;
174 local_bh_disable();
175 if (snum == 0) {
176 int low = sysctl_local_port_range[0];
177 int high = sysctl_local_port_range[1];
178 int remaining = (high - low) + 1;
179 int rover;
181 spin_lock(&tcp_portalloc_lock);
182 rover = tcp_port_rover;
183 do { rover++;
184 if ((rover < low) || (rover > high))
185 rover = low;
186 head = &tcp_bhash[tcp_bhashfn(rover)];
187 spin_lock(&head->lock);
188 for (tb = head->chain; tb; tb = tb->next)
189 if (tb->port == rover)
190 goto next;
191 break;
192 next:
193 spin_unlock(&head->lock);
194 } while (--remaining > 0);
195 tcp_port_rover = rover;
196 spin_unlock(&tcp_portalloc_lock);
198 /* Exhausted local port range during search? */
199 ret = 1;
200 if (remaining <= 0)
201 goto fail;
203 /* OK, here is the one we will use. HEAD is
204 * non-NULL and we hold its mutex.
206 snum = rover;
207 tb = NULL;
208 } else {
209 head = &tcp_bhash[tcp_bhashfn(snum)];
210 spin_lock(&head->lock);
211 for (tb = head->chain; tb != NULL; tb = tb->next)
212 if (tb->port == snum)
213 break;
215 if (tb != NULL && tb->owners != NULL) {
216 if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
217 goto success;
218 } else {
219 struct sock *sk2 = tb->owners;
220 int sk_reuse = sk->reuse;
222 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
223 if (sk != sk2 &&
224 sk->bound_dev_if == sk2->bound_dev_if) {
225 if (!sk_reuse ||
226 !sk2->reuse ||
227 sk2->state == TCP_LISTEN) {
228 if (!sk2->rcv_saddr ||
229 !sk->rcv_saddr ||
230 (sk2->rcv_saddr == sk->rcv_saddr))
231 break;
235 /* If we found a conflict, fail. */
236 ret = 1;
237 if (sk2 != NULL)
238 goto fail_unlock;
241 ret = 1;
242 if (tb == NULL &&
243 (tb = tcp_bucket_create(head, snum)) == NULL)
244 goto fail_unlock;
245 if (tb->owners == NULL) {
246 if (sk->reuse && sk->state != TCP_LISTEN)
247 tb->fastreuse = 1;
248 else
249 tb->fastreuse = 0;
250 } else if (tb->fastreuse &&
251 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
252 tb->fastreuse = 0;
253 success:
254 sk->num = snum;
255 if (sk->prev == NULL) {
256 if ((sk->bind_next = tb->owners) != NULL)
257 tb->owners->bind_pprev = &sk->bind_next;
258 tb->owners = sk;
259 sk->bind_pprev = &tb->owners;
260 sk->prev = (struct sock *) tb;
261 } else {
262 BUG_TRAP(sk->prev == (struct sock *) tb);
264 ret = 0;
266 fail_unlock:
267 spin_unlock(&head->lock);
268 fail:
269 local_bh_enable();
270 return ret;
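/* Editor's note, summarizing the function above: with snum == 0 the rover
 * walks sysctl_local_port_range and takes the first port that has no bind
 * bucket at all; with an explicit snum, an existing bucket is acceptable
 * only when, for every current owner, the bound devices differ, the specific
 * local addresses differ, or both sockets set SO_REUSEADDR (sk->reuse) and
 * the owner is not listening. tb->fastreuse short-circuits that scan for the
 * common reuse case.
 */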
273 /* Get rid of any references to a local port held by the
274 * given sock.
276 __inline__ void __tcp_put_port(struct sock *sk)
278 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
279 struct tcp_bind_bucket *tb;
281 spin_lock(&head->lock);
282 tb = (struct tcp_bind_bucket *) sk->prev;
283 if (sk->bind_next)
284 sk->bind_next->bind_pprev = sk->bind_pprev;
285 *(sk->bind_pprev) = sk->bind_next;
286 sk->prev = NULL;
287 sk->num = 0;
288 if (tb->owners == NULL) {
289 if (tb->next)
290 tb->next->pprev = tb->pprev;
291 *(tb->pprev) = tb->next;
292 kmem_cache_free(tcp_bucket_cachep, tb);
294 spin_unlock(&head->lock);
297 void tcp_put_port(struct sock *sk)
299 local_bh_disable();
300 __tcp_put_port(sk);
301 local_bh_enable();
304 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
305 * Look, when several writers sleep and a reader wakes them up, all but one
306 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
307 * this, _but_ remember, it adds useless work on UP machines (a wake up on each
308 * exclusive lock release). It should really be ifdefed.
311 void tcp_listen_wlock(void)
313 write_lock(&tcp_lhash_lock);
315 if (atomic_read(&tcp_lhash_users)) {
316 DECLARE_WAITQUEUE(wait, current);
318 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
319 for (;;) {
320 set_current_state(TASK_UNINTERRUPTIBLE);
321 if (atomic_read(&tcp_lhash_users) == 0)
322 break;
323 write_unlock_bh(&tcp_lhash_lock);
324 schedule();
325 write_lock_bh(&tcp_lhash_lock);
328 __set_current_state(TASK_RUNNING);
329 remove_wait_queue(&tcp_lhash_wait, &wait);
333 static __inline__ void __tcp_v4_hash(struct sock *sk)
335 struct sock **skp;
336 rwlock_t *lock;
338 BUG_TRAP(sk->pprev==NULL);
339 if(sk->state == TCP_LISTEN) {
340 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
341 lock = &tcp_lhash_lock;
342 tcp_listen_wlock();
343 } else {
344 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
345 lock = &tcp_ehash[sk->hashent].lock;
346 write_lock(lock);
348 if((sk->next = *skp) != NULL)
349 (*skp)->pprev = &sk->next;
350 *skp = sk;
351 sk->pprev = skp;
352 sock_prot_inc_use(sk->prot);
353 write_unlock(lock);
354 if (sk->state == TCP_LISTEN)
355 wake_up(&tcp_lhash_wait);
358 static void tcp_v4_hash(struct sock *sk)
360 if (sk->state != TCP_CLOSE) {
361 local_bh_disable();
362 __tcp_v4_hash(sk);
363 local_bh_enable();
367 void tcp_unhash(struct sock *sk)
369 rwlock_t *lock;
371 if (sk->state == TCP_LISTEN) {
372 local_bh_disable();
373 tcp_listen_wlock();
374 lock = &tcp_lhash_lock;
375 } else {
376 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
377 lock = &head->lock;
378 write_lock_bh(&head->lock);
381 if(sk->pprev) {
382 if(sk->next)
383 sk->next->pprev = sk->pprev;
384 *sk->pprev = sk->next;
385 sk->pprev = NULL;
386 sock_prot_dec_use(sk->prot);
388 write_unlock_bh(lock);
389 if (sk->state == TCP_LISTEN)
390 wake_up(&tcp_lhash_wait);
393 /* Don't inline this cruft. There are some nice properties to
394 * exploit here. The BSD API does not allow a listening TCP
395 * to specify the remote port nor the remote address for the
396 * connection. So always assume those are both wildcarded
397 * during the search since they can never be otherwise.
399 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
401 struct sock *result = NULL;
402 int score, hiscore;
404 hiscore=0;
405 for(; sk; sk = sk->next) {
406 if(sk->num == hnum) {
407 __u32 rcv_saddr = sk->rcv_saddr;
409 score = 1;
410 if(rcv_saddr) {
411 if (rcv_saddr != daddr)
412 continue;
413 score++;
415 if (sk->bound_dev_if) {
416 if (sk->bound_dev_if != dif)
417 continue;
418 score++;
420 if (score == 3)
421 return sk;
422 if (score > hiscore) {
423 hiscore = score;
424 result = sk;
428 return result;
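/* Editor's note: the score above is 1 for a bare port match, +1 for a
 * matching bound local address, +1 for a matching bound device; a perfect
 * score of 3 returns immediately. So, for a SYN to 192.168.0.1:80, a
 * listener bound to 192.168.0.1:80 beats one bound to 0.0.0.0:80.
 * (The addresses are only an illustration, not taken from this file.)
 */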
431 /* Optimize the common listener case. */
432 __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
434 struct sock *sk;
436 read_lock(&tcp_lhash_lock);
437 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
438 if (sk) {
439 if (sk->num == hnum &&
440 sk->next == NULL &&
441 (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
442 !sk->bound_dev_if)
443 goto sherry_cache;
444 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
446 if (sk) {
447 sherry_cache:
448 sock_hold(sk);
450 read_unlock(&tcp_lhash_lock);
451 return sk;
454 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
455 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
457 * Local BH must be disabled here.
460 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
461 u32 daddr, u16 hnum, int dif)
463 struct tcp_ehash_bucket *head;
464 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
465 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
466 struct sock *sk;
467 int hash;
469 /* Optimize here for direct hit, only listening connections can
470 * have wildcards anyway.
472 hash = tcp_hashfn(daddr, hnum, saddr, sport);
473 head = &tcp_ehash[hash];
474 read_lock(&head->lock);
475 for(sk = head->chain; sk; sk = sk->next) {
476 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
477 goto hit; /* You sunk my battleship! */
480 /* Must check for a TIME_WAIT'er before going to listener hash. */
481 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
482 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
483 goto hit;
484 read_unlock(&head->lock);
486 return NULL;
488 hit:
489 sock_hold(sk);
490 read_unlock(&head->lock);
491 return sk;
494 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
495 u32 daddr, u16 hnum, int dif)
497 struct sock *sk;
499 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
501 if (sk)
502 return sk;
504 return tcp_v4_lookup_listener(daddr, hnum, dif);
507 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
509 struct sock *sk;
511 local_bh_disable();
512 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
513 local_bh_enable();
515 return sk;
518 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
520 return secure_tcp_sequence_number(skb->nh.iph->daddr,
521 skb->nh.iph->saddr,
522 skb->h.th->dest,
523 skb->h.th->source);
526 static int tcp_v4_check_established(struct sock *sk)
528 u32 daddr = sk->rcv_saddr;
529 u32 saddr = sk->daddr;
530 int dif = sk->bound_dev_if;
531 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
532 __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
533 int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
534 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
535 struct sock *sk2, **skp;
536 struct tcp_tw_bucket *tw;
538 write_lock_bh(&head->lock);
540 /* Check TIME-WAIT sockets first. */
541 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
542 skp = &sk2->next) {
543 tw = (struct tcp_tw_bucket*)sk2;
545 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
546 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
548 /* With PAWS, it is safe from the viewpoint
549 of data integrity. Even without PAWS it
550 is safe provided sequence spaces do not
551 overlap, i.e. at data rates <= 80 Mbit/sec.
553 Actually, the idea is close to VJ's, only
554 the timestamp cache is held not per host
555 but per port pair, and the TW bucket is
556 used as the state holder.
558 If the TW bucket has already been destroyed,
559 we fall back to VJ's scheme and use the
560 initial timestamp retrieved from the peer table.
562 if (tw->ts_recent_stamp) {
563 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
564 tp->write_seq = 1;
565 tp->ts_recent = tw->ts_recent;
566 tp->ts_recent_stamp = tw->ts_recent_stamp;
567 sock_hold(sk2);
568 skp = &head->chain;
569 goto unique;
570 } else
571 goto not_unique;
574 tw = NULL;
576 /* And established part... */
577 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
578 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
579 goto not_unique;
582 unique:
583 BUG_TRAP(sk->pprev==NULL);
584 if ((sk->next = *skp) != NULL)
585 (*skp)->pprev = &sk->next;
587 *skp = sk;
588 sk->pprev = skp;
589 sk->hashent = hash;
590 sock_prot_inc_use(sk->prot);
591 write_unlock_bh(&head->lock);
593 if (tw) {
594 /* Silly. Should hash-dance instead... */
595 local_bh_disable();
596 tcp_tw_deschedule(tw);
597 tcp_timewait_kill(tw);
598 NET_INC_STATS_BH(TimeWaitRecycled);
599 local_bh_enable();
601 tcp_tw_put(tw);
604 return 0;
606 not_unique:
607 write_unlock_bh(&head->lock);
608 return -EADDRNOTAVAIL;
611 /* Hash a SYN-SENT socket into the established hash table after
612 * checking that it is unique. Note that without the kernel lock
613 * we MUST make these two operations atomic.
615 * Optimization: if it is bound and the tcp_bind_bucket has only one
616 * owner (us), we need not scan the established bucket.
619 int tcp_v4_hash_connecting(struct sock *sk)
621 unsigned short snum = sk->num;
622 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
623 struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
625 spin_lock_bh(&head->lock);
626 if (tb->owners == sk && sk->bind_next == NULL) {
627 __tcp_v4_hash(sk);
628 spin_unlock_bh(&head->lock);
629 return 0;
630 } else {
631 spin_unlock_bh(&head->lock);
633 /* No definite answer... Walk the established hash table */
634 return tcp_v4_check_established(sk);
638 /* This will initiate an outgoing connection. */
639 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
641 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
642 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
643 struct sk_buff *buff;
644 struct rtable *rt;
645 u32 daddr, nexthop;
646 int tmp;
647 int err;
649 if (addr_len < sizeof(struct sockaddr_in))
650 return(-EINVAL);
652 if (usin->sin_family != AF_INET)
653 return(-EAFNOSUPPORT);
655 nexthop = daddr = usin->sin_addr.s_addr;
656 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
657 if (daddr == 0)
658 return -EINVAL;
659 nexthop = sk->protinfo.af_inet.opt->faddr;
662 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
663 RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
664 if (tmp < 0)
665 return tmp;
667 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
668 ip_rt_put(rt);
669 return -ENETUNREACH;
672 __sk_dst_set(sk, &rt->u.dst);
674 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
675 daddr = rt->rt_dst;
677 err = -ENOBUFS;
678 buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);
680 if (buff == NULL)
681 goto failure;
683 if (!sk->saddr)
684 sk->saddr = rt->rt_src;
685 sk->rcv_saddr = sk->saddr;
687 if (tp->ts_recent_stamp && sk->daddr != daddr) {
688 /* Reset inherited state */
689 tp->ts_recent = 0;
690 tp->ts_recent_stamp = 0;
691 tp->write_seq = 0;
694 if (sysctl_tcp_tw_recycle &&
695 !tp->ts_recent_stamp &&
696 rt->rt_dst == daddr) {
697 struct inet_peer *peer = rt_get_peer(rt);
699 /* VJ's idea. We save the last timestamp seen from
700 * the destination in the peer table when entering state TIME-WAIT,
701 * and initialize ts_recent from it when trying a new connection.
704 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
705 tp->ts_recent_stamp = peer->tcp_ts_stamp;
706 tp->ts_recent = peer->tcp_ts;
710 sk->dport = usin->sin_port;
711 sk->daddr = daddr;
713 if (!tp->write_seq)
714 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
715 sk->sport, usin->sin_port);
717 tp->ext_header_len = 0;
718 if (sk->protinfo.af_inet.opt)
719 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
721 tp->mss_clamp = 536;
723 err = tcp_connect(sk, buff);
724 if (err == 0)
725 return 0;
727 failure:
728 __sk_dst_reset(sk);
729 sk->dport = 0;
730 return err;
733 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
735 return ((struct rtable*)skb->dst)->rt_iif;
738 static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
740 unsigned h = raddr ^ rport;
741 h ^= h>>16;
742 h ^= h>>8;
743 return h&(TCP_SYNQ_HSIZE-1);
746 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
747 struct iphdr *iph,
748 struct tcphdr *th,
749 struct open_request ***prevp)
751 struct tcp_listen_opt *lopt = tp->listen_opt;
752 struct open_request *req, **prev;
753 __u16 rport = th->source;
754 __u32 raddr = iph->saddr;
756 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
757 (req = *prev) != NULL;
758 prev = &req->dl_next) {
759 if (req->rmt_port == rport &&
760 req->af.v4_req.rmt_addr == raddr &&
761 req->af.v4_req.loc_addr == iph->daddr &&
762 TCP_INET_FAMILY(req->class->family)) {
763 BUG_TRAP(req->sk == NULL);
764 *prevp = prev;
765 return req;
769 return NULL;
772 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
774 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
775 struct tcp_listen_opt *lopt = tp->listen_opt;
776 unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
778 req->expires = jiffies + TCP_TIMEOUT_INIT;
779 req->retrans = 0;
780 req->sk = NULL;
781 req->index = h;
782 req->dl_next = lopt->syn_table[h];
784 write_lock(&tp->syn_wait_lock);
785 lopt->syn_table[h] = req;
786 write_unlock(&tp->syn_wait_lock);
788 tcp_synq_added(sk);
793 * This routine does path mtu discovery as defined in RFC1191.
795 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
797 struct dst_entry *dst;
798 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
800 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
801 * sent out by Linux are always < 576 bytes, so they should go through
802 * unfragmented).
804 if (sk->state == TCP_LISTEN)
805 return;
807 /* We don't check in the dst entry whether pmtu discovery is forbidden
808 * on this route. We just assume that no packet-too-big packets
809 * are sent back when pmtu discovery is not active.
810 * There is a small race when the user changes this flag in the
811 * route, but I think that's acceptable.
813 if ((dst = __sk_dst_check(sk, 0)) == NULL)
814 return;
816 ip_rt_update_pmtu(dst, mtu);
818 /* Something is about to go wrong... Remember the soft error
819 * in case this connection is not able to recover.
821 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
822 sk->err_soft = EMSGSIZE;
824 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
825 tp->pmtu_cookie > dst->pmtu) {
826 tcp_sync_mss(sk, dst->pmtu);
828 /* Resend the TCP packet because it's
829 * clear that the old packet has been
830 * dropped. This is the new "fast" path mtu
831 * discovery.
833 tcp_simple_retransmit(sk);
834 } /* else let the usual retransmit timer handle it */
838 * This routine is called by the ICMP module when it gets some
839 * sort of error condition. If err < 0 then the socket should
840 * be closed and the error returned to the user. If err > 0
841 * it's just the icmp type << 8 | icmp code. After adjustment
842 * header points to the first 8 bytes of the tcp header. We need
843 * to find the appropriate port.
845 * The locking strategy used here is very "optimistic". When
846 * someone else accesses the socket the ICMP is just dropped
847 * and for some paths there is no check at all.
848 * A more general error queue to queue errors for later handling
849 * is probably better.
853 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
855 struct iphdr *iph = (struct iphdr*)dp;
856 struct tcphdr *th;
857 struct tcp_opt *tp;
858 int type = skb->h.icmph->type;
859 int code = skb->h.icmph->code;
860 #if ICMP_MIN_LENGTH < 14
861 int no_flags = 0;
862 #else
863 #define no_flags 0
864 #endif
865 struct sock *sk;
866 __u32 seq;
867 int err;
869 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
870 ICMP_INC_STATS_BH(IcmpInErrors);
871 return;
873 #if ICMP_MIN_LENGTH < 14
874 if (len < (iph->ihl << 2) + 14)
875 no_flags = 1;
876 #endif
878 th = (struct tcphdr*)(dp+(iph->ihl<<2));
880 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
881 if (sk == NULL) {
882 ICMP_INC_STATS_BH(IcmpInErrors);
883 return;
885 if (sk->state == TCP_TIME_WAIT) {
886 tcp_tw_put((struct tcp_tw_bucket*)sk);
887 return;
890 bh_lock_sock(sk);
891 /* If too many ICMPs get dropped on busy
892 * servers this needs to be solved differently.
894 if (sk->lock.users != 0)
895 NET_INC_STATS_BH(LockDroppedIcmps);
897 if (sk->state == TCP_CLOSE)
898 goto out;
900 tp = &sk->tp_pinfo.af_tcp;
901 seq = ntohl(th->seq);
902 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
903 NET_INC_STATS(OutOfWindowIcmps);
904 goto out;
907 switch (type) {
908 case ICMP_SOURCE_QUENCH:
909 /* This is deprecated, but if someone generated it,
910 * we have no reason to ignore it.
912 if (sk->lock.users == 0)
913 tcp_enter_cwr(tp);
914 goto out;
915 case ICMP_PARAMETERPROB:
916 err = EPROTO;
917 break;
918 case ICMP_DEST_UNREACH:
919 if (code > NR_ICMP_UNREACH)
920 goto out;
922 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
923 if (sk->lock.users == 0)
924 do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
925 goto out;
928 err = icmp_err_convert[code].errno;
929 break;
930 case ICMP_TIME_EXCEEDED:
931 err = EHOSTUNREACH;
932 break;
933 default:
934 goto out;
937 switch (sk->state) {
938 struct open_request *req, **prev;
939 case TCP_LISTEN:
940 if (sk->lock.users != 0)
941 goto out;
943 /* The final ACK of the handshake should already be
944 * handled in the new socket context, not here.
945 * Strictly speaking - an ICMP error for the final
946 * ACK should set the opening flag, but that is too
947 * complicated right now.
949 if (!no_flags && !th->syn && !th->ack)
950 goto out;
952 req = tcp_v4_search_req(tp, iph, th, &prev);
953 if (!req)
954 goto out;
956 /* ICMPs are not backlogged, hence we cannot get
957 an established socket here.
959 BUG_TRAP(req->sk == NULL);
961 if (seq != req->snt_isn) {
962 NET_INC_STATS_BH(OutOfWindowIcmps);
963 goto out;
967 * Still in SYN_RECV, just remove it silently.
968 * There is no good way to pass the error to the newly
969 * created socket, and POSIX does not want network
970 * errors returned from accept().
972 tcp_synq_drop(sk, req, prev);
973 goto out;
975 case TCP_SYN_SENT:
976 case TCP_SYN_RECV: /* Cannot happen.
977 It can happen, e.g. if SYNs crossed.
979 if (!no_flags && !th->syn)
980 goto out;
981 if (sk->lock.users == 0) {
982 TCP_INC_STATS_BH(TcpAttemptFails);
983 sk->err = err;
985 sk->error_report(sk);
987 tcp_done(sk);
988 } else {
989 sk->err_soft = err;
991 goto out;
994 /* If we've already connected we will keep trying
995 * until we time out, or the user gives up.
997 * RFC 1122 4.2.3.9 allows us to treat as hard errors
998 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
999 * but it is obsoleted by pmtu discovery).
1001 * Note that in the modern internet, where routing is unreliable
1002 * and broken firewalls sit in every dark corner, sending random
1003 * errors as ordered by their masters, even these two messages finally lose
1004 * their original sense (even Linux sends invalid PORT_UNREACHs).
1006 * Now we are in compliance with the RFCs.
1007 * --ANK (980905)
1010 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1011 sk->err = err;
1012 sk->error_report(sk);
1013 } else { /* Only an error on timeout */
1014 sk->err_soft = err;
1017 out:
1018 bh_unlock_sock(sk);
1019 sock_put(sk);
1022 /* This routine computes an IPv4 TCP checksum. */
1023 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1024 struct sk_buff *skb)
1026 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1027 csum_partial((char *)th, th->doff<<2, skb->csum));
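/* Editor's note: tcp_v4_check() folds the IPv4 pseudo-header (source and
 * destination address, protocol, TCP length) into the sum, while the
 * csum_partial() call above covers the TCP header itself; skb->csum is
 * assumed to already hold the checksum of the payload, accumulated when the
 * data was copied in from user space.
 */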
1031 * This routine will send an RST to the other tcp.
1033 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1034 * for the reset?
1035 * Answer: if a packet caused the RST, it is not for a socket
1036 * existing in our system; if it is matched to a socket,
1037 * it is just a duplicate segment or a bug in the other side's TCP.
1038 * So we build the reply based only on the parameters
1039 * that arrived with the segment.
1040 * Exception: precedence violation. We do not implement it in any case.
1043 static void tcp_v4_send_reset(struct sk_buff *skb)
1045 struct tcphdr *th = skb->h.th;
1046 struct tcphdr rth;
1047 struct ip_reply_arg arg;
1049 /* Never send a reset in response to a reset. */
1050 if (th->rst)
1051 return;
1053 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1054 return;
1056 /* Swap the send and the receive. */
1057 memset(&rth, 0, sizeof(struct tcphdr));
1058 rth.dest = th->source;
1059 rth.source = th->dest;
1060 rth.doff = sizeof(struct tcphdr)/4;
1061 rth.rst = 1;
1063 if (th->ack) {
1064 rth.seq = th->ack_seq;
1065 } else {
1066 rth.ack = 1;
1067 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1068 + skb->len - (th->doff<<2));
1071 memset(&arg, 0, sizeof arg);
1072 arg.iov[0].iov_base = (unsigned char *)&rth;
1073 arg.iov[0].iov_len = sizeof rth;
1074 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1075 skb->nh.iph->saddr, /*XXX*/
1076 sizeof(struct tcphdr),
1077 IPPROTO_TCP,
1078 0);
1079 arg.n_iov = 1;
1080 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1082 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1084 TCP_INC_STATS_BH(TcpOutSegs);
1085 TCP_INC_STATS_BH(TcpOutRsts);
1088 /* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
1089 outside socket context, is certainly ugly. What can I do?
1092 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1094 struct tcphdr *th = skb->h.th;
1095 struct {
1096 struct tcphdr th;
1097 u32 tsopt[3];
1098 } rep;
1099 struct ip_reply_arg arg;
1101 memset(&rep.th, 0, sizeof(struct tcphdr));
1102 memset(&arg, 0, sizeof arg);
1104 arg.iov[0].iov_base = (unsigned char *)&rep;
1105 arg.iov[0].iov_len = sizeof(rep.th);
1106 arg.n_iov = 1;
1107 if (ts) {
1108 rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
1109 (TCPOPT_NOP << 16) |
1110 (TCPOPT_TIMESTAMP << 8) |
1111 TCPOLEN_TIMESTAMP);
1112 rep.tsopt[1] = htonl(tcp_time_stamp);
1113 rep.tsopt[2] = htonl(ts);
1114 arg.iov[0].iov_len = sizeof(rep);
1117 /* Swap the send and the receive. */
1118 rep.th.dest = th->source;
1119 rep.th.source = th->dest;
1120 rep.th.doff = arg.iov[0].iov_len/4;
1121 rep.th.seq = htonl(seq);
1122 rep.th.ack_seq = htonl(ack);
1123 rep.th.ack = 1;
1124 rep.th.window = htons(win);
1126 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1127 skb->nh.iph->saddr, /*XXX*/
1128 arg.iov[0].iov_len,
1129 IPPROTO_TCP,
1131 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1133 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1135 TCP_INC_STATS_BH(TcpOutSegs);
1138 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1140 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1142 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1143 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1145 tcp_tw_put(tw);
1148 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1150 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1151 req->ts_recent);
1154 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1156 struct rtable *rt;
1157 struct ip_options *opt;
1159 opt = req->af.v4_req.opt;
1160 if(ip_route_output(&rt, ((opt && opt->srr) ?
1161 opt->faddr :
1162 req->af.v4_req.rmt_addr),
1163 req->af.v4_req.loc_addr,
1164 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1165 sk->bound_dev_if)) {
1166 IP_INC_STATS_BH(IpOutNoRoutes);
1167 return NULL;
1169 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1170 ip_rt_put(rt);
1171 IP_INC_STATS_BH(IpOutNoRoutes);
1172 return NULL;
1174 return &rt->u.dst;
1178 * Send a SYN-ACK after having received an ACK.
1179 * This still operates on an open_request only, not on a big
1180 * socket.
1182 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1183 struct dst_entry *dst)
1185 int err = -1;
1186 struct sk_buff * skb;
1188 /* First, grab a route. */
1189 if (dst == NULL &&
1190 (dst = tcp_v4_route_req(sk, req)) == NULL)
1191 goto out;
1193 skb = tcp_make_synack(sk, dst, req);
1195 if (skb) {
1196 struct tcphdr *th = skb->h.th;
1198 th->check = tcp_v4_check(th, skb->len,
1199 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1200 csum_partial((char *)th, skb->len, skb->csum));
1202 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1203 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1204 if (err == NET_XMIT_CN)
1205 err = 0;
1208 out:
1209 dst_release(dst);
1210 return err;
1214 * IPv4 open_request destructor.
1216 static void tcp_v4_or_free(struct open_request *req)
1218 if (req->af.v4_req.opt)
1219 kfree(req->af.v4_req.opt);
1222 static inline void syn_flood_warning(struct sk_buff *skb)
1224 static unsigned long warntime;
1226 if (jiffies - warntime > HZ*60) {
1227 warntime = jiffies;
1228 printk(KERN_INFO
1229 "possible SYN flooding on port %d. Sending cookies.\n",
1230 ntohs(skb->h.th->dest));
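/* Editor's note: this warning fires at most once per minute and, as the
 * conn_request path below shows, cookies are only actually sent when the
 * kernel is built with CONFIG_SYN_COOKIES and sysctl_tcp_syncookies
 * (normally /proc/sys/net/ipv4/tcp_syncookies) is enabled.
 */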
1235 * Save and compile IPv4 options into the open_request if needed.
1237 static inline struct ip_options *
1238 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1240 struct ip_options *opt = &(IPCB(skb)->opt);
1241 struct ip_options *dopt = NULL;
1243 if (opt && opt->optlen) {
1244 int opt_size = optlength(opt);
1245 dopt = kmalloc(opt_size, GFP_ATOMIC);
1246 if (dopt) {
1247 if (ip_options_echo(dopt, skb)) {
1248 kfree(dopt);
1249 dopt = NULL;
1253 return dopt;
1257 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1258 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1259 * It would be better to replace it with a global counter for all sockets,
1260 * but then some measure against one socket starving all other sockets
1261 * would be needed.
1263 * It was 128 by default. Experiments with real servers show that
1264 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1265 * of the problems. This value is adjusted to 128 for very small machines
1266 * (<= 32 MB of memory) and to 1024 on normal or better ones (>= 256 MB).
1267 * Increasing it further requires changing the hash table size.
1269 int sysctl_max_syn_backlog = 256;
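/* Editor's note: this is the value behind the tcp_max_syn_backlog sysctl
 * (typically /proc/sys/net/ipv4/tcp_max_syn_backlog); the per-memory-size
 * adjustment described in the comment above is assumed to happen at boot,
 * outside this file.
 */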
1271 struct or_calltable or_ipv4 = {
1272 PF_INET,
1273 tcp_v4_send_synack,
1274 tcp_v4_or_send_ack,
1275 tcp_v4_or_free,
1276 tcp_v4_send_reset
1279 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1281 struct tcp_opt tp;
1282 struct open_request *req;
1283 __u32 saddr = skb->nh.iph->saddr;
1284 __u32 daddr = skb->nh.iph->daddr;
1285 __u32 isn = TCP_SKB_CB(skb)->when;
1286 struct dst_entry *dst = NULL;
1287 #ifdef CONFIG_SYN_COOKIES
1288 int want_cookie = 0;
1289 #else
1290 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1291 #endif
1293 /* Never answer to SYNs sent to broadcast or multicast
1294 if (((struct rtable *)skb->dst)->rt_flags &
1295 (RTCF_BROADCAST|RTCF_MULTICAST))
1296 goto drop;
1298 /* TW buckets are converted to open requests without
1299 * limitation; they conserve resources and the peer is
1300 * evidently a real one.
1302 if (tcp_synq_is_full(sk) && !isn) {
1303 #ifdef CONFIG_SYN_COOKIES
1304 if (sysctl_tcp_syncookies) {
1305 want_cookie = 1;
1306 } else
1307 #endif
1308 goto drop;
1311 /* Accept backlog is full. If we have already queued enough
1312 * warm entries in the syn queue, drop the request. It is better than
1313 * clogging the syn queue with openreqs with exponentially increasing
1314 * timeouts.
1316 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1317 goto drop;
1319 req = tcp_openreq_alloc();
1320 if (req == NULL)
1321 goto drop;
1323 tcp_clear_options(&tp);
1324 tp.mss_clamp = 536;
1325 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1327 tcp_parse_options(skb, &tp, 0);
1329 if (want_cookie) {
1330 tcp_clear_options(&tp);
1331 tp.saw_tstamp = 0;
1334 if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1335 /* Some OSes (unknown ones, but I see them on web servers which
1336 * contain information interesting only for Windows
1337 * users) do not send their stamp in the SYN. It is an easy case.
1338 * We simply do not advertise TS support.
1340 tp.saw_tstamp = 0;
1341 tp.tstamp_ok = 0;
1343 tp.tstamp_ok = tp.saw_tstamp;
1345 tcp_openreq_init(req, &tp, skb);
1347 req->af.v4_req.loc_addr = daddr;
1348 req->af.v4_req.rmt_addr = saddr;
1349 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1350 req->class = &or_ipv4;
1351 if (!want_cookie)
1352 TCP_ECN_create_request(req, skb->h.th);
1354 if (want_cookie) {
1355 #ifdef CONFIG_SYN_COOKIES
1356 syn_flood_warning(skb);
1357 #endif
1358 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1359 } else if (isn == 0) {
1360 struct inet_peer *peer = NULL;
1362 /* VJ's idea. We save the last timestamp seen
1363 * from the destination in the peer table when entering
1364 * state TIME-WAIT, and check against it before
1365 * accepting a new connection request.
1367 * If "isn" is not zero, this request hit an alive
1368 * timewait bucket, so all the necessary checks
1369 * are made in the function processing the timewait state.
1371 if (tp.saw_tstamp &&
1372 sysctl_tcp_tw_recycle &&
1373 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1374 (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1375 peer->v4daddr == saddr) {
1376 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1377 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1378 NET_INC_STATS_BH(PAWSPassiveRejected);
1379 dst_release(dst);
1380 goto drop_and_free;
1383 /* Kill the following clause, if you dislike this way. */
1384 else if (!sysctl_tcp_syncookies &&
1385 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1386 < (sysctl_max_syn_backlog>>2)) &&
1387 (!peer || !peer->tcp_ts_stamp) &&
1388 (!dst || !dst->rtt)) {
1389 /* Without syncookies the last quarter of the
1390 * backlog is reserved for destinations proven to be alive.
1391 * It means that we continue to communicate only
1392 * with destinations already remembered
1393 * at the moment of the synflood.
1395 NETDEBUG(if (net_ratelimit()) \
1396 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1397 NIPQUAD(saddr), ntohs(skb->h.th->source)));
1398 TCP_INC_STATS_BH(TcpAttemptFails);
1399 dst_release(dst);
1400 goto drop_and_free;
1403 isn = tcp_v4_init_sequence(sk, skb);
1405 req->snt_isn = isn;
1407 if (tcp_v4_send_synack(sk, req, dst))
1408 goto drop_and_free;
1410 if (want_cookie) {
1411 tcp_openreq_free(req);
1412 } else {
1413 tcp_v4_synq_add(sk, req);
1415 return 0;
1417 drop_and_free:
1418 tcp_openreq_free(req);
1419 drop:
1420 TCP_INC_STATS_BH(TcpAttemptFails);
1421 return 0;
1426 * The three way handshake has completed - we got a valid synack -
1427 * now create the new socket.
1429 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1430 struct open_request *req,
1431 struct dst_entry *dst)
1433 struct tcp_opt *newtp;
1434 struct sock *newsk;
1436 if (tcp_acceptq_is_full(sk))
1437 goto exit_overflow;
1439 if (dst == NULL &&
1440 (dst = tcp_v4_route_req(sk, req)) == NULL)
1441 goto exit;
1443 newsk = tcp_create_openreq_child(sk, req, skb);
1444 if (!newsk)
1445 goto exit;
1447 newsk->dst_cache = dst;
1449 newtp = &(newsk->tp_pinfo.af_tcp);
1450 newsk->daddr = req->af.v4_req.rmt_addr;
1451 newsk->saddr = req->af.v4_req.loc_addr;
1452 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1453 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1454 req->af.v4_req.opt = NULL;
1455 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1456 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1457 newtp->ext_header_len = 0;
1458 if (newsk->protinfo.af_inet.opt)
1459 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1461 tcp_sync_mss(newsk, dst->pmtu);
1462 newtp->advmss = dst->advmss;
1463 tcp_initialize_rcv_mss(newsk);
1465 __tcp_v4_hash(newsk);
1466 __tcp_inherit_port(sk, newsk);
1468 return newsk;
1470 exit_overflow:
1471 NET_INC_STATS_BH(ListenOverflows);
1472 exit:
1473 NET_INC_STATS_BH(ListenDrops);
1474 dst_release(dst);
1475 return NULL;
1478 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1480 struct open_request *req, **prev;
1481 struct tcphdr *th = skb->h.th;
1482 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1483 struct sock *nsk;
1485 /* Find possible connection requests. */
1486 req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);
1487 if (req)
1488 return tcp_check_req(sk, skb, req, prev);
1490 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1491 th->source,
1492 skb->nh.iph->daddr,
1493 ntohs(th->dest),
1494 tcp_v4_iif(skb));
1496 if (nsk) {
1497 if (nsk->state != TCP_TIME_WAIT) {
1498 bh_lock_sock(nsk);
1499 return nsk;
1501 tcp_tw_put((struct tcp_tw_bucket*)sk);
1502 return NULL;
1505 #ifdef CONFIG_SYN_COOKIES
1506 if (!th->rst && !th->syn && th->ack)
1507 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1508 #endif
1509 return sk;
1512 static int tcp_v4_checksum_init(struct sk_buff *skb)
1514 if (skb->ip_summed == CHECKSUM_HW) {
1515 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1516 skb->nh.iph->daddr,skb->csum)) {
1517 NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1518 return -1;
1520 skb->ip_summed = CHECKSUM_UNNECESSARY;
1521 } else {
1522 if (skb->len <= 76) {
1523 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1524 skb->nh.iph->daddr,
1525 csum_partial((char *)skb->h.th, skb->len, 0)))
1526 return -1;
1527 skb->ip_summed = CHECKSUM_UNNECESSARY;
1528 } else {
1529 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1530 skb->nh.iph->daddr,0);
1533 return 0;
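/* Editor's note on the function above: with hardware checksums the sum is
 * verified here against the pseudo-header; short packets (<= 76 bytes) are
 * checksummed in software immediately; for anything longer, skb->csum is
 * only seeded with the complemented pseudo-header sum so the full check can
 * be deferred (presumably to be combined with the later copy of the data).
 */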
1537 /* The socket must have its spinlock held when we get
1538 * here.
1540 * We have a potential double-lock case here, so even when
1541 * doing backlog processing we use the BH locking scheme.
1542 * This is because we cannot sleep with the original spinlock
1543 * held.
1545 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1547 #ifdef CONFIG_FILTER
1548 struct sk_filter *filter = sk->filter;
1549 if (filter && sk_filter(skb, filter))
1550 goto discard;
1551 #endif /* CONFIG_FILTER */
1553 IP_INC_STATS_BH(IpInDelivers);
1555 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1556 TCP_CHECK_TIMER(sk);
1557 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1558 goto reset;
1559 TCP_CHECK_TIMER(sk);
1560 return 0;
1563 if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1564 goto csum_err;
1566 if (sk->state == TCP_LISTEN) {
1567 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1568 if (!nsk)
1569 goto discard;
1571 if (nsk != sk) {
1572 if (tcp_child_process(sk, nsk, skb))
1573 goto reset;
1574 return 0;
1578 TCP_CHECK_TIMER(sk);
1579 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1580 goto reset;
1581 TCP_CHECK_TIMER(sk);
1582 return 0;
1584 reset:
1585 tcp_v4_send_reset(skb);
1586 discard:
1587 kfree_skb(skb);
1588 /* Be careful here. If this function gets more complicated and
1589 * gcc suffers from register pressure on the x86, sk (in %ebx)
1590 * might be destroyed here. This current version compiles correctly,
1591 * but you have been warned.
1593 return 0;
1595 csum_err:
1596 TCP_INC_STATS_BH(TcpInErrs);
1597 goto discard;
1601 * From tcp_input.c
1604 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1606 struct tcphdr *th;
1607 struct sock *sk;
1608 int ret;
1610 if (skb->pkt_type!=PACKET_HOST)
1611 goto discard_it;
1613 th = skb->h.th;
1615 /* Pull up the IP header. */
1616 __skb_pull(skb, skb->h.raw - skb->data);
1618 /* Count it even if it's bad */
1619 TCP_INC_STATS_BH(TcpInSegs);
1621 /* An explanation is required here, I think.
1622 * Packet length and doff are validated by header prediction,
1623 * provided the case of th->doff==0 is eliminated.
1624 * So, we defer the checks. */
1625 if (th->doff < sizeof(struct tcphdr)/4 ||
1626 (skb->ip_summed != CHECKSUM_UNNECESSARY &&
1627 tcp_v4_checksum_init(skb) < 0))
1628 goto bad_packet;
1630 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1631 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1632 len - th->doff*4);
1633 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1634 TCP_SKB_CB(skb)->when = 0;
1635 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1636 TCP_SKB_CB(skb)->sacked = 0;
1637 skb->used = 0;
1639 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1640 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1642 if (!sk)
1643 goto no_tcp_socket;
1645 process:
1646 if(!ipsec_sk_policy(sk,skb))
1647 goto discard_and_relse;
1649 if (sk->state == TCP_TIME_WAIT)
1650 goto do_time_wait;
1652 skb->dev = NULL;
1654 bh_lock_sock(sk);
1655 ret = 0;
1656 if (!sk->lock.users) {
1657 if (!tcp_prequeue(sk, skb))
1658 ret = tcp_v4_do_rcv(sk, skb);
1659 } else
1660 sk_add_backlog(sk, skb);
1661 bh_unlock_sock(sk);
1663 sock_put(sk);
1665 return ret;
1667 no_tcp_socket:
1668 if (len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1669 bad_packet:
1670 TCP_INC_STATS_BH(TcpInErrs);
1671 } else {
1672 tcp_v4_send_reset(skb);
1675 discard_it:
1676 /* Discard frame. */
1677 kfree_skb(skb);
1678 return 0;
1680 discard_and_relse:
1681 sock_put(sk);
1682 goto discard_it;
1684 do_time_wait:
1685 if (len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1686 TCP_INC_STATS_BH(TcpInErrs);
1687 goto discard_and_relse;
1689 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1690 skb, th, skb->len)) {
1691 case TCP_TW_SYN:
1693 struct sock *sk2;
1695 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1696 if (sk2 != NULL) {
1697 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1698 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1699 tcp_tw_put((struct tcp_tw_bucket *)sk);
1700 sk = sk2;
1701 goto process;
1703 /* Fall through to ACK */
1705 case TCP_TW_ACK:
1706 tcp_v4_timewait_ack(sk, skb);
1707 break;
1708 case TCP_TW_RST:
1709 goto no_tcp_socket;
1710 case TCP_TW_SUCCESS:
1712 goto discard_it;
1715 /* With per-bucket locks this operation is not atomic, so
1716 * this version is not worse.
1718 static void __tcp_v4_rehash(struct sock *sk)
1720 sk->prot->unhash(sk);
1721 sk->prot->hash(sk);
1724 static int tcp_v4_reselect_saddr(struct sock *sk)
1726 int err;
1727 struct rtable *rt;
1728 __u32 old_saddr = sk->saddr;
1729 __u32 new_saddr;
1730 __u32 daddr = sk->daddr;
1732 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1733 daddr = sk->protinfo.af_inet.opt->faddr;
1735 /* Query new route. */
1736 err = ip_route_connect(&rt, daddr, 0,
1737 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1738 sk->bound_dev_if);
1739 if (err)
1740 return err;
1742 __sk_dst_set(sk, &rt->u.dst);
1743 /* sk->route_caps = rt->u.dst.dev->features; */
1745 new_saddr = rt->rt_src;
1747 if (new_saddr == old_saddr)
1748 return 0;
1750 if (sysctl_ip_dynaddr > 1) {
1751 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1752 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1753 NIPQUAD(old_saddr),
1754 NIPQUAD(new_saddr));
1757 sk->saddr = new_saddr;
1758 sk->rcv_saddr = new_saddr;
1760 /* XXX The only ugly spot where we need to
1761 * XXX really change the socket's identity after
1762 * XXX it has entered the hashes. -DaveM
1764 * Besides that, it does not check for connection
1765 * uniqueness. Wait for trouble.
1767 __tcp_v4_rehash(sk);
1768 return 0;
1771 int tcp_v4_rebuild_header(struct sock *sk)
1773 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1774 u32 daddr;
1775 int err;
1777 /* Route is OK, nothing to do. */
1778 if (rt != NULL)
1779 return 0;
1781 /* Reroute. */
1782 daddr = sk->daddr;
1783 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1784 daddr = sk->protinfo.af_inet.opt->faddr;
1786 err = ip_route_output(&rt, daddr, sk->saddr,
1787 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1788 sk->bound_dev_if);
1789 if (!err) {
1790 __sk_dst_set(sk, &rt->u.dst);
1791 /* sk->route_caps = rt->u.dst.dev->features; */
1792 return 0;
1795 /* Routing failed... */
1796 /* sk->route_caps = 0; */
1798 if (!sysctl_ip_dynaddr ||
1799 sk->state != TCP_SYN_SENT ||
1800 (sk->userlocks & SOCK_BINDADDR_LOCK) ||
1801 (err = tcp_v4_reselect_saddr(sk)) != 0) {
1802 sk->err_soft=-err;
1803 /* sk->error_report(sk); */
1805 return err;
1808 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1810 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1812 sin->sin_family = AF_INET;
1813 sin->sin_addr.s_addr = sk->daddr;
1814 sin->sin_port = sk->dport;
1817 /* VJ's idea. Save the last timestamp seen from this destination
1818 * and hold it at least for the normal timewait interval, to use for duplicate
1819 * segment detection in subsequent connections before they enter the
1820 * synchronized state.
1823 int tcp_v4_remember_stamp(struct sock *sk)
1825 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1826 struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1827 struct inet_peer *peer = NULL;
1828 int release_it = 0;
1830 if (rt == NULL || rt->rt_dst != sk->daddr) {
1831 peer = inet_getpeer(sk->daddr, 1);
1832 release_it = 1;
1833 } else {
1834 if (rt->peer == NULL)
1835 rt_bind_peer(rt, 1);
1836 peer = rt->peer;
1839 if (peer) {
1840 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1841 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1842 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1843 peer->tcp_ts_stamp = tp->ts_recent_stamp;
1844 peer->tcp_ts = tp->ts_recent;
1846 if (release_it)
1847 inet_putpeer(peer);
1848 return 1;
1851 return 0;
1854 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1856 struct inet_peer *peer = NULL;
1858 peer = inet_getpeer(tw->daddr, 1);
1860 if (peer) {
1861 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1862 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1863 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1864 peer->tcp_ts_stamp = tw->ts_recent_stamp;
1865 peer->tcp_ts = tw->ts_recent;
1867 inet_putpeer(peer);
1868 return 1;
1871 return 0;
1874 struct tcp_func ipv4_specific = {
1875 ip_queue_xmit,
1876 tcp_v4_send_check,
1877 tcp_v4_rebuild_header,
1878 tcp_v4_conn_request,
1879 tcp_v4_syn_recv_sock,
1880 tcp_v4_hash_connecting,
1881 tcp_v4_remember_stamp,
1882 sizeof(struct iphdr),
1884 ip_setsockopt,
1885 ip_getsockopt,
1886 v4_addr2sockaddr,
1887 sizeof(struct sockaddr_in)
1890 /* NOTE: A lot of things are set to zero explicitly by the call to
1891 * sk_alloc(), so they need not be done here.
1893 static int tcp_v4_init_sock(struct sock *sk)
1895 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1897 skb_queue_head_init(&tp->out_of_order_queue);
1898 tcp_init_xmit_timers(sk);
1899 tcp_prequeue_init(tp);
1901 tp->rto = TCP_TIMEOUT_INIT;
1902 tp->mdev = TCP_TIMEOUT_INIT;
1904 /* So many TCP implementations out there (incorrectly) count the
1905 * initial SYN frame in their delayed-ACK and congestion control
1906 * algorithms that we must have the following bandaid to talk
1907 * efficiently to them. -DaveM
1909 tp->snd_cwnd = 2;
1911 /* See draft-stevens-tcpca-spec-01 for discussion of the
1912 * initialization of these values.
1914 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1915 tp->snd_cwnd_clamp = ~0;
1916 tp->mss_cache = 536;
1918 tp->reordering = sysctl_tcp_reordering;
1920 sk->state = TCP_CLOSE;
1922 sk->write_space = tcp_write_space;
1924 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1926 sk->sndbuf = sysctl_tcp_wmem[1];
1927 sk->rcvbuf = sysctl_tcp_rmem[1];
1929 atomic_inc(&tcp_sockets_allocated);
1931 return 0;
1934 static int tcp_v4_destroy_sock(struct sock *sk)
1936 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1938 tcp_clear_xmit_timers(sk);
1940 /* Clean up the write buffer. */
1941 tcp_writequeue_purge(sk);
1943 /* Clean up our, hopefully empty, out_of_order_queue. */
1944 __skb_queue_purge(&tp->out_of_order_queue);
1946 /* Clean up the prequeue; it really must be empty. */
1947 __skb_queue_purge(&tp->ucopy.prequeue);
1949 /* Clean up a referenced TCP bind bucket. */
1950 if(sk->prev != NULL)
1951 tcp_put_port(sk);
1953 atomic_dec(&tcp_sockets_allocated);
1955 return 0;
1958 /* Proc filesystem TCP sock list dumping. */
1959 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
1961 int ttd = req->expires - jiffies;
1963 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1964 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
1966 req->af.v4_req.loc_addr,
1967 ntohs(sk->sport),
1968 req->af.v4_req.rmt_addr,
1969 ntohs(req->rmt_port),
1970 TCP_SYN_RECV,
1971 0,0, /* could print option size, but that is af dependent. */
1972 1, /* timers active (only the expire timer) */
1973 ttd,
1974 req->retrans,
1975 uid,
1976 0, /* non standard timer */
1977 0, /* open_requests have no inode */
1978 atomic_read(&sk->refcnt),
1983 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
1985 unsigned int dest, src;
1986 __u16 destp, srcp;
1987 int timer_active;
1988 unsigned long timer_expires;
1989 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
1991 dest = sp->daddr;
1992 src = sp->rcv_saddr;
1993 destp = ntohs(sp->dport);
1994 srcp = ntohs(sp->sport);
1995 if (tp->pending == TCP_TIME_RETRANS) {
1996 timer_active = 1;
1997 timer_expires = tp->timeout;
1998 } else if (tp->pending == TCP_TIME_PROBE0) {
1999 timer_active = 4;
2000 timer_expires = tp->timeout;
2001 } else if (timer_pending(&sp->timer)) {
2002 timer_active = 2;
2003 timer_expires = sp->timer.expires;
2004 } else {
2005 timer_active = 0;
2006 timer_expires = jiffies;
2009 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2010 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2011 i, src, srcp, dest, destp, sp->state,
2012 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2013 timer_active, timer_expires-jiffies,
2014 tp->retransmits,
2015 sock_i_uid(sp),
2016 tp->probes_out,
2017 sock_i_ino(sp),
2018 atomic_read(&sp->refcnt), sp,
2019 tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2020 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2024 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2026 unsigned int dest, src;
2027 __u16 destp, srcp;
2028 int ttd = tw->ttd - jiffies;
2030 if (ttd < 0)
2031 ttd = 0;
2033 dest = tw->daddr;
2034 src = tw->rcv_saddr;
2035 destp = ntohs(tw->dport);
2036 srcp = ntohs(tw->sport);
2038 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2039 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2040 i, src, srcp, dest, destp, tw->substate, 0, 0,
2041 3, ttd, 0, 0, 0, 0,
2042 atomic_read(&tw->refcnt), tw);
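/* Editor's note: the helpers above format rows of /proc/net/tcp. Addresses
 * and ports are printed in hexadecimal and the state is the numeric TCP
 * state, so a line for a listener on 127.0.0.1:22 would look roughly like
 * (illustration only, not output captured from a real system):
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 1234 1 c1234567
 */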
2045 #define TMPSZ 150
2047 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2049 int len = 0, num = 0, i;
2050 off_t begin, pos = 0;
2051 char tmpbuf[TMPSZ+1];
2053 if (offset < TMPSZ)
2054 len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2055 " sl local_address rem_address st tx_queue "
2056 "rx_queue tr tm->when retrnsmt uid timeout inode");
2058 pos = TMPSZ;
2060 /* First, walk listening socket table. */
2061 tcp_listen_lock();
2062 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2063 struct sock *sk = tcp_listening_hash[i];
2064 struct tcp_listen_opt *lopt;
2065 int k;
2067 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2068 struct open_request *req;
2069 int uid;
2070 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2072 if (!TCP_INET_FAMILY(sk->family))
2073 goto skip_listen;
2075 pos += TMPSZ;
2076 if (pos >= offset) {
2077 get_tcp_sock(sk, tmpbuf, num);
2078 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2079 if (len >= length) {
2080 tcp_listen_unlock();
2081 goto out_no_bh;
2085 skip_listen:
2086 uid = sock_i_uid(sk);
2087 read_lock_bh(&tp->syn_wait_lock);
2088 lopt = tp->listen_opt;
2089 if (lopt && lopt->qlen != 0) {
2090 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2091 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2092 if (!TCP_INET_FAMILY(req->class->family))
2093 continue;
2095 pos += TMPSZ;
2096 if (pos <= offset)
2097 continue;
2098 get_openreq(sk, req, tmpbuf, num, uid);
2099 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2100 if(len >= length) {
2101 read_unlock_bh(&tp->syn_wait_lock);
2102 tcp_listen_unlock();
2103 goto out_no_bh;
2108 read_unlock_bh(&tp->syn_wait_lock);
2110 /* Completed requests are in normal socket hash table */
2113 tcp_listen_unlock();
2115 local_bh_disable();
2117 /* Next, walk established hash chain. */
2118 for (i = 0; i < tcp_ehash_size; i++) {
2119 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2120 struct sock *sk;
2121 struct tcp_tw_bucket *tw;
2123 read_lock(&head->lock);
2124 for(sk = head->chain; sk; sk = sk->next, num++) {
2125 if (!TCP_INET_FAMILY(sk->family))
2126 continue;
2127 pos += TMPSZ;
2128 if (pos <= offset)
2129 continue;
2130 get_tcp_sock(sk, tmpbuf, num);
2131 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2132 if(len >= length) {
2133 read_unlock(&head->lock);
2134 goto out;
2137 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2138 tw != NULL;
2139 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2140 if (!TCP_INET_FAMILY(tw->family))
2141 continue;
2142 pos += TMPSZ;
2143 if (pos <= offset)
2144 continue;
2145 get_timewait_sock(tw, tmpbuf, num);
2146 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2147 if(len >= length) {
2148 read_unlock(&head->lock);
2149 goto out;
2152 read_unlock(&head->lock);
2155 out:
2156 local_bh_enable();
2157 out_no_bh:
2159 begin = len - (pos - offset);
2160 *start = buffer + begin;
2161 len -= begin;
2162 if(len > length)
2163 len = length;
2164 if (len < 0)
2165 len = 0;
2166 return len;
2169 struct proto tcp_prot = {
2170 name: "TCP",
2171 close: tcp_close,
2172 connect: tcp_v4_connect,
2173 disconnect: tcp_disconnect,
2174 accept: tcp_accept,
2175 ioctl: tcp_ioctl,
2176 init: tcp_v4_init_sock,
2177 destroy: tcp_v4_destroy_sock,
2178 shutdown: tcp_shutdown,
2179 setsockopt: tcp_setsockopt,
2180 getsockopt: tcp_getsockopt,
2181 sendmsg: tcp_sendmsg,
2182 recvmsg: tcp_recvmsg,
2183 backlog_rcv: tcp_v4_do_rcv,
2184 hash: tcp_v4_hash,
2185 unhash: tcp_unhash,
2186 get_port: tcp_v4_get_port,
2191 void __init tcp_v4_init(struct net_proto_family *ops)
2193 int err;
2195 tcp_inode.i_mode = S_IFSOCK;
2196 tcp_inode.i_sock = 1;
2197 tcp_inode.i_uid = 0;
2198 tcp_inode.i_gid = 0;
2199 init_waitqueue_head(&tcp_inode.i_wait);
2200 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2202 tcp_socket->inode = &tcp_inode;
2203 tcp_socket->state = SS_UNCONNECTED;
2204 tcp_socket->type=SOCK_RAW;
2206 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2207 panic("Failed to create the TCP control socket.\n");
2208 tcp_socket->sk->allocation=GFP_ATOMIC;
2209 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2211 /* Unhash it so that IP input processing does not even
2212 * see it; we do not wish this socket to see incoming
2213 * packets.
2215 tcp_socket->sk->prot->unhash(tcp_socket->sk);