Linux 2.2.0
[davej-history.git] / net / ipv4 / tcp_ipv4.c
blob 660e64c44ffdb1a968d57891172cae8f6a4fbf35
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.164 1999/01/04 20:36:55 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/init.h>
55 #include <linux/ipsec.h>
57 #include <net/icmp.h>
58 #include <net/tcp.h>
59 #include <net/ipv6.h>
61 #include <asm/segment.h>
63 #include <linux/inet.h>
64 #include <linux/stddef.h>
66 extern int sysctl_tcp_timestamps;
67 extern int sysctl_tcp_window_scaling;
68 extern int sysctl_tcp_sack;
69 extern int sysctl_tcp_syncookies;
70 extern int sysctl_ip_dynaddr;
71 extern __u32 sysctl_wmem_max;
72 extern __u32 sysctl_rmem_max;
74 /* Check TCP sequence numbers in ICMP packets. */
75 #define ICMP_MIN_LENGTH 8
77 /* Socket used for sending RSTs */
78 struct inode tcp_inode;
79 struct socket *tcp_socket=&tcp_inode.u.socket_i;
81 static void tcp_v4_send_reset(struct sk_buff *skb);
83 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
84 struct sk_buff *skb);
86 /* This is for sockets with full identity only. Sockets here will always
87 * be without wildcards and will have the following invariant:
88 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
90 * First half of the table is for sockets not in TIME_WAIT, second half
91 * is for TIME_WAIT sockets only.
93 struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
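/* (A lookup therefore probes bucket `hash' for live sockets and bucket
 * `hash + (TCP_HTABLE_SIZE/2)' for TIME_WAIT sockets, which is exactly
 * what __tcp_v4_lookup() below does.)
 */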
95 /* Ok, let's try this, I give up, we do need a local binding
96 * TCP hash as well as the others for fast bind/connect.
98 struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
100 /* All sockets in TCP_LISTEN state will be in here. This is the only table
101 * where wildcard'd TCP sockets can exist. Hash function here is just local
102 * port number.
104 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
106 /* Register cache. */
107 struct sock *tcp_regs[TCP_NUM_REGS];
110 * This array holds the first and last local port number.
111 * For high-usage systems, use sysctl to change this to
112 * 32768-61000
114 int sysctl_local_port_range[2] = { 1024, 4999 };
115 int tcp_port_rover = (1024 - 1);
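/* (These two are the backing store for the ip_local_port_range sysctl;
 * in stock kernels of this era that should be reachable as
 * /proc/sys/net/ipv4/ip_local_port_range, an assumption from the 2.2
 * sysctl tables that has not been verified against this particular tree.)
 */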
117 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
118 __u32 faddr, __u16 fport)
120 return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
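#if 0	/* Illustrative sketch only, not part of the original file: a worked
	 * example of the fold above with made-up addresses and ports. It only
	 * assumes TCP_HTABLE_SIZE is a power of two, which the mask arithmetic
	 * already requires.
	 */
static int tcp_hashfn_example(void)
{
	__u32 laddr = 0xc0a80001;	/* 192.168.0.1, host order, arithmetic only */
	__u16 lport = 80;
	__u32 faddr = 0xc0a80002;	/* 192.168.0.2 */
	__u16 fport = 1025;

	/* Same expression as tcp_hashfn(): XOR the 4-tuple together and mask
	 * the result into the first (non-TIME_WAIT) half of the table.
	 */
	return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
}
#endif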
123 static __inline__ int tcp_sk_hashfn(struct sock *sk)
125 __u32 laddr = sk->rcv_saddr;
126 __u16 lport = sk->num;
127 __u32 faddr = sk->daddr;
128 __u16 fport = sk->dport;
130 return tcp_hashfn(laddr, lport, faddr, fport);
133 /* Invariant, sk->num is non-zero. */
134 void tcp_bucket_unlock(struct sock *sk)
136 struct tcp_bind_bucket *tb;
137 unsigned short snum = sk->num;
139 SOCKHASH_LOCK();
140 for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
141 if(tb->port == snum) {
142 if(tb->owners == NULL &&
143 (tb->flags & TCPB_FLAG_LOCKED)) {
144 tb->flags &= ~(TCPB_FLAG_LOCKED |
145 TCPB_FLAG_FASTREUSE);
146 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
148 break;
151 SOCKHASH_UNLOCK();
154 struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
156 struct tcp_bind_bucket *tb;
158 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
159 if(tb != NULL) {
160 struct tcp_bind_bucket **head =
161 &tcp_bound_hash[tcp_bhashfn(snum)];
162 tb->port = snum;
163 tb->flags = TCPB_FLAG_LOCKED;
164 tb->owners = NULL;
165 if((tb->next = *head) != NULL)
166 tb->next->pprev = &tb->next;
167 *head = tb;
168 tb->pprev = head;
170 return tb;
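/* (The next/pprev pair is the usual head-insert hash chain idiom: pprev
 * points at whatever pointer currently points at this node, either the
 * chain head or the previous node's ->next, so unlinking never needs to
 * know whether the node is first in its chain. The socket hashes below
 * use the same idiom.)
 */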
173 #ifdef CONFIG_IP_TRANSPARENT_PROXY
174 /* Ensure that the bound bucket for the port exists.
175 * Return 0 on success.
177 static __inline__ int tcp_bucket_check(unsigned short snum)
179 struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(snum)];
180 for( ; (tb && (tb->port != snum)); tb = tb->next)
182 if(tb == NULL && tcp_bucket_create(snum) == NULL)
183 return 1;
184 else
185 return 0;
187 #endif
189 static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
191 struct tcp_bind_bucket *tb;
192 int result = 0;
194 SOCKHASH_LOCK();
195 for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
196 (tb && (tb->port != snum));
197 tb = tb->next)
199 if(tb && tb->owners) {
200 /* Fast path for reuse ports, see include/net/tcp.h for a very
201 * detailed description of why this works, and why it is worth
202 * the effort at all. -DaveM
204 if((tb->flags & TCPB_FLAG_FASTREUSE) &&
205 (sk->reuse != 0)) {
206 goto go_like_smoke;
207 } else {
208 struct sock *sk2;
209 int sk_reuse = sk->reuse;
211 /* We must walk the whole port owner list in this case. -DaveM */
212 for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
213 if (sk->bound_dev_if == sk2->bound_dev_if) {
214 if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
215 if(!sk2->rcv_saddr ||
216 !sk->rcv_saddr ||
217 (sk2->rcv_saddr == sk->rcv_saddr))
218 break;
222 if(sk2 != NULL)
223 result = 1;
226 if(result == 0) {
227 if(tb == NULL) {
228 if((tb = tcp_bucket_create(snum)) == NULL)
229 result = 1;
230 else if (sk->reuse && sk->state != TCP_LISTEN)
231 tb->flags |= TCPB_FLAG_FASTREUSE;
232 } else {
233 /* It could be pending garbage collection, this
234 * kills the race and prevents it from disappearing
235 * out from under us by the time we use it. -DaveM
237 if(tb->owners == NULL) {
238 if (!(tb->flags & TCPB_FLAG_LOCKED)) {
239 tb->flags = (TCPB_FLAG_LOCKED |
240 ((sk->reuse &&
241 sk->state != TCP_LISTEN) ?
242 TCPB_FLAG_FASTREUSE : 0));
243 tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
244 } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) {
245 /* Someone is in between the bind
246 * and the actual connect or listen.
247 * See if it was a legitimate reuse
248 * and we are as well, else punt.
250 if (sk->reuse == 0 ||
251 !(tb->flags & TCPB_FLAG_FASTREUSE))
252 result = 1;
253 } else
254 tb->flags &= ~TCPB_FLAG_GOODSOCKNUM;
258 go_like_smoke:
259 SOCKHASH_UNLOCK();
260 return result;
263 unsigned short tcp_good_socknum(void)
265 struct tcp_bind_bucket *tb;
266 int low = sysctl_local_port_range[0];
267 int high = sysctl_local_port_range[1];
268 int remaining = (high - low) + 1;
269 int rover;
271 SOCKHASH_LOCK();
272 rover = tcp_port_rover;
273 do {
274 rover += 1;
275 if((rover < low) || (rover > high))
276 rover = low;
277 tb = tcp_bound_hash[tcp_bhashfn(rover)];
278 for( ; tb; tb = tb->next) {
279 if(tb->port == rover)
280 goto next;
282 break;
283 next:
284 } while(--remaining > 0);
285 tcp_port_rover = rover;
286 tb = NULL;
287 if((remaining <= 0) || ((tb = tcp_bucket_create(rover)) == NULL))
288 rover = 0;
289 if (tb != NULL)
290 tb->flags |= TCPB_FLAG_GOODSOCKNUM;
291 SOCKHASH_UNLOCK();
293 return rover;
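/* (tcp_port_rover persists across calls so each search resumes just past
 * the last port handed out rather than rescanning from
 * sysctl_local_port_range[0] every time; `remaining' bounds the walk to
 * one full pass over the configured range.)
 */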
296 static void tcp_v4_hash(struct sock *sk)
298 if (sk->state != TCP_CLOSE) {
299 struct sock **skp;
301 SOCKHASH_LOCK();
302 skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
303 if((sk->next = *skp) != NULL)
304 (*skp)->pprev = &sk->next;
305 *skp = sk;
306 sk->pprev = skp;
307 tcp_sk_bindify(sk);
308 SOCKHASH_UNLOCK();
312 static void tcp_v4_unhash(struct sock *sk)
314 SOCKHASH_LOCK();
315 if(sk->pprev) {
316 if(sk->next)
317 sk->next->pprev = sk->pprev;
318 *sk->pprev = sk->next;
319 sk->pprev = NULL;
320 tcp_reg_zap(sk);
321 tcp_sk_unbindify(sk);
323 SOCKHASH_UNLOCK();
326 static void tcp_v4_rehash(struct sock *sk)
328 unsigned char state;
330 SOCKHASH_LOCK();
331 state = sk->state;
332 if(sk->pprev != NULL) {
333 if(sk->next)
334 sk->next->pprev = sk->pprev;
335 *sk->pprev = sk->next;
336 sk->pprev = NULL;
337 tcp_reg_zap(sk);
339 if(state != TCP_CLOSE) {
340 struct sock **skp;
342 if(state == TCP_LISTEN)
343 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
344 else
345 skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
347 if((sk->next = *skp) != NULL)
348 (*skp)->pprev = &sk->next;
349 *skp = sk;
350 sk->pprev = skp;
351 if(state == TCP_LISTEN)
352 tcp_sk_bindify(sk);
354 SOCKHASH_UNLOCK();
357 /* Don't inline this cruft. There are some nice properties to
358 * exploit here. The BSD API does not allow a listening TCP
359 * to specify the remote port nor the remote address for the
360 * connection. So always assume those are both wildcarded
361 * during the search since they can never be otherwise.
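/* (The loop below scores each candidate listener: one point for the
 * matching local port, plus one if its bound local address matches daddr,
 * plus one if its bound device matches dif. A score of 3 is a fully
 * specified match and ends the search immediately; otherwise the
 * highest-scoring, i.e. least wildcarded, listener wins.)
 */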
363 static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
365 struct sock *sk;
366 struct sock *result = NULL;
367 int score, hiscore;
369 hiscore=0;
370 for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
371 if(sk->num == hnum) {
372 __u32 rcv_saddr = sk->rcv_saddr;
374 score = 1;
375 if(rcv_saddr) {
376 if (rcv_saddr != daddr)
377 continue;
378 score++;
380 if (sk->bound_dev_if) {
381 if (sk->bound_dev_if != dif)
382 continue;
383 score++;
385 if (score == 3)
386 return sk;
387 if (score > hiscore) {
388 hiscore = score;
389 result = sk;
393 return result;
396 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
397 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
398 * It is assumed that this code only gets called from within NET_BH.
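/* (Lookup order below: first the per-source-port register cache
 * TCP_RHASH, then the established half of tcp_established_hash, then the
 * TIME_WAIT half at hash + (TCP_HTABLE_SIZE/2), and finally the listener
 * table.)
 */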
400 static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
401 u32 saddr, u16 sport,
402 u32 daddr, u16 dport, int dif)
404 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
405 __u16 hnum = ntohs(dport);
406 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
407 struct sock *sk;
408 int hash;
410 /* Check TCP register quick cache first. */
411 sk = TCP_RHASH(sport);
412 if(sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
413 goto hit;
415 /* Optimize here for direct hit, only listening connections can
416 * have wildcards anyways.
418 hash = tcp_hashfn(daddr, hnum, saddr, sport);
419 for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
420 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
421 if (sk->state == TCP_ESTABLISHED)
422 TCP_RHASH(sport) = sk;
423 goto hit; /* You sunk my battleship! */
426 /* Must check for a TIME_WAIT'er before going to listener hash. */
427 for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
428 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
429 goto hit;
430 sk = tcp_v4_lookup_listener(daddr, hnum, dif);
431 hit:
432 return sk;
435 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
437 return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif);
440 #ifdef CONFIG_IP_TRANSPARENT_PROXY
441 /* Cleaned up a little and adapted to new bind bucket scheme.
442 * Oddly, this should increase performance here for
443 * transparent proxy, as tests within the inner loop have
444 * been eliminated. -DaveM
446 static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
447 unsigned short rnum, unsigned long laddr,
448 struct device *dev, unsigned short pnum,
449 int dif)
451 struct sock *s, *result = NULL;
452 int badness = -1;
453 u32 paddr = 0;
454 unsigned short hnum = ntohs(num);
455 unsigned short hpnum = ntohs(pnum);
456 int firstpass = 1;
458 if(dev && dev->ip_ptr) {
459 struct in_device *idev = dev->ip_ptr;
461 if(idev->ifa_list)
462 paddr = idev->ifa_list->ifa_local;
465 /* This code must run only from NET_BH. */
467 struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)];
468 for( ; (tb && tb->port != hnum); tb = tb->next)
470 if(tb == NULL)
471 goto next;
472 s = tb->owners;
474 pass2:
475 for(; s; s = s->bind_next) {
476 int score = 0;
477 if(s->rcv_saddr) {
478 if((s->num != hpnum || s->rcv_saddr != paddr) &&
479 (s->num != hnum || s->rcv_saddr != laddr))
480 continue;
481 score++;
483 if(s->daddr) {
484 if(s->daddr != raddr)
485 continue;
486 score++;
488 if(s->dport) {
489 if(s->dport != rnum)
490 continue;
491 score++;
493 if(s->bound_dev_if) {
494 if(s->bound_dev_if != dif)
495 continue;
496 score++;
498 if(score == 4 && s->num == hnum) {
499 result = s;
500 goto gotit;
501 } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
502 result = s;
503 badness = score;
506 next:
507 if(firstpass--) {
508 struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)];
509 for( ; (tb && tb->port != hpnum); tb = tb->next)
511 if(tb) {
512 s = tb->owners;
513 goto pass2;
516 gotit:
517 return result;
519 #endif /* CONFIG_IP_TRANSPARENT_PROXY */
521 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
523 return secure_tcp_sequence_number(sk->saddr, sk->daddr,
524 skb->h.th->dest,
525 skb->h.th->source);
528 /* Check that a TCP address is unique, don't allow multiple
529 * connects to/from the same address. Actually we can optimize
530 * quite a bit, since the socket about to connect is still
531 * in TCP_CLOSE, a tcp_bind_bucket for the local port he will
532 * use will exist, with a NULL owners list. So check for that.
533 * The good_socknum and verify_bind scheme we use makes this
534 * work.
536 static int tcp_v4_unique_address(struct sock *sk)
538 struct tcp_bind_bucket *tb;
539 unsigned short snum = sk->num;
540 int retval = 1;
542 /* Freeze the hash while we snoop around. */
543 SOCKHASH_LOCK();
544 tb = tcp_bound_hash[tcp_bhashfn(snum)];
545 for(; tb; tb = tb->next) {
546 if(tb->port == snum && tb->owners != NULL) {
547 /* Almost certainly the re-use port case, search the real hashes
548 * so it actually scales.
550 sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport,
551 sk->rcv_saddr, snum, sk->bound_dev_if);
552 if((sk != NULL) && (sk->state != TCP_LISTEN))
553 retval = 0;
554 break;
557 SOCKHASH_UNLOCK();
558 return retval;
561 /* This will initiate an outgoing connection. */
562 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
564 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
565 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
566 struct sk_buff *buff;
567 struct rtable *rt;
568 u32 daddr, nexthop;
569 int tmp;
571 if (sk->state != TCP_CLOSE)
572 return(-EISCONN);
574 /* Don't allow a double connect. */
575 if (sk->daddr)
576 return -EINVAL;
578 if (addr_len < sizeof(struct sockaddr_in))
579 return(-EINVAL);
581 if (usin->sin_family != AF_INET) {
582 static int complained;
583 if (usin->sin_family)
584 return(-EAFNOSUPPORT);
585 if (!complained++)
586 printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
589 nexthop = daddr = usin->sin_addr.s_addr;
590 if (sk->opt && sk->opt->srr) {
591 if (daddr == 0)
592 return -EINVAL;
593 nexthop = sk->opt->faddr;
596 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
597 RT_TOS(sk->ip_tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
598 if (tmp < 0)
599 return tmp;
601 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
602 ip_rt_put(rt);
603 return -ENETUNREACH;
606 dst_release(xchg(&sk->dst_cache, rt));
608 buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
609 0, GFP_KERNEL);
611 if (buff == NULL)
612 return -ENOBUFS;
614 /* Socket has no identity, so lock_sock() is useless. Also
615 * since state==TCP_CLOSE (checked above) the socket cannot
616 * possibly be in the hashes. TCP hash locking is only
617 * needed while checking quickly for a unique address.
618 * However, the socket does need to be (and is) locked
619 * in tcp_connect().
620 * Perhaps this addresses all of ANK's concerns. 8-) -DaveM
622 sk->dport = usin->sin_port;
623 sk->daddr = rt->rt_dst;
624 if (sk->opt && sk->opt->srr)
625 sk->daddr = daddr;
626 if (!sk->saddr)
627 sk->saddr = rt->rt_src;
628 sk->rcv_saddr = sk->saddr;
630 if (!tcp_v4_unique_address(sk)) {
631 kfree_skb(buff);
632 return -EADDRNOTAVAIL;
635 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
636 sk->sport, usin->sin_port);
638 tp->ext_header_len = 0;
639 if (sk->opt)
640 tp->ext_header_len = sk->opt->optlen;
642 /* Reset mss clamp */
643 tp->mss_clamp = ~0;
645 if (!ip_dont_fragment(sk, &rt->u.dst) &&
646 rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
647 /* Clamp mss at the maximum of 536 and user_mss.
648 Probably the user wants to override the tiny segment size
649 in the gatewayed case.
651 tp->mss_clamp = max(tp->user_mss, 536);
654 tcp_connect(sk, buff, rt->u.dst.pmtu);
655 return 0;
658 static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
660 struct tcp_opt *tp;
661 int retval = -EINVAL;
663 /* Do sanity checking for sendmsg/sendto/send. */
664 if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
665 goto out;
666 if (msg->msg_name) {
667 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
669 if (msg->msg_namelen < sizeof(*addr))
670 goto out;
671 if (addr->sin_family && addr->sin_family != AF_INET)
672 goto out;
673 retval = -ENOTCONN;
674 if(sk->state == TCP_CLOSE)
675 goto out;
676 retval = -EISCONN;
677 if (addr->sin_port != sk->dport)
678 goto out;
679 if (addr->sin_addr.s_addr != sk->daddr)
680 goto out;
683 lock_sock(sk);
684 retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov,
685 msg->msg_flags);
686 /* Push out partial tail frames if needed. */
687 tp = &(sk->tp_pinfo.af_tcp);
688 if(tp->send_head && tcp_snd_test(sk, tp->send_head))
689 tcp_write_xmit(sk);
690 release_sock(sk);
692 out:
693 return retval;
698 * Do a linear search in the socket open_request list.
699 * This should be replaced with a global hash table.
701 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
702 struct iphdr *iph,
703 struct tcphdr *th,
704 struct open_request **prevp)
706 struct open_request *req, *prev;
707 __u16 rport = th->source;
709 /* assumption: the socket is not in use.
710 * as we checked the user count on tcp_rcv and we're
711 * running from a soft interrupt.
713 prev = (struct open_request *) (&tp->syn_wait_queue);
714 for (req = prev->dl_next; req; req = req->dl_next) {
715 if (req->af.v4_req.rmt_addr == iph->saddr &&
716 req->af.v4_req.loc_addr == iph->daddr &&
717 req->rmt_port == rport
718 #ifdef CONFIG_IP_TRANSPARENT_PROXY
719 && req->lcl_port == th->dest
720 #endif
722 *prevp = prev;
723 return req;
725 prev = req;
727 return NULL;
732 * This routine does path mtu discovery as defined in RFC1191.
734 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
736 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
738 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
739 * sent out by Linux are always <576 bytes, so they should go through
740 * unfragmented).
742 if (sk->state == TCP_LISTEN)
743 return;
745 /* We don't check in the dst entry whether pmtu discovery is forbidden
746 * on this route. We just assume that no packet-too-big packets
747 * are sent back when pmtu discovery is not active.
748 * There is a small race when the user changes this flag in the
749 * route, but I think that's acceptable.
751 if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) {
752 if (tp->pmtu_cookie > sk->dst_cache->pmtu &&
753 !atomic_read(&sk->sock_readers)) {
754 lock_sock(sk);
755 tcp_sync_mss(sk, sk->dst_cache->pmtu);
757 /* Resend the TCP packet because it's
758 * clear that the old packet has been
759 * dropped. This is the new "fast" path mtu
760 * discovery.
762 tcp_simple_retransmit(sk);
763 release_sock(sk);
764 } /* else let the usual retransmit timer handle it */
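/* (tcp_sync_mss() rederives the MSS from the shrunken path MTU,
 * essentially pmtu minus the IP and TCP header sizes before option
 * overhead, e.g. a 1500 byte path MTU gives an MSS near 1460; the mss
 * computation in tcp_v4_send_synack() below uses the same arithmetic.)
 */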
769 * This routine is called by the ICMP module when it gets some
770 * sort of error condition. If err < 0 then the socket should
771 * be closed and the error returned to the user. If err > 0
772 * it's just the icmp type << 8 | icmp code. After adjustment
773 * header points to the first 8 bytes of the tcp header. We need
774 * to find the appropriate port.
776 * The locking strategy used here is very "optimistic". When
777 * someone else accesses the socket the ICMP is just dropped
778 * and for some paths there is no check at all.
779 * A more general error queue to queue errors for later handling
780 * is probably better.
782 * sk->err and sk->err_soft should be atomic_t.
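/* (Example of the err > 0 encoding: a destination-unreachable/
 * port-unreachable ICMP arrives as type ICMP_DEST_UNREACH (3), code
 * ICMP_PORT_UNREACH (3); the switch below maps the code through
 * icmp_err_convert[], which turns port-unreachable into ECONNREFUSED.)
 */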
785 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
787 struct iphdr *iph = (struct iphdr*)dp;
788 struct tcphdr *th;
789 struct tcp_opt *tp;
790 int type = skb->h.icmph->type;
791 int code = skb->h.icmph->code;
792 struct sock *sk;
793 __u32 seq;
794 int err;
796 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
797 icmp_statistics.IcmpInErrors++;
798 return;
801 th = (struct tcphdr*)(dp+(iph->ihl<<2));
803 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
804 if (sk == NULL || sk->state == TCP_TIME_WAIT) {
805 icmp_statistics.IcmpInErrors++;
806 return;
809 tp = &sk->tp_pinfo.af_tcp;
810 seq = ntohl(th->seq);
811 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
812 net_statistics.OutOfWindowIcmps++;
813 return;
816 switch (type) {
817 case ICMP_SOURCE_QUENCH:
818 #ifndef OLD_SOURCE_QUENCH /* This is deprecated */
819 tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
820 tp->snd_cwnd = tp->snd_ssthresh;
821 tp->snd_cwnd_cnt = 0;
822 tp->high_seq = tp->snd_nxt;
823 #endif
824 return;
825 case ICMP_PARAMETERPROB:
826 err = EPROTO;
827 break;
828 case ICMP_DEST_UNREACH:
829 if (code > NR_ICMP_UNREACH)
830 return;
832 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
833 do_pmtu_discovery(sk, iph);
834 return;
837 err = icmp_err_convert[code].errno;
838 break;
839 case ICMP_TIME_EXCEEDED:
840 err = EHOSTUNREACH;
841 break;
842 default:
843 return;
846 switch (sk->state) {
847 struct open_request *req, *prev;
848 case TCP_LISTEN:
849 /* Prevent race conditions with accept() -
850 * ICMP is unreliable.
852 if (atomic_read(&sk->sock_readers)) {
853 net_statistics.LockDroppedIcmps++;
854 /* If too many ICMPs get dropped on busy
855 * servers this needs to be solved differently.
857 return;
860 /* The final ACK of the handshake should be already
861 * handled in the new socket context, not here.
862 * Strictly speaking - an ICMP error for the final
863 * ACK should set the opening flag, but that is too
864 * complicated right now.
866 if (!th->syn && !th->ack)
867 return;
869 req = tcp_v4_search_req(tp, iph, th, &prev);
870 if (!req)
871 return;
872 if (seq != req->snt_isn) {
873 net_statistics.OutOfWindowIcmps++;
874 return;
876 if (req->sk) {
878 * Already in ESTABLISHED and a big socket has been created;
879 * set the error code there.
880 * The error will _not_ be reported in the accept(),
881 * but only with the next operation on the socket after
882 * accept.
884 sk = req->sk;
885 } else {
887 * Still in SYN_RECV, just remove it silently.
888 * There is no good way to pass the error to the newly
889 * created socket, and POSIX does not want network
890 * errors returned from accept().
892 tp->syn_backlog--;
893 tcp_synq_unlink(tp, req, prev);
894 req->class->destructor(req);
895 tcp_openreq_free(req);
896 return;
898 break;
899 case TCP_SYN_SENT:
900 case TCP_SYN_RECV: /* Cannot happen */
901 if (!th->syn)
902 return;
903 tcp_statistics.TcpAttemptFails++;
904 sk->err = err;
905 sk->zapped = 1;
906 mb();
907 sk->error_report(sk);
908 return;
911 /* If we've already connected we will keep trying
912 * until we time out, or the user gives up.
914 * RFC 1122 4.2.3.9 allows us to consider as hard errors
915 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
916 * but it is obsoleted by pmtu discovery).
918 * Note that in the modern internet, where routing is unreliable
919 * and broken firewalls sit in every dark corner sending random
920 * errors ordered by their masters, even these two messages finally lose
921 * their original sense (even Linux sends invalid PORT_UNREACHs).
923 * Now we are in compliance with RFCs.
924 * --ANK (980905)
927 if (sk->ip_recverr) {
928 /* This code isn't serialized with the socket code */
929 /* ANK (980927) ... which is harmless now,
930 sk->err's may be safely lost.
932 sk->err = err;
933 mb();
934 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
935 } else { /* Only an error on timeout */
936 sk->err_soft = err;
937 mb();
941 /* This routine computes an IPv4 TCP checksum. */
942 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
943 struct sk_buff *skb)
945 th->check = 0;
946 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
947 csum_partial((char *)th, th->doff<<2, skb->csum));
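/* (tcp_v4_check() folds in the standard TCP pseudo-header, that is the
 * source address, destination address, protocol and TCP length, on top of
 * the partial sum over the TCP header and the payload checksum already
 * accumulated in skb->csum.)
 */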
951 * This routine will send an RST to the other tcp.
953 * Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
954 * for the reset?
955 * Answer: if a packet caused an RST, it is not for a socket
956 * existing in our system; if it is matched to a socket,
957 * it is just a duplicate segment or a bug in the other side's TCP.
958 * So we build the reply based only on the parameters
959 * that arrived with the segment.
960 * Exception: precedence violation. We do not implement it in any case.
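/* (The sequence numbers chosen below follow the RFC 793 reset rules: if
 * the offending segment carried an ACK, the RST uses that ack_seq as its
 * own sequence number; otherwise the RST carries an ACK of the segment's
 * sequence number, plus one if SYN was set, since a SYN occupies one unit
 * of sequence space.)
 */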
963 static void tcp_v4_send_reset(struct sk_buff *skb)
965 struct tcphdr *th = skb->h.th;
966 struct tcphdr rth;
967 struct ip_reply_arg arg;
969 /* Never send a reset in response to a reset. */
970 if (th->rst)
971 return;
973 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) {
974 #ifdef CONFIG_IP_TRANSPARENT_PROXY
975 if (((struct rtable*)skb->dst)->rt_type == RTN_UNICAST)
976 icmp_send(skb, ICMP_DEST_UNREACH,
977 ICMP_PORT_UNREACH, 0);
978 #endif
979 return;
982 /* Swap the send and the receive. */
983 memset(&rth, 0, sizeof(struct tcphdr));
984 rth.dest = th->source;
985 rth.source = th->dest;
986 rth.doff = sizeof(struct tcphdr)/4;
987 rth.rst = 1;
989 if (th->ack) {
990 rth.seq = th->ack_seq;
991 } else {
992 rth.ack = 1;
993 rth.ack_seq = th->syn ? htonl(ntohl(th->seq)+1) : th->seq;
996 memset(&arg, 0, sizeof arg);
997 arg.iov[0].iov_base = (unsigned char *)&rth;
998 arg.iov[0].iov_len = sizeof rth;
999 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1000 skb->nh.iph->saddr, /*XXX*/
1001 sizeof(struct tcphdr),
1002 IPPROTO_TCP,
1003 0);
1004 arg.n_iov = 1;
1005 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1007 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1009 tcp_statistics.TcpOutSegs++;
1010 tcp_statistics.TcpOutRsts++;
1013 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1016 It seems I never wrote anything more stupid.
1017 I hope the Gods will forgive me, but I cannot forgive myself 8)
1018 --ANK (981001)
1021 static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb)
1023 struct iphdr *iph = skb->nh.iph;
1024 struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1025 struct sock *sk;
1026 int i;
1028 for (i=0; i<TCP_LHTABLE_SIZE; i++) {
1029 for(sk = tcp_listening_hash[i]; sk; sk = sk->next) {
1030 struct open_request *dummy;
1031 if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, iph,
1032 th, &dummy) &&
1033 (!sk->bound_dev_if ||
1034 sk->bound_dev_if == skb->dev->ifindex))
1035 return sk;
1038 return NULL;
1042 * Check whether a received TCP packet might be for one of our
1043 * connections.
1046 int tcp_chkaddr(struct sk_buff *skb)
1048 struct iphdr *iph = skb->nh.iph;
1049 struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1050 struct sock *sk;
1052 sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr,
1053 th->dest, skb->dev->ifindex);
1055 if (!sk)
1056 return tcp_v4_search_proxy_openreq(skb) != NULL;
1058 if (sk->state == TCP_LISTEN) {
1059 struct open_request *dummy;
1060 if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, skb->nh.iph,
1061 th, &dummy) &&
1062 (!sk->bound_dev_if ||
1063 sk->bound_dev_if == skb->dev->ifindex))
1064 return 1;
1067 /* 0 means accept all LOCAL addresses here, not all the world... */
1069 if (sk->rcv_saddr == 0)
1070 return 0;
1072 return 1;
1074 #endif
1077 * Send a SYN-ACK after having received a SYN.
1078 * This still operates on an open_request only, not on a big
1079 * socket.
1081 static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
1083 struct rtable *rt;
1084 struct ip_options *opt;
1085 struct sk_buff * skb;
1086 int mss;
1088 /* First, grab a route. */
1089 opt = req->af.v4_req.opt;
1090 if(ip_route_output(&rt, ((opt && opt->srr) ?
1091 opt->faddr :
1092 req->af.v4_req.rmt_addr),
1093 req->af.v4_req.loc_addr,
1094 RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
1095 sk->bound_dev_if)) {
1096 ip_statistics.IpOutNoRoutes++;
1097 return;
1099 if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1100 ip_rt_put(rt);
1101 ip_statistics.IpOutNoRoutes++;
1102 return;
1105 mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
1107 skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
1108 if (skb) {
1109 struct tcphdr *th = skb->h.th;
1111 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1112 th->source = req->lcl_port; /* LVE */
1113 #endif
1115 th->check = tcp_v4_check(th, skb->len,
1116 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1117 csum_partial((char *)th, skb->len, skb->csum));
1119 ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1120 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1122 ip_rt_put(rt);
1126 * IPv4 open_request destructor.
1128 static void tcp_v4_or_free(struct open_request *req)
1130 if(!req->sk && req->af.v4_req.opt)
1131 kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
1134 static inline void syn_flood_warning(struct sk_buff *skb)
1136 static unsigned long warntime;
1138 if (jiffies - warntime > HZ*60) {
1139 warntime = jiffies;
1140 printk(KERN_INFO
1141 "possible SYN flooding on port %d. Sending cookies.\n",
1142 ntohs(skb->h.th->dest));
1147 * Save and compile IPv4 options into the open_request if needed.
1149 static inline struct ip_options *
1150 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1152 struct ip_options *opt = &(IPCB(skb)->opt);
1153 struct ip_options *dopt = NULL;
1155 if (opt && opt->optlen) {
1156 int opt_size = optlength(opt);
1157 dopt = kmalloc(opt_size, GFP_ATOMIC);
1158 if (dopt) {
1159 if (ip_options_echo(dopt, skb)) {
1160 kfree_s(dopt, opt_size);
1161 dopt = NULL;
1165 return dopt;
1169 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1170 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1171 * It would be better to replace it with a global counter for all sockets
1172 * but then some measure against one socket starving all other sockets
1173 * would be needed.
1175 int sysctl_max_syn_backlog = 128;
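/* (This is the value behind the tcp_max_syn_backlog sysctl; in stock 2.2
 * kernels it should appear as /proc/sys/net/ipv4/tcp_max_syn_backlog, an
 * assumption not verified against this particular tree. When the backlog
 * is exceeded and syncookies are compiled in and enabled,
 * tcp_v4_conn_request() below falls back to cookies instead of dropping
 * the SYN.)
 */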
1177 struct or_calltable or_ipv4 = {
1178 tcp_v4_send_synack,
1179 tcp_v4_or_free,
1180 tcp_v4_send_reset
1183 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1184 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
1186 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
1188 struct tcp_opt tp;
1189 struct open_request *req;
1190 struct tcphdr *th = skb->h.th;
1191 __u32 saddr = skb->nh.iph->saddr;
1192 __u32 daddr = skb->nh.iph->daddr;
1193 #ifdef CONFIG_SYN_COOKIES
1194 int want_cookie = 0;
1195 #else
1196 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1197 #endif
1199 /* If the socket is dead, don't accept the connection. */
1200 if (sk->dead)
1201 goto dead;
1203 /* Never answer SYNs sent to broadcast or multicast */
1204 if (((struct rtable *)skb->dst)->rt_flags &
1205 (RTCF_BROADCAST|RTCF_MULTICAST))
1206 goto drop;
1208 /* XXX: Check against a global syn pool counter. */
1209 if (BACKLOG(sk) > BACKLOGMAX(sk)) {
1210 #ifdef CONFIG_SYN_COOKIES
1211 if (sysctl_tcp_syncookies) {
1212 syn_flood_warning(skb);
1213 want_cookie = 1;
1214 } else
1215 #endif
1216 goto drop;
1217 } else {
1218 if (isn == 0)
1219 isn = tcp_v4_init_sequence(sk, skb);
1220 BACKLOG(sk)++;
1223 req = tcp_openreq_alloc();
1224 if (req == NULL) {
1225 goto dropbacklog;
1228 req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
1230 req->rcv_isn = TCP_SKB_CB(skb)->seq;
1231 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1233 tp.mss_clamp = 65535;
1234 tcp_parse_options(NULL, th, &tp, want_cookie);
1235 if (tp.mss_clamp == 65535)
1236 tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
1238 if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
1239 tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
1240 req->mss = tp.mss_clamp;
1242 if (tp.saw_tstamp)
1243 req->ts_recent = tp.rcv_tsval;
1244 req->tstamp_ok = tp.tstamp_ok;
1245 req->sack_ok = tp.sack_ok;
1246 req->snd_wscale = tp.snd_wscale;
1247 req->wscale_ok = tp.wscale_ok;
1248 req->rmt_port = th->source;
1249 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1250 req->lcl_port = th->dest ; /* LVE */
1251 #endif
1252 req->af.v4_req.loc_addr = daddr;
1253 req->af.v4_req.rmt_addr = saddr;
1255 /* Note that we ignore the isn passed from the TIME_WAIT
1256 * state here. That's the price we pay for cookies.
1258 if (want_cookie)
1259 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1261 req->snt_isn = isn;
1263 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1265 req->class = &or_ipv4;
1266 req->retrans = 0;
1267 req->sk = NULL;
1269 tcp_v4_send_synack(sk, req);
1271 if (want_cookie) {
1272 if (req->af.v4_req.opt)
1273 kfree(req->af.v4_req.opt);
1274 tcp_v4_or_free(req);
1275 tcp_openreq_free(req);
1276 } else {
1277 req->expires = jiffies + TCP_TIMEOUT_INIT;
1278 tcp_inc_slow_timer(TCP_SLT_SYNACK);
1279 tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
1282 return 0;
1284 dead:
1285 SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk);
1286 tcp_statistics.TcpAttemptFails++;
1287 return -ENOTCONN; /* send reset */
1289 dropbacklog:
1290 if (!want_cookie)
1291 BACKLOG(sk)--;
1292 drop:
1293 tcp_statistics.TcpAttemptFails++;
1294 return 0;
1297 /* This is not only more efficient than what we used to do, it eliminates
1298 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
1300 * This function wants to be moved to a common for IPv[46] file. --ANK
1302 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
1304 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
1306 if(newsk != NULL) {
1307 struct tcp_opt *newtp;
1309 memcpy(newsk, sk, sizeof(*newsk));
1310 newsk->sklist_next = NULL;
1311 newsk->state = TCP_SYN_RECV;
1313 /* Clone the TCP header template */
1314 newsk->dport = req->rmt_port;
1316 atomic_set(&newsk->sock_readers, 0);
1317 atomic_set(&newsk->rmem_alloc, 0);
1318 skb_queue_head_init(&newsk->receive_queue);
1319 atomic_set(&newsk->wmem_alloc, 0);
1320 skb_queue_head_init(&newsk->write_queue);
1321 atomic_set(&newsk->omem_alloc, 0);
1323 newsk->done = 0;
1324 newsk->proc = 0;
1325 newsk->pair = NULL;
1326 skb_queue_head_init(&newsk->back_log);
1327 skb_queue_head_init(&newsk->error_queue);
1329 /* Now setup tcp_opt */
1330 newtp = &(newsk->tp_pinfo.af_tcp);
1331 newtp->pred_flags = 0;
1332 newtp->rcv_nxt = req->rcv_isn + 1;
1333 newtp->snd_nxt = req->snt_isn + 1;
1334 newtp->snd_una = req->snt_isn + 1;
1335 newtp->srtt = 0;
1336 newtp->ato = 0;
1337 newtp->snd_wl1 = req->rcv_isn;
1338 newtp->snd_wl2 = req->snt_isn;
1340 /* RFC1323: The window in SYN & SYN/ACK segments
1341 * is never scaled.
1343 newtp->snd_wnd = ntohs(skb->h.th->window);
1345 newtp->max_window = newtp->snd_wnd;
1346 newtp->pending = 0;
1347 newtp->retransmits = 0;
1348 newtp->last_ack_sent = req->rcv_isn + 1;
1349 newtp->backoff = 0;
1350 newtp->mdev = TCP_TIMEOUT_INIT;
1351 newtp->snd_cwnd = 1;
1352 newtp->rto = TCP_TIMEOUT_INIT;
1353 newtp->packets_out = 0;
1354 newtp->fackets_out = 0;
1355 newtp->retrans_out = 0;
1356 newtp->high_seq = 0;
1357 newtp->snd_ssthresh = 0x7fffffff;
1358 newtp->snd_cwnd_cnt = 0;
1359 newtp->dup_acks = 0;
1360 newtp->delayed_acks = 0;
1361 init_timer(&newtp->retransmit_timer);
1362 newtp->retransmit_timer.function = &tcp_retransmit_timer;
1363 newtp->retransmit_timer.data = (unsigned long) newsk;
1364 init_timer(&newtp->delack_timer);
1365 newtp->delack_timer.function = &tcp_delack_timer;
1366 newtp->delack_timer.data = (unsigned long) newsk;
1367 skb_queue_head_init(&newtp->out_of_order_queue);
1368 newtp->send_head = newtp->retrans_head = NULL;
1369 newtp->rcv_wup = req->rcv_isn + 1;
1370 newtp->write_seq = req->snt_isn + 1;
1371 newtp->copied_seq = req->rcv_isn + 1;
1373 newtp->saw_tstamp = 0;
1374 newtp->mss_clamp = req->mss;
1376 init_timer(&newtp->probe_timer);
1377 newtp->probe_timer.function = &tcp_probe_timer;
1378 newtp->probe_timer.data = (unsigned long) newsk;
1379 newtp->probes_out = 0;
1380 newtp->syn_seq = req->rcv_isn;
1381 newtp->fin_seq = req->rcv_isn;
1382 newtp->urg_data = 0;
1383 tcp_synq_init(newtp);
1384 newtp->syn_backlog = 0;
1385 if (skb->len >= 536)
1386 newtp->last_seg_size = skb->len;
1388 /* Back to base struct sock members. */
1389 newsk->err = 0;
1390 newsk->ack_backlog = 0;
1391 newsk->max_ack_backlog = SOMAXCONN;
1392 newsk->priority = 0;
1394 /* IP layer stuff */
1395 newsk->timeout = 0;
1396 init_timer(&newsk->timer);
1397 newsk->timer.function = &net_timer;
1398 newsk->timer.data = (unsigned long) newsk;
1399 newsk->socket = NULL;
1401 newtp->tstamp_ok = req->tstamp_ok;
1402 if((newtp->sack_ok = req->sack_ok) != 0)
1403 newtp->num_sacks = 0;
1404 newtp->window_clamp = req->window_clamp;
1405 newtp->rcv_wnd = req->rcv_wnd;
1406 newtp->wscale_ok = req->wscale_ok;
1407 if (newtp->wscale_ok) {
1408 newtp->snd_wscale = req->snd_wscale;
1409 newtp->rcv_wscale = req->rcv_wscale;
1410 } else {
1411 newtp->snd_wscale = newtp->rcv_wscale = 0;
1412 newtp->window_clamp = min(newtp->window_clamp,65535);
1414 if (newtp->tstamp_ok) {
1415 newtp->ts_recent = req->ts_recent;
1416 newtp->ts_recent_stamp = jiffies;
1417 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
1418 } else {
1419 newtp->tcp_header_len = sizeof(struct tcphdr);
1422 return newsk;
1426 * The three way handshake has completed - we got a valid ACK -
1427 * now create the new socket.
1429 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1430 struct open_request *req,
1431 struct dst_entry *dst)
1433 struct ip_options *opt = req->af.v4_req.opt;
1434 struct tcp_opt *newtp;
1435 struct sock *newsk;
1437 if (sk->ack_backlog > sk->max_ack_backlog)
1438 goto exit; /* head drop */
1439 if (dst == NULL) {
1440 struct rtable *rt;
1442 if (ip_route_output(&rt,
1443 opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
1444 req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0))
1445 return NULL;
1446 dst = &rt->u.dst;
1448 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1449 /* The new socket created for transparent proxy may fall
1450 * into a nonexistent bind bucket because sk->num != newsk->num.
1451 * Ensure the existence of the bucket now. Placing the check
1452 * later would require destroying the just-created newsk on failure.
1453 * 1998/04/22 Andrey V. Savochkin <saw@msu.ru>
1455 if (tcp_bucket_check(ntohs(skb->h.th->dest)))
1456 goto exit;
1457 #endif
1459 newsk = tcp_create_openreq_child(sk, req, skb);
1460 if (!newsk)
1461 goto exit;
1463 sk->tp_pinfo.af_tcp.syn_backlog--;
1464 sk->ack_backlog++;
1466 newsk->dst_cache = dst;
1468 newtp = &(newsk->tp_pinfo.af_tcp);
1469 newsk->daddr = req->af.v4_req.rmt_addr;
1470 newsk->saddr = req->af.v4_req.loc_addr;
1471 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1472 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1473 newsk->num = ntohs(skb->h.th->dest);
1474 newsk->sport = req->lcl_port;
1475 #endif
1476 newsk->opt = req->af.v4_req.opt;
1477 newtp->ext_header_len = 0;
1478 if (newsk->opt)
1479 newtp->ext_header_len = newsk->opt->optlen;
1481 tcp_sync_mss(newsk, dst->pmtu);
1482 newtp->rcv_mss = newtp->mss_clamp;
1484 /* It would be better to use newtp->mss_clamp here */
1485 if (newsk->rcvbuf < (3 * newtp->pmtu_cookie))
1486 newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max);
1487 if (newsk->sndbuf < (3 * newtp->pmtu_cookie))
1488 newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max);
1490 tcp_v4_hash(newsk);
1491 add_to_prot_sklist(newsk);
1492 sk->data_ready(sk, 0); /* Deliver SIGIO */
1494 return newsk;
1496 exit:
1497 dst_release(dst);
1498 return NULL;
1501 static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
1503 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1504 struct open_request *req, *prev;
1506 req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev);
1507 if (!req)
1508 return;
1509 /* Sequence number check required by RFC793 */
1510 if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
1511 after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
1512 return;
1513 tcp_synq_unlink(tp, req, prev);
1514 (req->sk ? sk->ack_backlog : tp->syn_backlog)--;
1515 req->class->destructor(req);
1516 tcp_openreq_free(req);
1518 net_statistics.EmbryonicRsts++;
1521 /* Check for embryonic sockets (open_requests). We check packets with
1522 * only the SYN bit set against the open_request queue too: this
1523 * increases connection latency a bit, but is required to detect
1524 * retransmitted SYNs.
1526 static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1528 struct tcphdr *th = skb->h.th;
1529 u32 flg = ((u32 *)th)[3];
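	/* (((u32 *)th)[3] is the fourth 32-bit word of the TCP header: data
	 * offset, flags and window. The flag byte sits in bits 16-23 of the
	 * host-order constants, so htonl(0x00040000) tests RST and
	 * htonl(0x00120000) tests SYN|ACK below.)
	 */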
1531 /* Check for RST */
1532 if (flg & __constant_htonl(0x00040000)) {
1533 tcp_v4_rst_req(sk, skb);
1534 return NULL;
1537 /* Check for SYN|ACK */
1538 if (flg & __constant_htonl(0x00120000)) {
1539 struct open_request *req, *dummy;
1540 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1542 /* Find possible connection requests. */
1543 req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy);
1544 if (req) {
1545 sk = tcp_check_req(sk, skb, req);
1547 #ifdef CONFIG_SYN_COOKIES
1548 else {
1549 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1551 #endif
1553 return sk;
1556 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1558 #ifdef CONFIG_FILTER
1559 if (sk->filter)
1561 if (sk_filter(skb, sk->filter_data, sk->filter))
1562 goto discard;
1564 #endif /* CONFIG_FILTER */
1567 * socket locking is here for SMP purposes as backlog rcv
1568 * is currently called with bh processing disabled.
1570 lock_sock(sk);
1573 * This doesn't check if the socket has enough room for the packet.
1574 * Either process the packet _without_ queueing it and then free it,
1575 * or do the check later.
1577 skb_set_owner_r(skb, sk);
1579 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1580 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1581 goto reset;
1582 release_sock(sk);
1583 return 0;
1587 if (sk->state == TCP_LISTEN) {
1588 struct sock *nsk;
1590 nsk = tcp_v4_hnd_req(sk, skb);
1591 if (!nsk)
1592 goto discard;
1593 lock_sock(nsk);
1594 release_sock(sk);
1595 sk = nsk;
1598 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1599 goto reset;
1600 release_sock(sk);
1601 return 0;
1603 reset:
1604 tcp_v4_send_reset(skb);
1605 discard:
1606 kfree_skb(skb);
1607 /* Be careful here. If this function gets more complicated and
1608 * gcc suffers from register pressure on the x86, sk (in %ebx)
1609 * might be destroyed here. This current version compiles correctly,
1610 * but you have been warned.
1612 release_sock(sk);
1613 return 0;
1617 * From tcp_input.c
1620 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1622 struct tcphdr *th;
1623 struct sock *sk;
1625 if (skb->pkt_type!=PACKET_HOST)
1626 goto discard_it;
1628 th = skb->h.th;
1630 /* Pull up the IP header. */
1631 __skb_pull(skb, skb->h.raw - skb->data);
1633 /* Count it even if it's bad */
1634 tcp_statistics.TcpInSegs++;
1636 if (len < sizeof(struct tcphdr))
1637 goto bad_packet;
1639 /* Try to use the device checksum if provided. */
1640 switch (skb->ip_summed) {
1641 case CHECKSUM_NONE:
1642 skb->csum = csum_partial((char *)th, len, 0);
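		/* (The missing break is deliberate: after computing a software
		 * checksum, the CHECKSUM_NONE case falls through to the same
		 * verification as CHECKSUM_HW.)
		 */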
1643 case CHECKSUM_HW:
1644 if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
1645 NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
1646 "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
1647 "len=%d/%d/%d\n",
1648 NIPQUAD(skb->nh.iph->saddr),
1649 ntohs(th->source),
1650 NIPQUAD(skb->nh.iph->daddr),
1651 ntohs(th->dest),
1652 len, skb->len,
1653 ntohs(skb->nh.iph->tot_len)));
1654 bad_packet:
1655 tcp_statistics.TcpInErrs++;
1656 goto discard_it;
1658 default:
1659 /* CHECKSUM_UNNECESSARY */
1662 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1663 if (IPCB(skb)->redirport)
1664 sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source,
1665 skb->nh.iph->daddr, skb->dev,
1666 IPCB(skb)->redirport, skb->dev->ifindex);
1667 else {
1668 #endif
1669 sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source,
1670 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1671 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1672 if (!sk)
1673 sk = tcp_v4_search_proxy_openreq(skb);
1675 #endif
1676 if (!sk)
1677 goto no_tcp_socket;
1678 if(!ipsec_sk_policy(sk,skb))
1679 goto discard_it;
1681 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1682 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1683 len - th->doff*4);
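	/* (end_seq covers the payload bytes, len minus the header length
	 * th->doff*4, plus one unit of sequence space each for SYN and FIN,
	 * both of which consume a sequence number.)
	 */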
1684 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1686 skb->used = 0;
1688 if (sk->state == TCP_TIME_WAIT)
1689 goto do_time_wait;
1690 if (!atomic_read(&sk->sock_readers))
1691 return tcp_v4_do_rcv(sk, skb);
1693 __skb_queue_tail(&sk->back_log, skb);
1694 return 0;
1696 no_tcp_socket:
1697 tcp_v4_send_reset(skb);
1699 discard_it:
1700 /* Discard frame. */
1701 kfree_skb(skb);
1702 return 0;
1704 do_time_wait:
1705 if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1706 skb, th, skb->len))
1707 goto no_tcp_socket;
1708 goto discard_it;
1711 int tcp_v4_rebuild_header(struct sock *sk)
1713 struct rtable *rt = (struct rtable *)sk->dst_cache;
1714 __u32 new_saddr;
1715 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
1717 if(rt == NULL)
1718 return 0;
1720 /* Force route checking if want_rewrite.
1721 * The idea is good, the implementation is disgusting.
1722 * Well, if I bound this socket, you cannot randomly overwrite
1723 * its source address. --ANK
1725 if (want_rewrite) {
1726 int tmp;
1727 struct rtable *new_rt;
1728 __u32 old_saddr = rt->rt_src;
1730 /* Query new route using another rt buffer */
1731 tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
1732 RT_TOS(sk->ip_tos)|sk->localroute,
1733 sk->bound_dev_if);
1735 /* Only useful if different source addrs */
1736 if (tmp == 0) {
1738 * Only useful if different source addrs
1740 if (new_rt->rt_src != old_saddr ) {
1741 dst_release(sk->dst_cache);
1742 sk->dst_cache = &new_rt->u.dst;
1743 rt = new_rt;
1744 goto do_rewrite;
1746 dst_release(&new_rt->u.dst);
1749 if (rt->u.dst.obsolete) {
1750 int err;
1751 err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
1752 if (err) {
1753 sk->err_soft=-err;
1754 sk->error_report(sk);
1755 return -1;
1757 dst_release(xchg(&sk->dst_cache, &rt->u.dst));
1760 return 0;
1762 do_rewrite:
1763 new_saddr = rt->rt_src;
1765 /* Ouch, this should not happen. */
1766 if (!sk->saddr || !sk->rcv_saddr) {
1767 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
1768 "saddr=%08lX rcv_saddr=%08lX\n",
1769 ntohl(sk->saddr),
1770 ntohl(sk->rcv_saddr));
1771 return 0;
1774 if (new_saddr != sk->saddr) {
1775 if (sysctl_ip_dynaddr > 1) {
1776 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1777 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1778 NIPQUAD(sk->saddr),
1779 NIPQUAD(new_saddr));
1782 sk->saddr = new_saddr;
1783 sk->rcv_saddr = new_saddr;
1784 tcp_v4_rehash(sk);
1787 return 0;
1790 static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
1792 return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1793 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1796 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1798 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1800 sin->sin_family = AF_INET;
1801 sin->sin_addr.s_addr = sk->daddr;
1802 sin->sin_port = sk->dport;
1805 struct tcp_func ipv4_specific = {
1806 ip_queue_xmit,
1807 tcp_v4_send_check,
1808 tcp_v4_rebuild_header,
1809 tcp_v4_conn_request,
1810 tcp_v4_syn_recv_sock,
1811 tcp_v4_get_sock,
1812 sizeof(struct iphdr),
1814 ip_setsockopt,
1815 ip_getsockopt,
1816 v4_addr2sockaddr,
1817 sizeof(struct sockaddr_in)
1820 /* NOTE: A lot of things are set to zero explicitly by the call to
1821 * sk_alloc(), so they need not be done here.
1823 static int tcp_v4_init_sock(struct sock *sk)
1825 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1827 skb_queue_head_init(&tp->out_of_order_queue);
1828 tcp_init_xmit_timers(sk);
1830 tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
1831 tp->mdev = TCP_TIMEOUT_INIT;
1832 tp->mss_clamp = ~0;
1834 /* See draft-stevens-tcpca-spec-01 for discussion of the
1835 * initialization of these values.
1837 tp->snd_cwnd = 1;
1838 tp->snd_cwnd_cnt = 0;
1839 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1841 sk->state = TCP_CLOSE;
1842 sk->max_ack_backlog = SOMAXCONN;
1843 tp->rcv_mss = 536;
1845 sk->write_space = tcp_write_space;
1847 /* Init SYN queue. */
1848 tcp_synq_init(tp);
1850 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1852 return 0;
1855 static int tcp_v4_destroy_sock(struct sock *sk)
1857 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1858 struct sk_buff *skb;
1860 tcp_clear_xmit_timers(sk);
1862 if (sk->keepopen)
1863 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1865 /* Clean up the write buffer. */
1866 while((skb = __skb_dequeue(&sk->write_queue)) != NULL)
1867 kfree_skb(skb);
1869 /* Clean up our, hopefully empty, out_of_order_queue. */
1870 while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL)
1871 kfree_skb(skb);
1873 /* Clean up a locked TCP bind bucket; this only happens if a
1874 * port is allocated for a socket but it never fully connects,
1875 * in which case we will find num to be non-zero and daddr to
1876 * be zero.
1878 if(sk->daddr == 0 && sk->num != 0)
1879 tcp_bucket_unlock(sk);
1881 return 0;
1884 struct proto tcp_prot = {
1885 (struct sock *)&tcp_prot, /* sklist_next */
1886 (struct sock *)&tcp_prot, /* sklist_prev */
1887 tcp_close, /* close */
1888 tcp_v4_connect, /* connect */
1889 tcp_accept, /* accept */
1890 NULL, /* retransmit */
1891 tcp_write_wakeup, /* write_wakeup */
1892 tcp_read_wakeup, /* read_wakeup */
1893 tcp_poll, /* poll */
1894 tcp_ioctl, /* ioctl */
1895 tcp_v4_init_sock, /* init */
1896 tcp_v4_destroy_sock, /* destroy */
1897 tcp_shutdown, /* shutdown */
1898 tcp_setsockopt, /* setsockopt */
1899 tcp_getsockopt, /* getsockopt */
1900 tcp_v4_sendmsg, /* sendmsg */
1901 tcp_recvmsg, /* recvmsg */
1902 NULL, /* bind */
1903 tcp_v4_do_rcv, /* backlog_rcv */
1904 tcp_v4_hash, /* hash */
1905 tcp_v4_unhash, /* unhash */
1906 tcp_v4_rehash, /* rehash */
1907 tcp_good_socknum, /* good_socknum */
1908 tcp_v4_verify_bind, /* verify_bind */
1909 128, /* max_header */
1910 0, /* retransmits */
1911 "TCP", /* name */
1912 0, /* inuse */
1913 0 /* highestinuse */
1918 __initfunc(void tcp_v4_init(struct net_proto_family *ops))
1920 int err;
1922 tcp_inode.i_mode = S_IFSOCK;
1923 tcp_inode.i_sock = 1;
1924 tcp_inode.i_uid = 0;
1925 tcp_inode.i_gid = 0;
1927 tcp_socket->inode = &tcp_inode;
1928 tcp_socket->state = SS_UNCONNECTED;
1929 tcp_socket->type=SOCK_RAW;
1931 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
1932 panic("Failed to create the TCP control socket.\n");
1933 tcp_socket->sk->allocation=GFP_ATOMIC;
1934 tcp_socket->sk->num = 256; /* Don't receive any data */
1935 tcp_socket->sk->ip_ttl = MAXTTL;