net/ipv4/tcp_ipv4.c  [davej-history.git, pre-2.3.4]
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.176 1999/05/12 11:24:46 davem Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/init.h>
55 #include <linux/ipsec.h>
57 #include <net/icmp.h>
58 #include <net/tcp.h>
59 #include <net/ipv6.h>
61 #include <asm/segment.h>
63 #include <linux/inet.h>
64 #include <linux/stddef.h>
66 extern int sysctl_tcp_timestamps;
67 extern int sysctl_tcp_window_scaling;
68 extern int sysctl_tcp_sack;
69 extern int sysctl_tcp_syncookies;
70 extern int sysctl_ip_dynaddr;
71 extern __u32 sysctl_wmem_max;
72 extern __u32 sysctl_rmem_max;
74 /* Check TCP sequence numbers in ICMP packets. */
75 #define ICMP_MIN_LENGTH 8
77 /* Socket used for sending RSTs */
78 struct inode tcp_inode;
79 struct socket *tcp_socket=&tcp_inode.u.socket_i;
81 static void tcp_v4_send_reset(struct sk_buff *skb);
83 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
84 struct sk_buff *skb);
86 /* This is for sockets with full identity only. Sockets here will always
87 * be without wildcards and will have the following invariant:
88 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
90 * First half of the table is for sockets not in TIME_WAIT, second half
91 * is for TIME_WAIT sockets only.
93 struct sock **tcp_ehash;
94 int tcp_ehash_size;
96 /* Ok, let's try this, I give up, we do need a local binding
97 * TCP hash as well as the others for fast bind/connect.
99 struct tcp_bind_bucket **tcp_bhash;
100 int tcp_bhash_size;
102 /* All sockets in TCP_LISTEN state will be in here. This is the only table
103 * where wildcard'd TCP sockets can exist. Hash function here is just local
104 * port number.
106 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
108 /* Register cache. */
109 struct sock *tcp_regs[TCP_NUM_REGS];
112 * This array holds the first and last local port number.
113 * For high-usage systems, use sysctl to change this to
114 * 32768-61000
116 int sysctl_local_port_range[2] = { 1024, 4999 };
117 int tcp_port_rover = (1024 - 1);
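/* The range above is also exposed as a sysctl. A minimal user-space
 * sketch of switching to the 32768-61000 range suggested above, assuming
 * the usual /proc/sys/net/ipv4/ip_local_port_range file and root
 * privileges (an illustration, not part of this file):
 */
#include <stdio.h>

static int set_local_port_range(int low, int high)
{
        /* The proc file takes the two bounds as "low high". */
        FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "w");

        if (f == NULL)
                return -1;              /* not root, or proc not mounted */
        if (fprintf(f, "%d %d\n", low, high) < 0) {
                fclose(f);
                return -1;
        }
        return fclose(f);               /* e.g. set_local_port_range(32768, 61000) */
}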
119 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
120 __u32 faddr, __u16 fport)
122 return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size >> 1) - 1);
125 static __inline__ int tcp_sk_hashfn(struct sock *sk)
127 __u32 laddr = sk->rcv_saddr;
128 __u16 lport = sk->num;
129 __u32 faddr = sk->daddr;
130 __u16 fport = sk->dport;
132 return tcp_hashfn(laddr, lport, faddr, fport);
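/* A user-space sketch (an illustration, not kernel code) of how the
 * 4-tuple is folded into a bucket index: tcp_hashfn() masks with
 * (tcp_ehash_size >> 1) - 1, so live sockets land in the first half of
 * tcp_ehash, and the TIME_WAIT twin of a bucket sits at the same index
 * plus tcp_ehash_size/2, as __tcp_v4_lookup() below relies on.
 */
#include <stdio.h>

static int demo_ehashfn(unsigned int laddr, unsigned short lport,
                        unsigned int faddr, unsigned short fport,
                        int ehash_size)
{
        /* Same fold as tcp_hashfn(); ehash_size/2 is assumed to be a
         * power of two, as the kernel arranges at boot.
         */
        return ((laddr ^ lport) ^ (faddr ^ fport)) & ((ehash_size >> 1) - 1);
}

int main(void)
{
        int size = 512;         /* hypothetical table size */
        int hash = demo_ehashfn(0x7f000001, 4999, 0x7f000001, 80, size);

        printf("established bucket: %d\n", hash);
        printf("TIME_WAIT bucket:   %d\n", hash + (size >> 1));
        return 0;
}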
135 /* Invariant, sk->num is non-zero. */
136 void tcp_bucket_unlock(struct sock *sk)
138 struct tcp_bind_bucket *tb;
139 unsigned short snum = sk->num;
141 SOCKHASH_LOCK_WRITE();
142 for(tb = tcp_bhash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
143 if(tb->port == snum) {
144 if(tb->owners == NULL &&
145 (tb->flags & TCPB_FLAG_LOCKED)) {
146 tb->flags &= ~(TCPB_FLAG_LOCKED |
147 TCPB_FLAG_FASTREUSE);
148 tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
150 break;
153 SOCKHASH_UNLOCK_WRITE();
156 /* The sockhash lock must be held as a writer here. */
157 struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
159 struct tcp_bind_bucket *tb;
161 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
162 if(tb != NULL) {
163 struct tcp_bind_bucket **head =
164 &tcp_bhash[tcp_bhashfn(snum)];
165 tb->port = snum;
166 tb->flags = TCPB_FLAG_LOCKED;
167 tb->owners = NULL;
168 if((tb->next = *head) != NULL)
169 tb->next->pprev = &tb->next;
170 *head = tb;
171 tb->pprev = head;
173 return tb;
176 #ifdef CONFIG_IP_TRANSPARENT_PROXY
177 /* Ensure that the bound bucket for the port exists.
178 * Return 0 on success.
180 static __inline__ int tcp_bucket_check(unsigned short snum)
182 struct tcp_bind_bucket *tb;
183 int ret = 0;
185 SOCKHASH_LOCK_WRITE();
186 tb = tcp_bhash[tcp_bhashfn(snum)];
187 for( ; (tb && (tb->port != snum)); tb = tb->next)
189 if(tb == NULL && tcp_bucket_create(snum) == NULL)
190 ret = 1;
191 SOCKHASH_UNLOCK_WRITE();
193 return ret;
195 #endif
197 static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
199 struct tcp_bind_bucket *tb;
200 int result = 0;
202 SOCKHASH_LOCK_WRITE();
203 for(tb = tcp_bhash[tcp_bhashfn(snum)];
204 (tb && (tb->port != snum));
205 tb = tb->next)
207 if(tb && tb->owners) {
208 /* Fast path for reuse ports, see include/net/tcp.h for a very
209 * detailed description of why this works, and why it is worth
210 * the effort at all. -DaveM
212 if((tb->flags & TCPB_FLAG_FASTREUSE) &&
213 (sk->reuse != 0)) {
214 goto go_like_smoke;
215 } else {
216 struct sock *sk2;
217 int sk_reuse = sk->reuse;
219 /* We must walk the whole port owner list in this case. -DaveM */
220 for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
221 if (sk->bound_dev_if == sk2->bound_dev_if) {
222 if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
223 if(!sk2->rcv_saddr ||
224 !sk->rcv_saddr ||
225 (sk2->rcv_saddr == sk->rcv_saddr))
226 break;
230 if(sk2 != NULL)
231 result = 1;
234 if(result == 0) {
235 if(tb == NULL) {
236 if((tb = tcp_bucket_create(snum)) == NULL)
237 result = 1;
238 else if (sk->reuse && sk->state != TCP_LISTEN)
239 tb->flags |= TCPB_FLAG_FASTREUSE;
240 } else {
241 /* It could be pending garbage collection, this
242 * kills the race and prevents it from disappearing
243 * out from under us by the time we use it. -DaveM
245 if(tb->owners == NULL) {
246 if (!(tb->flags & TCPB_FLAG_LOCKED)) {
247 tb->flags = (TCPB_FLAG_LOCKED |
248 ((sk->reuse &&
249 sk->state != TCP_LISTEN) ?
250 TCPB_FLAG_FASTREUSE : 0));
251 tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
252 } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) {
253 /* Someone is in between the bind
254 * and the actual connect or listen.
255 * See if it was a legitimate reuse
256 * and we are as well, else punt.
258 if (sk->reuse == 0 ||
259 !(tb->flags & TCPB_FLAG_FASTREUSE))
260 result = 1;
261 } else
262 tb->flags &= ~TCPB_FLAG_GOODSOCKNUM;
266 go_like_smoke:
267 SOCKHASH_UNLOCK_WRITE();
268 return result;
271 unsigned short tcp_good_socknum(void)
273 struct tcp_bind_bucket *tb;
274 int low = sysctl_local_port_range[0];
275 int high = sysctl_local_port_range[1];
276 int remaining = (high - low) + 1;
277 int rover;
279 SOCKHASH_LOCK_WRITE();
280 rover = tcp_port_rover;
281 do {
282 rover += 1;
283 if((rover < low) || (rover > high))
284 rover = low;
285 tb = tcp_bhash[tcp_bhashfn(rover)];
286 for( ; tb; tb = tb->next) {
287 if(tb->port == rover)
288 goto next;
290 break;
291 next:
292 } while(--remaining > 0);
293 tcp_port_rover = rover;
294 tb = NULL;
295 if((remaining <= 0) || ((tb = tcp_bucket_create(rover)) == NULL))
296 rover = 0;
297 if (tb != NULL)
298 tb->flags |= TCPB_FLAG_GOODSOCKNUM;
299 SOCKHASH_UNLOCK_WRITE();
301 return rover;
304 static void tcp_v4_hash(struct sock *sk)
306 if (sk->state != TCP_CLOSE) {
307 struct sock **skp;
309 SOCKHASH_LOCK_WRITE();
310 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
311 if((sk->next = *skp) != NULL)
312 (*skp)->pprev = &sk->next;
313 *skp = sk;
314 sk->pprev = skp;
315 tcp_sk_bindify(sk);
316 SOCKHASH_UNLOCK_WRITE();
320 static void tcp_v4_unhash(struct sock *sk)
322 SOCKHASH_LOCK_WRITE();
323 if(sk->pprev) {
324 if(sk->next)
325 sk->next->pprev = sk->pprev;
326 *sk->pprev = sk->next;
327 sk->pprev = NULL;
328 tcp_reg_zap(sk);
329 tcp_sk_unbindify(sk);
331 SOCKHASH_UNLOCK_WRITE();
334 static void tcp_v4_rehash(struct sock *sk)
336 unsigned char state;
338 SOCKHASH_LOCK_WRITE();
339 state = sk->state;
340 if(sk->pprev != NULL) {
341 if(sk->next)
342 sk->next->pprev = sk->pprev;
343 *sk->pprev = sk->next;
344 sk->pprev = NULL;
345 tcp_reg_zap(sk);
347 if(state != TCP_CLOSE) {
348 struct sock **skp;
350 if(state == TCP_LISTEN)
351 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
352 else
353 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
355 if((sk->next = *skp) != NULL)
356 (*skp)->pprev = &sk->next;
357 *skp = sk;
358 sk->pprev = skp;
359 if(state == TCP_LISTEN)
360 tcp_sk_bindify(sk);
362 SOCKHASH_UNLOCK_WRITE();
365 /* Don't inline this cruft. There are some nice properties to
366 * exploit here. The BSD API does not allow a listening TCP
367 * to specify the remote port nor the remote address for the
368 * connection. So always assume those are both wildcarded
369 * during the search since they can never be otherwise.
371 static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
373 struct sock *sk;
374 struct sock *result = NULL;
375 int score, hiscore;
377 hiscore=0;
378 for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
379 if(sk->num == hnum) {
380 __u32 rcv_saddr = sk->rcv_saddr;
382 score = 1;
383 if(rcv_saddr) {
384 if (rcv_saddr != daddr)
385 continue;
386 score++;
388 if (sk->bound_dev_if) {
389 if (sk->bound_dev_if != dif)
390 continue;
391 score++;
393 if (score == 3)
394 return sk;
395 if (score > hiscore) {
396 hiscore = score;
397 result = sk;
401 return result;
404 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
405 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
407 * The sockhash lock must be held as a reader here.
409 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
410 u32 daddr, u16 dport, int dif)
412 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
413 __u16 hnum = ntohs(dport);
414 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
415 struct sock *sk;
416 int hash;
418 /* Check TCP register quick cache first. */
419 sk = TCP_RHASH(sport);
420 if(sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
421 goto hit;
423 /* Optimize here for direct hit, only listening connections can
424 * have wildcards anyway.
426 hash = tcp_hashfn(daddr, hnum, saddr, sport);
427 for(sk = tcp_ehash[hash]; sk; sk = sk->next) {
428 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
429 if (sk->state == TCP_ESTABLISHED)
430 TCP_RHASH(sport) = sk;
431 goto hit; /* You sunk my battleship! */
434 /* Must check for a TIME_WAIT'er before going to listener hash. */
435 for(sk = tcp_ehash[hash+(tcp_ehash_size >> 1)]; sk; sk = sk->next)
436 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
437 goto hit;
438 sk = tcp_v4_lookup_listener(daddr, hnum, dif);
439 hit:
440 return sk;
443 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
445 struct sock *sk;
447 SOCKHASH_LOCK_READ();
448 sk = __tcp_v4_lookup(saddr, sport, daddr, dport, dif);
449 SOCKHASH_UNLOCK_READ();
451 return sk;
454 #ifdef CONFIG_IP_TRANSPARENT_PROXY
455 /* Cleaned up a little and adapted to new bind bucket scheme.
456 * Oddly, this should increase performance here for
457 * transparent proxy, as tests within the inner loop have
458 * been eliminated. -DaveM
460 static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
461 unsigned short rnum, unsigned long laddr,
462 struct device *dev, unsigned short pnum,
463 int dif)
465 struct sock *s, *result = NULL;
466 int badness = -1;
467 u32 paddr = 0;
468 unsigned short hnum = ntohs(num);
469 unsigned short hpnum = ntohs(pnum);
470 int firstpass = 1;
472 if(dev && dev->ip_ptr) {
473 struct in_device *idev = dev->ip_ptr;
475 if(idev->ifa_list)
476 paddr = idev->ifa_list->ifa_local;
479 /* We must obtain the sockhash lock here, we are always
480 * in BH context.
482 SOCKHASH_LOCK_READ_BH();
484 struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)];
485 for( ; (tb && tb->port != hnum); tb = tb->next)
487 if(tb == NULL)
488 goto next;
489 s = tb->owners;
491 pass2:
492 for(; s; s = s->bind_next) {
493 int score = 0;
494 if(s->rcv_saddr) {
495 if((s->num != hpnum || s->rcv_saddr != paddr) &&
496 (s->num != hnum || s->rcv_saddr != laddr))
497 continue;
498 score++;
500 if(s->daddr) {
501 if(s->daddr != raddr)
502 continue;
503 score++;
505 if(s->dport) {
506 if(s->dport != rnum)
507 continue;
508 score++;
510 if(s->bound_dev_if) {
511 if(s->bound_dev_if != dif)
512 continue;
513 score++;
515 if(score == 4 && s->num == hnum) {
516 result = s;
517 goto gotit;
518 } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
519 result = s;
520 badness = score;
523 next:
524 if(firstpass--) {
525 struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hpnum)];
526 for( ; (tb && tb->port != hpnum); tb = tb->next)
528 if(tb) {
529 s = tb->owners;
530 goto pass2;
533 gotit:
534 SOCKHASH_UNLOCK_READ_BH();
535 return result;
537 #endif /* CONFIG_IP_TRANSPARENT_PROXY */
539 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
541 return secure_tcp_sequence_number(sk->saddr, sk->daddr,
542 skb->h.th->dest,
543 skb->h.th->source);
546 /* Check that a TCP address is unique, don't allow multiple
547 * connects to/from the same address. Actually we can optimize
548 * quite a bit, since the socket about to connect is still
549 * in TCP_CLOSE, a tcp_bind_bucket for the local port he will
550 * use will exist, with a NULL owners list. So check for that.
551 * The good_socknum and verify_bind scheme we use makes this
552 * work.
554 static int tcp_v4_unique_address(struct sock *sk)
556 struct tcp_bind_bucket *tb;
557 unsigned short snum = sk->num;
558 int retval = 1;
560 /* Freeze the hash while we snoop around. */
561 SOCKHASH_LOCK_READ();
562 tb = tcp_bhash[tcp_bhashfn(snum)];
563 for(; tb; tb = tb->next) {
564 if(tb->port == snum && tb->owners != NULL) {
565 /* Almost certainly the re-use port case, search the real hashes
566 * so it actually scales.
568 sk = __tcp_v4_lookup(sk->daddr, sk->dport,
569 sk->rcv_saddr, snum, sk->bound_dev_if);
570 SOCKHASH_UNLOCK_READ();
572 if((sk != NULL) && (sk->state != TCP_LISTEN))
573 retval = 0;
574 return retval;
577 SOCKHASH_UNLOCK_READ();
578 return retval;
581 /* This will initiate an outgoing connection. */
582 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
584 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
585 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
586 struct sk_buff *buff;
587 struct rtable *rt;
588 u32 daddr, nexthop;
589 int tmp;
591 if (sk->state != TCP_CLOSE)
592 return(-EISCONN);
594 /* Don't allow a double connect. */
595 if (sk->daddr)
596 return -EINVAL;
598 if (addr_len < sizeof(struct sockaddr_in))
599 return(-EINVAL);
601 if (usin->sin_family != AF_INET) {
602 static int complained;
603 if (usin->sin_family)
604 return(-EAFNOSUPPORT);
605 if (!complained++)
606 printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
609 nexthop = daddr = usin->sin_addr.s_addr;
610 if (sk->opt && sk->opt->srr) {
611 if (daddr == 0)
612 return -EINVAL;
613 nexthop = sk->opt->faddr;
616 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
617 RT_TOS(sk->ip_tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
618 if (tmp < 0)
619 return tmp;
621 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
622 ip_rt_put(rt);
623 return -ENETUNREACH;
626 dst_release(xchg(&sk->dst_cache, rt));
628 buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
629 0, GFP_KERNEL);
631 if (buff == NULL)
632 return -ENOBUFS;
634 /* Socket has no identity, so lock_sock() is useless. Also
635 * since state==TCP_CLOSE (checked above) the socket cannot
636 * possibly be in the hashes. TCP hash locking is only
637 * needed while checking quickly for a unique address.
638 * However, the socket does need to be (and is) locked
639 * in tcp_connect().
640 * Perhaps this addresses all of ANK's concerns. 8-) -DaveM
642 sk->dport = usin->sin_port;
643 sk->daddr = rt->rt_dst;
644 if (sk->opt && sk->opt->srr)
645 sk->daddr = daddr;
646 if (!sk->saddr)
647 sk->saddr = rt->rt_src;
648 sk->rcv_saddr = sk->saddr;
650 if (!tcp_v4_unique_address(sk)) {
651 kfree_skb(buff);
652 sk->daddr = 0;
653 return -EADDRNOTAVAIL;
656 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
657 sk->sport, usin->sin_port);
659 tp->ext_header_len = 0;
660 if (sk->opt)
661 tp->ext_header_len = sk->opt->optlen;
663 /* Reset mss clamp */
664 tp->mss_clamp = ~0;
666 if (!ip_dont_fragment(sk, &rt->u.dst) &&
667 rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
668 /* Clamp mss at the maximum of 536 and user_mss.
669 Probably the user asked to override the tiny segment size
670 used in the gatewayed case.
672 tp->mss_clamp = max(tp->user_mss, 536);
675 tcp_connect(sk, buff, rt->u.dst.pmtu);
676 return 0;
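/* A distilled sketch of the mss_clamp decision above (an illustrative
 * helper, not a kernel function): only gatewayed routes with a PMTU
 * above 576 and no DF-forced PMTU discovery are clamped, and the clamp
 * never drops below the classic 536-byte default MSS.
 */
static unsigned int demo_connect_mss_clamp(int dont_fragment, unsigned int pmtu,
                                           int gatewayed, unsigned int user_mss)
{
        if (!dont_fragment && pmtu > 576 && gatewayed)
                return user_mss > 536 ? user_mss : 536;
        return ~0U;                     /* no clamp, as tp->mss_clamp = ~0 above */
}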
679 static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
681 int retval = -EINVAL;
683 /* Do sanity checking for sendmsg/sendto/send. */
684 if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
685 goto out;
686 if (msg->msg_name) {
687 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
689 if (msg->msg_namelen < sizeof(*addr))
690 goto out;
691 if (addr->sin_family && addr->sin_family != AF_INET)
692 goto out;
693 retval = -ENOTCONN;
694 if(sk->state == TCP_CLOSE)
695 goto out;
696 retval = -EISCONN;
697 if (addr->sin_port != sk->dport)
698 goto out;
699 if (addr->sin_addr.s_addr != sk->daddr)
700 goto out;
702 retval = tcp_do_sendmsg(sk, msg);
704 out:
705 return retval;
710 * Do a linear search in the socket open_request list.
711 * This should be replaced with a global hash table.
713 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
714 struct iphdr *iph,
715 struct tcphdr *th,
716 struct open_request **prevp)
718 struct open_request *req, *prev;
719 __u16 rport = th->source;
721 /* Assumption: the socket is not in use,
722 * as we checked the user count in tcp_rcv and we're
723 * running from a soft interrupt.
725 prev = (struct open_request *) (&tp->syn_wait_queue);
726 for (req = prev->dl_next; req; req = req->dl_next) {
727 if (req->af.v4_req.rmt_addr == iph->saddr &&
728 req->af.v4_req.loc_addr == iph->daddr &&
729 req->rmt_port == rport
730 #ifdef CONFIG_IP_TRANSPARENT_PROXY
731 && req->lcl_port == th->dest
732 #endif
734 *prevp = prev;
735 return req;
737 prev = req;
739 return NULL;
744 * This routine does path mtu discovery as defined in RFC1191.
746 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
748 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
750 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
751 * sent out by Linux are always < 576 bytes, so they should go through
752 * unfragmented).
754 if (sk->state == TCP_LISTEN)
755 return;
757 bh_lock_sock(sk);
758 if(sk->lock.users != 0)
759 goto out;
761 /* We don't check in the dst entry if pmtu discovery is forbidden
762 * on this route. We just assume that no packet-too-big packets
763 * are sent back when pmtu discovery is not active.
764 * There is a small race when the user changes this flag in the
765 * route, but I think that's acceptable.
767 if (sk->dst_cache == NULL)
768 goto out;
770 ip_rt_update_pmtu(sk->dst_cache, mtu);
771 if (sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
772 tp->pmtu_cookie > sk->dst_cache->pmtu) {
773 tcp_sync_mss(sk, sk->dst_cache->pmtu);
775 /* Resend the TCP packet because it's
776 * clear that the old packet has been
777 * dropped. This is the new "fast" path mtu
778 * discovery.
780 tcp_simple_retransmit(sk);
781 } /* else let the usual retransmit timer handle it */
782 out:
783 bh_unlock_sock(sk);
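/* Roughly, tcp_sync_mss() re-derives the MSS from the new path MTU by
 * stripping the IP and TCP headers; the real routine also accounts for
 * IP options and TCP option space. A simplified user-space sketch,
 * ignoring options:
 */
#include <netinet/ip.h>         /* struct iphdr  */
#include <netinet/tcp.h>        /* struct tcphdr */

static unsigned int demo_mss_from_pmtu(unsigned int pmtu)
{
        /* e.g. demo_mss_from_pmtu(1006) == 966 after a tunnel lowered the MTU */
        return pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
}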
787 * This routine is called by the ICMP module when it gets some
788 * sort of error condition. If err < 0 then the socket should
789 * be closed and the error returned to the user. If err > 0
790 * it's just the icmp type << 8 | icmp code. After adjustment
791 * header points to the first 8 bytes of the tcp header. We need
792 * to find the appropriate port.
794 * The locking strategy used here is very "optimistic". When
795 * someone else accesses the socket the ICMP is just dropped
796 * and for some paths there is no check at all.
797 * A more general error queue to queue errors for later handling
798 * is probably better.
800 * sk->err and sk->err_soft should be atomic_t.
803 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
805 struct iphdr *iph = (struct iphdr*)dp;
806 struct tcphdr *th;
807 struct tcp_opt *tp;
808 int type = skb->h.icmph->type;
809 int code = skb->h.icmph->code;
810 #if ICMP_MIN_LENGTH < 14
811 int no_flags = 0;
812 #else
813 #define no_flags 0
814 #endif
815 struct sock *sk;
816 __u32 seq;
817 int err;
819 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
820 icmp_statistics.IcmpInErrors++;
821 return;
823 #if ICMP_MIN_LENGTH < 14
824 if (len < (iph->ihl << 2) + 14)
825 no_flags = 1;
826 #endif
828 th = (struct tcphdr*)(dp+(iph->ihl<<2));
830 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
831 if (sk == NULL || sk->state == TCP_TIME_WAIT) {
832 icmp_statistics.IcmpInErrors++;
833 return;
836 tp = &sk->tp_pinfo.af_tcp;
837 seq = ntohl(th->seq);
838 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
839 net_statistics.OutOfWindowIcmps++;
840 return;
843 switch (type) {
844 case ICMP_SOURCE_QUENCH:
845 #ifndef OLD_SOURCE_QUENCH /* This is deprecated */
846 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
847 tp->snd_cwnd = tp->snd_ssthresh;
848 tp->snd_cwnd_cnt = 0;
849 tp->high_seq = tp->snd_nxt;
850 #endif
851 return;
852 case ICMP_PARAMETERPROB:
853 err = EPROTO;
854 break;
855 case ICMP_DEST_UNREACH:
856 if (code > NR_ICMP_UNREACH)
857 return;
859 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
860 do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
861 return;
864 err = icmp_err_convert[code].errno;
865 break;
866 case ICMP_TIME_EXCEEDED:
867 err = EHOSTUNREACH;
868 break;
869 default:
870 return;
873 switch (sk->state) {
874 struct open_request *req, *prev;
875 case TCP_LISTEN:
876 /* The final ACK of the handshake should be already
877 * handled in the new socket context, not here.
878 * Strictly speaking - an ICMP error for the final
879 * ACK should set the opening flag, but that is too
880 * complicated right now.
882 if (!no_flags && !th->syn && !th->ack)
883 return;
885 /* Prevent race conditions with accept() -
886 * ICMP is unreliable.
888 bh_lock_sock(sk);
889 if (sk->lock.users != 0) {
890 net_statistics.LockDroppedIcmps++;
891 /* If too many ICMPs get dropped on busy
892 * servers this needs to be solved differently.
894 goto out_unlock;
897 req = tcp_v4_search_req(tp, iph, th, &prev);
898 if (!req)
899 goto out_unlock;
900 if (seq != req->snt_isn) {
901 net_statistics.OutOfWindowIcmps++;
902 goto out_unlock;
904 if (req->sk) {
906 * Already in ESTABLISHED and a big socket is created,
907 * set error code there.
908 * The error will _not_ be reported in the accept(),
909 * but only with the next operation on the socket after
910 * accept.
912 bh_unlock_sock(sk);
913 sk = req->sk;
914 } else {
916 * Still in SYN_RECV, just remove it silently.
917 * There is no good way to pass the error to the newly
918 * created socket, and POSIX does not want network
919 * errors returned from accept().
921 tp->syn_backlog--;
922 tcp_synq_unlink(tp, req, prev);
923 req->class->destructor(req);
924 tcp_openreq_free(req);
925 out_unlock:
926 bh_unlock_sock(sk);
927 return;
929 break;
930 case TCP_SYN_SENT:
931 case TCP_SYN_RECV: /* Cannot happen */
932 if (!no_flags && !th->syn)
933 return;
934 tcp_statistics.TcpAttemptFails++;
935 sk->err = err;
936 sk->zapped = 1;
937 mb();
938 sk->error_report(sk);
939 return;
942 /* If we've already connected we will keep trying
943 * until we time out, or the user gives up.
945 * rfc1122 4.2.3.9 allows us to treat as hard errors
946 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
947 * but it is obsoleted by pmtu discovery).
949 * Note that in the modern internet, where routing is unreliable
950 * and broken firewalls sit in every dark corner sending random
951 * errors ordered by their masters, even these two messages finally lose
952 * their original sense (even Linux sends invalid PORT_UNREACHs).
954 * Now we are in compliance with RFCs.
955 * --ANK (980905)
958 if (sk->ip_recverr) {
959 /* This code isn't serialized with the socket code */
960 /* ANK (980927) ... which is harmless now,
961 sk->err's may be safely lost.
963 sk->err = err;
964 mb();
965 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
966 } else { /* Only an error on timeout */
967 sk->err_soft = err;
968 mb();
972 /* This routine computes an IPv4 TCP checksum. */
973 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
974 struct sk_buff *skb)
976 th->check = 0;
977 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
978 csum_partial((char *)th, th->doff<<2, skb->csum));
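/* For reference, the checksum composed above covers the IPv4 pseudo-header
 * (saddr, daddr, zero byte, protocol, TCP length) followed by the TCP
 * header and payload. A self-contained user-space sketch of that
 * computation (an illustration; the kernel builds it incrementally with
 * csum_partial() and tcp_v4_check() instead):
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arpa/inet.h>

static uint32_t demo_csum_add(uint32_t sum, const void *data, size_t len)
{
        const uint8_t *p = data;

        /* Sum 16-bit big-endian words; an odd trailing byte is zero-padded. */
        while (len > 1) {
                sum += ((uint32_t)p[0] << 8) | p[1];
                p += 2;
                len -= 2;
        }
        if (len)
                sum += (uint32_t)p[0] << 8;
        return sum;
}

/* saddr/daddr in network byte order, seg = TCP header + payload with the
 * checksum field zeroed, seg_len in host order.
 */
static uint16_t demo_tcp_v4_csum(uint32_t saddr, uint32_t daddr,
                                 const void *seg, uint16_t seg_len)
{
        uint8_t pseudo[12];
        uint32_t sum;

        memcpy(pseudo, &saddr, 4);
        memcpy(pseudo + 4, &daddr, 4);
        pseudo[8]  = 0;
        pseudo[9]  = 6;                 /* IPPROTO_TCP */
        pseudo[10] = seg_len >> 8;
        pseudo[11] = seg_len & 0xff;

        sum = demo_csum_add(0, pseudo, sizeof(pseudo));
        sum = demo_csum_add(sum, seg, seg_len);
        while (sum >> 16)               /* fold carries */
                sum = (sum & 0xffff) + (sum >> 16);
        return htons((uint16_t)~sum);
}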
982 * This routine will send an RST to the other tcp.
984 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
985 * for the reset?
986 * Answer: if a packet caused the RST, it is not for a socket
987 * existing in our system; if it is matched to a socket,
988 * it is just a duplicate segment or a bug in the other side's TCP.
989 * So we build the reply based only on the parameters
990 * that arrived with the segment.
991 * Exception: precedence violation. We do not implement it in any case.
994 static void tcp_v4_send_reset(struct sk_buff *skb)
996 struct tcphdr *th = skb->h.th;
997 struct tcphdr rth;
998 struct ip_reply_arg arg;
1000 /* Never send a reset in response to a reset. */
1001 if (th->rst)
1002 return;
1004 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) {
1005 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1006 if (((struct rtable*)skb->dst)->rt_type == RTN_UNICAST)
1007 icmp_send(skb, ICMP_DEST_UNREACH,
1008 ICMP_PORT_UNREACH, 0);
1009 #endif
1010 return;
1013 /* Swap the send and the receive. */
1014 memset(&rth, 0, sizeof(struct tcphdr));
1015 rth.dest = th->source;
1016 rth.source = th->dest;
1017 rth.doff = sizeof(struct tcphdr)/4;
1018 rth.rst = 1;
1020 if (th->ack) {
1021 rth.seq = th->ack_seq;
1022 } else {
1023 rth.ack = 1;
1024 rth.ack_seq = th->syn ? htonl(ntohl(th->seq)+1) : th->seq;
1027 memset(&arg, 0, sizeof arg);
1028 arg.iov[0].iov_base = (unsigned char *)&rth;
1029 arg.iov[0].iov_len = sizeof rth;
1030 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1031 skb->nh.iph->saddr, /*XXX*/
1032 sizeof(struct tcphdr),
1033 IPPROTO_TCP,
1034 0);
1035 arg.n_iov = 1;
1036 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1038 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1040 tcp_statistics.TcpOutSegs++;
1041 tcp_statistics.TcpOutRsts++;
1044 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1047 It seems I never wrote anything more stupid.
1048 I hope Gods will forgive me, but I cannot forgive myself 8)
1049 --ANK (981001)
1052 static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb)
1054 struct iphdr *iph = skb->nh.iph;
1055 struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1056 struct sock *sk = NULL;
1057 int i;
1059 SOCKHASH_LOCK_READ();
1060 for (i=0; i<TCP_LHTABLE_SIZE; i++) {
1061 for(sk = tcp_listening_hash[i]; sk; sk = sk->next) {
1062 struct open_request *dummy;
1063 if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, iph,
1064 th, &dummy) &&
1065 (!sk->bound_dev_if ||
1066 sk->bound_dev_if == skb->dev->ifindex))
1067 goto out;
1070 out:
1071 SOCKHASH_UNLOCK_READ();
1072 return sk;
1076 * Check whether a received TCP packet might be for one of our
1077 * connections.
1080 int tcp_chkaddr(struct sk_buff *skb)
1082 struct iphdr *iph = skb->nh.iph;
1083 struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1084 struct sock *sk;
1086 sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr,
1087 th->dest, skb->dev->ifindex);
1089 if (!sk)
1090 return tcp_v4_search_proxy_openreq(skb) != NULL;
1092 if (sk->state == TCP_LISTEN) {
1093 struct open_request *dummy;
1094 if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, skb->nh.iph,
1095 th, &dummy) &&
1096 (!sk->bound_dev_if ||
1097 sk->bound_dev_if == skb->dev->ifindex))
1098 return 1;
1101 /* 0 means accept all LOCAL addresses here, not all the world... */
1103 if (sk->rcv_saddr == 0)
1104 return 0;
1106 return 1;
1108 #endif
1111 * Send a SYN-ACK after having received an ACK.
1112 * This still operates on a open_request only, not on a big
1113 * socket.
1115 static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
1117 struct rtable *rt;
1118 struct ip_options *opt;
1119 struct sk_buff * skb;
1120 int mss;
1122 /* First, grab a route. */
1123 opt = req->af.v4_req.opt;
1124 if(ip_route_output(&rt, ((opt && opt->srr) ?
1125 opt->faddr :
1126 req->af.v4_req.rmt_addr),
1127 req->af.v4_req.loc_addr,
1128 RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
1129 sk->bound_dev_if)) {
1130 ip_statistics.IpOutNoRoutes++;
1131 return;
1133 if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1134 ip_rt_put(rt);
1135 ip_statistics.IpOutNoRoutes++;
1136 return;
1139 mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
1141 skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
1142 if (skb) {
1143 struct tcphdr *th = skb->h.th;
1145 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1146 th->source = req->lcl_port; /* LVE */
1147 #endif
1149 th->check = tcp_v4_check(th, skb->len,
1150 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1151 csum_partial((char *)th, skb->len, skb->csum));
1153 ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1154 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1156 ip_rt_put(rt);
1160 * IPv4 open_request destructor.
1162 static void tcp_v4_or_free(struct open_request *req)
1164 if(!req->sk && req->af.v4_req.opt)
1165 kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
1168 static inline void syn_flood_warning(struct sk_buff *skb)
1170 static unsigned long warntime;
1172 if (jiffies - warntime > HZ*60) {
1173 warntime = jiffies;
1174 printk(KERN_INFO
1175 "possible SYN flooding on port %d. Sending cookies.\n",
1176 ntohs(skb->h.th->dest));
1181 * Save and compile IPv4 options into the open_request if needed.
1183 static inline struct ip_options *
1184 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1186 struct ip_options *opt = &(IPCB(skb)->opt);
1187 struct ip_options *dopt = NULL;
1189 if (opt && opt->optlen) {
1190 int opt_size = optlength(opt);
1191 dopt = kmalloc(opt_size, GFP_ATOMIC);
1192 if (dopt) {
1193 if (ip_options_echo(dopt, skb)) {
1194 kfree_s(dopt, opt_size);
1195 dopt = NULL;
1199 return dopt;
1203 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1204 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1205 * It would be better to replace it with a global counter for all sockets
1206 * but then some measure against one socket starving all other sockets
1207 * would be needed.
1209 int sysctl_max_syn_backlog = 128;
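/* A distilled sketch of the SYN backlog decision made in
 * tcp_v4_conn_request() below (the names here are illustrative, not
 * kernel symbols): once a listener's SYN backlog exceeds the limit, a
 * new SYN is either answered with a syncookie, if enabled, or dropped.
 */
enum demo_syn_verdict { DEMO_SYN_QUEUE, DEMO_SYN_COOKIE, DEMO_SYN_DROP };

static enum demo_syn_verdict demo_syn_decide(int syn_backlog, int max_backlog,
                                             int syncookies_enabled)
{
        if (syn_backlog <= max_backlog)
                return DEMO_SYN_QUEUE;          /* normal open_request path */
        return syncookies_enabled ? DEMO_SYN_COOKIE : DEMO_SYN_DROP;
}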
1211 struct or_calltable or_ipv4 = {
1212 tcp_v4_send_synack,
1213 tcp_v4_or_free,
1214 tcp_v4_send_reset
1217 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1218 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
1220 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
1222 struct tcp_opt tp;
1223 struct open_request *req;
1224 struct tcphdr *th = skb->h.th;
1225 __u32 saddr = skb->nh.iph->saddr;
1226 __u32 daddr = skb->nh.iph->daddr;
1227 #ifdef CONFIG_SYN_COOKIES
1228 int want_cookie = 0;
1229 #else
1230 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1231 #endif
1233 /* If the socket is dead, don't accept the connection. */
1234 if (sk->dead)
1235 goto dead;
1237 /* Never answer SYNs sent to broadcast or multicast */
1238 if (((struct rtable *)skb->dst)->rt_flags &
1239 (RTCF_BROADCAST|RTCF_MULTICAST))
1240 goto drop;
1242 /* XXX: Check against a global syn pool counter. */
1243 if (BACKLOG(sk) > BACKLOGMAX(sk)) {
1244 #ifdef CONFIG_SYN_COOKIES
1245 if (sysctl_tcp_syncookies) {
1246 syn_flood_warning(skb);
1247 want_cookie = 1;
1248 } else
1249 #endif
1250 goto drop;
1251 } else {
1252 if (isn == 0)
1253 isn = tcp_v4_init_sequence(sk, skb);
1254 BACKLOG(sk)++;
1257 req = tcp_openreq_alloc();
1258 if (req == NULL) {
1259 goto dropbacklog;
1262 req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
1264 req->rcv_isn = TCP_SKB_CB(skb)->seq;
1265 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1267 tp.mss_clamp = 65535;
1268 tcp_parse_options(NULL, th, &tp, want_cookie);
1269 if (tp.mss_clamp == 65535)
1270 tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
1272 if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
1273 tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
1274 req->mss = tp.mss_clamp;
1276 if (tp.saw_tstamp)
1277 req->ts_recent = tp.rcv_tsval;
1278 req->tstamp_ok = tp.tstamp_ok;
1279 req->sack_ok = tp.sack_ok;
1280 req->snd_wscale = tp.snd_wscale;
1281 req->wscale_ok = tp.wscale_ok;
1282 req->rmt_port = th->source;
1283 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1284 req->lcl_port = th->dest ; /* LVE */
1285 #endif
1286 req->af.v4_req.loc_addr = daddr;
1287 req->af.v4_req.rmt_addr = saddr;
1289 /* Note that we ignore the isn passed from the TIME_WAIT
1290 * state here. That's the price we pay for cookies.
1292 if (want_cookie)
1293 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1295 req->snt_isn = isn;
1297 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1299 req->class = &or_ipv4;
1300 req->retrans = 0;
1301 req->sk = NULL;
1303 tcp_v4_send_synack(sk, req);
1305 if (want_cookie) {
1306 if (req->af.v4_req.opt)
1307 kfree(req->af.v4_req.opt);
1308 tcp_v4_or_free(req);
1309 tcp_openreq_free(req);
1310 } else {
1311 req->expires = jiffies + TCP_TIMEOUT_INIT;
1312 tcp_inc_slow_timer(TCP_SLT_SYNACK);
1313 tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
1316 return 0;
1318 dead:
1319 SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk);
1320 tcp_statistics.TcpAttemptFails++;
1321 return -ENOTCONN; /* send reset */
1323 dropbacklog:
1324 if (!want_cookie)
1325 BACKLOG(sk)--;
1326 drop:
1327 tcp_statistics.TcpAttemptFails++;
1328 return 0;
1331 /* This is not only more efficient than what we used to do, it eliminates
1332 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
1334 * This function wants to be moved to a common for IPv[46] file. --ANK
1336 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
1338 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
1340 if(newsk != NULL) {
1341 struct tcp_opt *newtp;
1342 #ifdef CONFIG_FILTER
1343 struct sk_filter *filter;
1344 #endif
1346 memcpy(newsk, sk, sizeof(*newsk));
1347 newsk->sklist_next = NULL;
1348 newsk->state = TCP_SYN_RECV;
1350 /* Clone the TCP header template */
1351 newsk->dport = req->rmt_port;
1353 sock_lock_init(newsk);
1355 atomic_set(&newsk->rmem_alloc, 0);
1356 skb_queue_head_init(&newsk->receive_queue);
1357 atomic_set(&newsk->wmem_alloc, 0);
1358 skb_queue_head_init(&newsk->write_queue);
1359 atomic_set(&newsk->omem_alloc, 0);
1361 newsk->done = 0;
1362 newsk->proc = 0;
1363 newsk->pair = NULL;
1364 newsk->backlog.head = newsk->backlog.tail = NULL;
1365 skb_queue_head_init(&newsk->error_queue);
1366 #ifdef CONFIG_FILTER
1367 if ((filter = newsk->filter) != NULL)
1368 sk_filter_charge(newsk, filter);
1369 #endif
1371 /* Now setup tcp_opt */
1372 newtp = &(newsk->tp_pinfo.af_tcp);
1373 newtp->pred_flags = 0;
1374 newtp->rcv_nxt = req->rcv_isn + 1;
1375 newtp->snd_nxt = req->snt_isn + 1;
1376 newtp->snd_una = req->snt_isn + 1;
1377 newtp->srtt = 0;
1378 newtp->ato = 0;
1379 newtp->snd_wl1 = req->rcv_isn;
1380 newtp->snd_wl2 = req->snt_isn;
1382 /* RFC1323: The window in SYN & SYN/ACK segments
1383 * is never scaled.
1385 newtp->snd_wnd = ntohs(skb->h.th->window);
1387 newtp->max_window = newtp->snd_wnd;
1388 newtp->pending = 0;
1389 newtp->retransmits = 0;
1390 newtp->last_ack_sent = req->rcv_isn + 1;
1391 newtp->backoff = 0;
1392 newtp->mdev = TCP_TIMEOUT_INIT;
1394 /* So many TCP implementations out there (incorrectly) count the
1395 * initial SYN frame in their delayed-ACK and congestion control
1396 * algorithms that we must have the following bandaid to talk
1397 * efficiently to them. -DaveM
1399 newtp->snd_cwnd = 2;
1401 newtp->rto = TCP_TIMEOUT_INIT;
1402 newtp->packets_out = 0;
1403 newtp->fackets_out = 0;
1404 newtp->retrans_out = 0;
1405 newtp->high_seq = 0;
1406 newtp->snd_ssthresh = 0x7fffffff;
1407 newtp->snd_cwnd_cnt = 0;
1408 newtp->dup_acks = 0;
1409 newtp->delayed_acks = 0;
1410 init_timer(&newtp->retransmit_timer);
1411 newtp->retransmit_timer.function = &tcp_retransmit_timer;
1412 newtp->retransmit_timer.data = (unsigned long) newsk;
1413 init_timer(&newtp->delack_timer);
1414 newtp->delack_timer.function = &tcp_delack_timer;
1415 newtp->delack_timer.data = (unsigned long) newsk;
1416 skb_queue_head_init(&newtp->out_of_order_queue);
1417 newtp->send_head = newtp->retrans_head = NULL;
1418 newtp->rcv_wup = req->rcv_isn + 1;
1419 newtp->write_seq = req->snt_isn + 1;
1420 newtp->copied_seq = req->rcv_isn + 1;
1422 newtp->saw_tstamp = 0;
1423 newtp->mss_clamp = req->mss;
1425 init_timer(&newtp->probe_timer);
1426 newtp->probe_timer.function = &tcp_probe_timer;
1427 newtp->probe_timer.data = (unsigned long) newsk;
1428 newtp->probes_out = 0;
1429 newtp->syn_seq = req->rcv_isn;
1430 newtp->fin_seq = req->rcv_isn;
1431 newtp->urg_data = 0;
1432 tcp_synq_init(newtp);
1433 newtp->syn_backlog = 0;
1434 if (skb->len >= 536)
1435 newtp->last_seg_size = skb->len;
1437 /* Back to base struct sock members. */
1438 newsk->err = 0;
1439 newsk->ack_backlog = 0;
1440 newsk->max_ack_backlog = SOMAXCONN;
1441 newsk->priority = 0;
1443 /* IP layer stuff */
1444 newsk->timeout = 0;
1445 init_timer(&newsk->timer);
1446 newsk->timer.function = &net_timer;
1447 newsk->timer.data = (unsigned long) newsk;
1448 newsk->socket = NULL;
1450 newtp->tstamp_ok = req->tstamp_ok;
1451 if((newtp->sack_ok = req->sack_ok) != 0)
1452 newtp->num_sacks = 0;
1453 newtp->window_clamp = req->window_clamp;
1454 newtp->rcv_wnd = req->rcv_wnd;
1455 newtp->wscale_ok = req->wscale_ok;
1456 if (newtp->wscale_ok) {
1457 newtp->snd_wscale = req->snd_wscale;
1458 newtp->rcv_wscale = req->rcv_wscale;
1459 } else {
1460 newtp->snd_wscale = newtp->rcv_wscale = 0;
1461 newtp->window_clamp = min(newtp->window_clamp,65535);
1463 if (newtp->tstamp_ok) {
1464 newtp->ts_recent = req->ts_recent;
1465 newtp->ts_recent_stamp = tcp_time_stamp;
1466 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
1467 } else {
1468 newtp->tcp_header_len = sizeof(struct tcphdr);
1471 return newsk;
1475 * The three way handshake has completed - we got a valid synack -
1476 * now create the new socket.
1478 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1479 struct open_request *req,
1480 struct dst_entry *dst)
1482 struct ip_options *opt = req->af.v4_req.opt;
1483 struct tcp_opt *newtp;
1484 struct sock *newsk;
1486 if (sk->ack_backlog > sk->max_ack_backlog)
1487 goto exit; /* head drop */
1488 if (dst == NULL) {
1489 struct rtable *rt;
1491 if (ip_route_output(&rt,
1492 opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
1493 req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0))
1494 return NULL;
1495 dst = &rt->u.dst;
1497 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1498 /* The new socket created for transparent proxy may fall
1499 * into a non-existent bind bucket because sk->num != newsk->num.
1500 * Ensure existence of the bucket now. Placing the check
1501 * later would require destroying the just-created newsk on failure.
1502 * 1998/04/22 Andrey V. Savochkin <saw@msu.ru>
1504 if (tcp_bucket_check(ntohs(skb->h.th->dest)))
1505 goto exit;
1506 #endif
1508 newsk = tcp_create_openreq_child(sk, req, skb);
1509 if (!newsk)
1510 goto exit;
1512 sk->tp_pinfo.af_tcp.syn_backlog--;
1513 sk->ack_backlog++;
1515 newsk->dst_cache = dst;
1517 newtp = &(newsk->tp_pinfo.af_tcp);
1518 newsk->daddr = req->af.v4_req.rmt_addr;
1519 newsk->saddr = req->af.v4_req.loc_addr;
1520 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1521 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1522 newsk->num = ntohs(skb->h.th->dest);
1523 newsk->sport = req->lcl_port;
1524 #endif
1525 newsk->opt = req->af.v4_req.opt;
1526 newtp->ext_header_len = 0;
1527 if (newsk->opt)
1528 newtp->ext_header_len = newsk->opt->optlen;
1530 tcp_sync_mss(newsk, dst->pmtu);
1531 newtp->rcv_mss = newtp->mss_clamp;
1533 /* It would be better to use newtp->mss_clamp here */
1534 if (newsk->rcvbuf < (3 * newtp->pmtu_cookie))
1535 newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max);
1536 if (newsk->sndbuf < (3 * newtp->pmtu_cookie))
1537 newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max);
1539 tcp_v4_hash(newsk);
1540 add_to_prot_sklist(newsk);
1541 sk->data_ready(sk, 0); /* Deliver SIGIO */
1543 return newsk;
1545 exit:
1546 dst_release(dst);
1547 return NULL;
1550 static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
1552 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1553 struct open_request *req, *prev;
1555 req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev);
1556 if (!req)
1557 return;
1558 /* Sequence number check required by RFC793 */
1559 if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
1560 after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
1561 return;
1562 tcp_synq_unlink(tp, req, prev);
1563 (req->sk ? sk->ack_backlog : tp->syn_backlog)--;
1564 req->class->destructor(req);
1565 tcp_openreq_free(req);
1567 net_statistics.EmbryonicRsts++;
1570 /* Check for embryonic sockets (open_requests). We check packets with
1571 * only the SYN bit set against the open_request queue too: this
1572 * increases connection latency a bit, but is required to detect
1573 * retransmitted SYNs.
1575 static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1577 struct tcphdr *th = skb->h.th;
1578 u32 flg = ((u32 *)th)[3];
1580 /* Check for RST */
1581 if (flg & __constant_htonl(0x00040000)) {
1582 tcp_v4_rst_req(sk, skb);
1583 return NULL;
1586 /* Check for SYN|ACK */
1587 if (flg & __constant_htonl(0x00120000)) {
1588 struct open_request *req, *dummy;
1589 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1591 /* Find possible connection requests. */
1592 req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy);
1593 if (req) {
1594 sk = tcp_check_req(sk, skb, req);
1596 #ifdef CONFIG_SYN_COOKIES
1597 else {
1598 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1600 #endif
1602 return sk;
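/* The magic constants above test bits of the fourth 32-bit word of the
 * TCP header (data offset, flags, window) without touching individual
 * bitfields. A small self-contained check (an illustration) of where a
 * flags mask lands in that word:
 */
#include <assert.h>
#include <arpa/inet.h>          /* htonl() */

static unsigned int demo_tcp_flag_word(unsigned char flags)
{
        /* In big-endian view the flags byte occupies bits 16-23 of the
         * fourth word, so a flags mask F becomes htonl(F << 16) when
         * compared against the raw in-memory word.
         */
        return htonl((unsigned int)flags << 16);
}

int main(void)
{
        assert(demo_tcp_flag_word(0x04) == htonl(0x00040000));          /* RST     */
        assert(demo_tcp_flag_word(0x02 | 0x10) == htonl(0x00120000));   /* SYN|ACK */
        return 0;
}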
1605 /* The socket must have its spinlock held when we get
1606 * here.
1608 * We have a potential double-lock case here, so even when
1609 * doing backlog processing we use the BH locking scheme.
1610 * This is because we cannot sleep with the original spinlock
1611 * held.
1613 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1615 int need_unlock = 0;
1616 #ifdef CONFIG_FILTER
1617 struct sk_filter *filter = sk->filter;
1618 if (filter && sk_filter(skb, filter))
1619 goto discard;
1620 #endif /* CONFIG_FILTER */
1623 * This doesn't check if the socket has enough room for the packet.
1624 * Either process the packet _without_ queueing it and then free it,
1625 * or do the check later.
1627 skb_set_owner_r(skb, sk);
1629 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1630 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1631 goto reset;
1632 return 0;
1635 if (sk->state == TCP_LISTEN) {
1636 struct sock *nsk;
1638 nsk = tcp_v4_hnd_req(sk, skb);
1639 if (!nsk)
1640 goto discard;
1643 * Queue it on the new socket if the new socket is active,
1644 * otherwise we just shortcircuit this and continue with
1645 * the new socket..
1647 if (nsk != sk) {
1648 bh_lock_sock(nsk);
1649 if (nsk->lock.users != 0) {
1650 skb_orphan(skb);
1651 sk_add_backlog(nsk, skb);
1652 bh_unlock_sock(nsk);
1653 return 0;
1655 need_unlock = 1;
1656 sk = nsk;
1660 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1661 goto reset;
1662 goto out_maybe_unlock;
1664 reset:
1665 tcp_v4_send_reset(skb);
1666 discard:
1667 kfree_skb(skb);
1668 /* Be careful here. If this function gets more complicated and
1669 * gcc suffers from register pressure on the x86, sk (in %ebx)
1670 * might be destroyed here. This current version compiles correctly,
1671 * but you have been warned.
1673 out_maybe_unlock:
1674 if(need_unlock)
1675 bh_unlock_sock(sk);
1676 return 0;
1680 * From tcp_input.c
1683 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1685 struct tcphdr *th;
1686 struct sock *sk;
1687 int ret;
1689 if (skb->pkt_type!=PACKET_HOST)
1690 goto discard_it;
1692 th = skb->h.th;
1694 /* Pull up the IP header. */
1695 __skb_pull(skb, skb->h.raw - skb->data);
1697 /* Count it even if it's bad */
1698 tcp_statistics.TcpInSegs++;
1700 if (len < sizeof(struct tcphdr))
1701 goto bad_packet;
1703 /* Try to use the device checksum if provided. */
1704 switch (skb->ip_summed) {
1705 case CHECKSUM_NONE:
1706 skb->csum = csum_partial((char *)th, len, 0);
1707 case CHECKSUM_HW:
1708 if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
1709 NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
1710 "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
1711 "len=%d/%d/%d\n",
1712 NIPQUAD(skb->nh.iph->saddr),
1713 ntohs(th->source),
1714 NIPQUAD(skb->nh.iph->daddr),
1715 ntohs(th->dest),
1716 len, skb->len,
1717 ntohs(skb->nh.iph->tot_len)));
1718 bad_packet:
1719 tcp_statistics.TcpInErrs++;
1720 goto discard_it;
1722 default:
1723 /* CHECKSUM_UNNECESSARY */
1726 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1727 if (IPCB(skb)->redirport)
1728 sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source,
1729 skb->nh.iph->daddr, skb->dev,
1730 IPCB(skb)->redirport, skb->dev->ifindex);
1731 else {
1732 #endif
1733 SOCKHASH_LOCK_READ_BH();
1734 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1735 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1736 SOCKHASH_UNLOCK_READ_BH();
1737 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1738 if (!sk)
1739 sk = tcp_v4_search_proxy_openreq(skb);
1741 #endif
1742 if (!sk)
1743 goto no_tcp_socket;
1744 if(!ipsec_sk_policy(sk,skb))
1745 goto discard_it;
1747 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1748 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1749 len - th->doff*4);
1750 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1752 skb->used = 0;
1754 if (sk->state == TCP_TIME_WAIT)
1755 goto do_time_wait;
1757 bh_lock_sock(sk);
1758 ret = 0;
1759 if (!sk->lock.users)
1760 ret = tcp_v4_do_rcv(sk, skb);
1761 else
1762 sk_add_backlog(sk, skb);
1763 bh_unlock_sock(sk);
1765 return ret;
1767 no_tcp_socket:
1768 tcp_v4_send_reset(skb);
1770 discard_it:
1771 /* Discard frame. */
1772 kfree_skb(skb);
1773 return 0;
1775 do_time_wait:
1776 if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1777 skb, th, skb->len))
1778 goto no_tcp_socket;
1779 goto discard_it;
1782 int tcp_v4_rebuild_header(struct sock *sk)
1784 struct rtable *rt = (struct rtable *)sk->dst_cache;
1785 __u32 new_saddr;
1786 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
1788 if(rt == NULL)
1789 return 0;
1791 /* Force route checking if want_rewrite.
1792 * The idea is good, the implementation is disgusting.
1793 * Well, if I did a bind on this socket, you cannot randomly overwrite
1794 * its source address. --ANK
1796 if (want_rewrite) {
1797 int tmp;
1798 struct rtable *new_rt;
1799 __u32 old_saddr = rt->rt_src;
1801 /* Query new route using another rt buffer */
1802 tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
1803 RT_TOS(sk->ip_tos)|sk->localroute,
1804 sk->bound_dev_if);
1806 /* Only useful if different source addrs */
1807 if (tmp == 0) {
1809 * Only useful if different source addrs
1811 if (new_rt->rt_src != old_saddr ) {
1812 dst_release(sk->dst_cache);
1813 sk->dst_cache = &new_rt->u.dst;
1814 rt = new_rt;
1815 goto do_rewrite;
1817 dst_release(&new_rt->u.dst);
1820 if (rt->u.dst.obsolete) {
1821 int err;
1822 err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
1823 if (err) {
1824 sk->err_soft=-err;
1825 sk->error_report(sk);
1826 return -1;
1828 dst_release(xchg(&sk->dst_cache, &rt->u.dst));
1831 return 0;
1833 do_rewrite:
1834 new_saddr = rt->rt_src;
1836 /* Ouch!, this should not happen. */
1837 if (!sk->saddr || !sk->rcv_saddr) {
1838 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
1839 "saddr=%08lX rcv_saddr=%08lX\n",
1840 ntohl(sk->saddr),
1841 ntohl(sk->rcv_saddr));
1842 return 0;
1845 if (new_saddr != sk->saddr) {
1846 if (sysctl_ip_dynaddr > 1) {
1847 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1848 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1849 NIPQUAD(sk->saddr),
1850 NIPQUAD(new_saddr));
1853 sk->saddr = new_saddr;
1854 sk->rcv_saddr = new_saddr;
1855 tcp_v4_rehash(sk);
1858 return 0;
1861 static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
1863 return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1864 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1867 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1869 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1871 sin->sin_family = AF_INET;
1872 sin->sin_addr.s_addr = sk->daddr;
1873 sin->sin_port = sk->dport;
1876 struct tcp_func ipv4_specific = {
1877 ip_queue_xmit,
1878 tcp_v4_send_check,
1879 tcp_v4_rebuild_header,
1880 tcp_v4_conn_request,
1881 tcp_v4_syn_recv_sock,
1882 tcp_v4_get_sock,
1883 sizeof(struct iphdr),
1885 ip_setsockopt,
1886 ip_getsockopt,
1887 v4_addr2sockaddr,
1888 sizeof(struct sockaddr_in)
1891 /* NOTE: A lot of things set to zero explicitly by call to
1892 * sk_alloc() so need not be done here.
1894 static int tcp_v4_init_sock(struct sock *sk)
1896 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1898 skb_queue_head_init(&tp->out_of_order_queue);
1899 tcp_init_xmit_timers(sk);
1901 tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
1902 tp->mdev = TCP_TIMEOUT_INIT;
1903 tp->mss_clamp = ~0;
1905 /* So many TCP implementations out there (incorrectly) count the
1906 * initial SYN frame in their delayed-ACK and congestion control
1907 * algorithms that we must have the following bandaid to talk
1908 * efficiently to them. -DaveM
1910 tp->snd_cwnd = 2;
1912 /* See draft-stevens-tcpca-spec-01 for discussion of the
1913 * initialization of these values.
1915 tp->snd_cwnd_cnt = 0;
1916 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1918 sk->state = TCP_CLOSE;
1919 sk->max_ack_backlog = SOMAXCONN;
1920 tp->rcv_mss = 536;
1922 sk->write_space = tcp_write_space;
1924 /* Init SYN queue. */
1925 tcp_synq_init(tp);
1927 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1929 return 0;
1932 static int tcp_v4_destroy_sock(struct sock *sk)
1934 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1935 struct sk_buff *skb;
1937 tcp_clear_xmit_timers(sk);
1939 if (sk->keepopen)
1940 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1942 /* Clean up the write buffer. */
1943 while((skb = __skb_dequeue(&sk->write_queue)) != NULL)
1944 kfree_skb(skb);
1946 /* Clean up our, hopefully empty, out_of_order_queue. */
1947 while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL)
1948 kfree_skb(skb);
1950 /* Clean up a locked TCP bind bucket; this only happens if a
1951 * port is allocated for a socket but it never fully connects,
1952 * in which case we will find num to be non-zero and daddr to
1953 * be zero.
1955 if(sk->daddr == 0 && sk->num != 0)
1956 tcp_bucket_unlock(sk);
1958 return 0;
1961 struct proto tcp_prot = {
1962 (struct sock *)&tcp_prot, /* sklist_next */
1963 (struct sock *)&tcp_prot, /* sklist_prev */
1964 tcp_close, /* close */
1965 tcp_v4_connect, /* connect */
1966 tcp_accept, /* accept */
1967 NULL, /* retransmit */
1968 tcp_write_wakeup, /* write_wakeup */
1969 tcp_read_wakeup, /* read_wakeup */
1970 tcp_poll, /* poll */
1971 tcp_ioctl, /* ioctl */
1972 tcp_v4_init_sock, /* init */
1973 tcp_v4_destroy_sock, /* destroy */
1974 tcp_shutdown, /* shutdown */
1975 tcp_setsockopt, /* setsockopt */
1976 tcp_getsockopt, /* getsockopt */
1977 tcp_v4_sendmsg, /* sendmsg */
1978 tcp_recvmsg, /* recvmsg */
1979 NULL, /* bind */
1980 tcp_v4_do_rcv, /* backlog_rcv */
1981 tcp_v4_hash, /* hash */
1982 tcp_v4_unhash, /* unhash */
1983 tcp_v4_rehash, /* rehash */
1984 tcp_good_socknum, /* good_socknum */
1985 tcp_v4_verify_bind, /* verify_bind */
1986 128, /* max_header */
1987 0, /* retransmits */
1988 "TCP", /* name */
1989 0, /* inuse */
1990 0 /* highestinuse */
1995 __initfunc(void tcp_v4_init(struct net_proto_family *ops))
1997 int err;
1999 tcp_inode.i_mode = S_IFSOCK;
2000 tcp_inode.i_sock = 1;
2001 tcp_inode.i_uid = 0;
2002 tcp_inode.i_gid = 0;
2003 init_waitqueue_head(&tcp_inode.i_wait);
2004 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2006 tcp_socket->inode = &tcp_inode;
2007 tcp_socket->state = SS_UNCONNECTED;
2008 tcp_socket->type=SOCK_RAW;
2010 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2011 panic("Failed to create the TCP control socket.\n");
2012 tcp_socket->sk->allocation=GFP_ATOMIC;
2013 tcp_socket->sk->num = 256; /* Don't receive any data */
2014 tcp_socket->sk->ip_ttl = MAXTTL;