net/ipv4/tcp_ipv4.c (Linux 2.1.77)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.76 1997/12/07 04:44:19 freitag Exp $
10 * IPv4 specific functions
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics (ifdefed by
42 * NEW_LISTEN for now)
43 * Juan Jose Ciarlante: ip_dynaddr bits
46 #include <linux/config.h>
47 #include <linux/types.h>
48 #include <linux/fcntl.h>
49 #include <linux/random.h>
50 #include <linux/ipsec.h>
51 #include <linux/inet.h>
53 #include <net/icmp.h>
54 #include <net/tcp.h>
55 #include <net/ipv6.h>
57 #include <asm/segment.h>
59 extern int sysctl_tcp_sack;
60 extern int sysctl_tcp_tsack;
61 extern int sysctl_tcp_timestamps;
62 extern int sysctl_tcp_window_scaling;
63 extern int sysctl_tcp_syncookies;
64 extern int sysctl_ip_dynaddr;
66 /* Check TCP sequence numbers in ICMP packets. */
67 #define ICMP_PARANOIA 1
68 #ifndef ICMP_PARANOIA
69 #define ICMP_MIN_LENGTH 4
70 #else
71 #define ICMP_MIN_LENGTH 8
72 #endif
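/* With ICMP_PARANOIA the ICMP error must quote at least 8 bytes of the
 * offending TCP header: the two port fields plus the 32-bit sequence
 * number, which tcp_v4_err() below checks against the socket's send
 * window. Without it, 4 bytes (just the ports) are enough to find the
 * socket.
 */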
74 static void tcp_v4_send_reset(struct sk_buff *skb);
76 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
77 struct sk_buff *skb);
79 /* This is for sockets with full identity only. Sockets here will always
80 * be without wildcards and will have the following invariant:
81 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
83 * First half of the table is for sockets not in TIME_WAIT, second half
84 * is for TIME_WAIT sockets only.
86 struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
88 /* All sockets in TCP_LISTEN state will be in here. This is the only table
89 * where wildcard'd TCP sockets can exist. Hash function here is just local
90 * port number.
92 struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
94 /* Ok, let's try this; I give up, we do need a local binding
95 * TCP hash as well as the others for fast bind/connect.
97 struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
104 int sysctl_local_port_range[2] = { 1024, 4999 };
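/* Illustrative only: on kernels where this range is exported through
 * sysctl, the change suggested above would look roughly like
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * (the exact proc path is an assumption, not taken from this file).
 */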
106 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
107 __u32 faddr, __u16 fport)
109 return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
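/* The 4-tuple is folded with XOR and masked to the first half of the
 * table; tcp_v4_rehash() below adds TCP_HTABLE_SIZE/2 for TIME_WAIT
 * sockets, so the second half holds only those.
 */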
112 static __inline__ int tcp_sk_hashfn(struct sock *sk)
114 __u32 laddr = sk->rcv_saddr;
115 __u16 lport = sk->num;
116 __u32 faddr = sk->daddr;
117 __u16 fport = sk->dummy_th.dest;
119 return tcp_hashfn(laddr, lport, faddr, fport);
122 static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
124 struct sock *sk2;
125 int retval = 0, sk_reuse = sk->reuse;
127 SOCKHASH_LOCK();
128 sk2 = tcp_bound_hash[tcp_bhashfn(snum)];
129 for(; sk2 != NULL; sk2 = sk2->bind_next) {
130 if((sk2->num == snum) && (sk2 != sk)) {
131 unsigned char state = sk2->state;
132 int sk2_reuse = sk2->reuse;
134 /* Two sockets can be bound to the same port if they're
135 * bound to different interfaces.
138 if(sk->bound_dev_if != sk2->bound_dev_if)
139 continue;
141 if(!sk2->rcv_saddr || !sk->rcv_saddr) {
142 if((!sk2_reuse) ||
143 (!sk_reuse) ||
144 (state == TCP_LISTEN)) {
145 retval = 1;
146 break;
148 } else if(sk2->rcv_saddr == sk->rcv_saddr) {
149 if((!sk_reuse) ||
150 (!sk2_reuse) ||
151 (state == TCP_LISTEN)) {
152 retval = 1;
153 break;
158 SOCKHASH_UNLOCK();
160 return retval;
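/* In short: two sockets may share a port only if they are bound to
 * different devices or to different specific local addresses, or if
 * both set SO_REUSEADDR and the existing socket is not listening.
 * A minimal userspace sketch of the reuse case (purely illustrative,
 * not part of this file):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *
 * Without SO_REUSEADDR on both sockets the second bind() fails with
 * EADDRINUSE.
 */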
163 static __inline__ int tcp_lport_inuse(int num)
165 struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)];
167 for(; sk != NULL; sk = sk->bind_next) {
168 if(sk->num == num)
169 return 1;
171 return 0;
174 /* Find a "good" local port; this is family independent.
175 * There are several strategies working in unison here to
176 * get the best possible performance. The current socket
177 * load is kept track of; if it is zero there is a strong
178 * likelihood that there is a zero-length chain we will
179 * find with a small amount of searching, else the load is
180 * what we shoot for when the chains all have at least
181 * one entry. The base helps us walk the chains in an
182 * order such that a good chain is found as quickly as possible. -DaveM
184 unsigned short tcp_good_socknum(void)
186 static int start = 0;
187 static int binding_contour = 0;
188 int best = 0;
189 int size = 32767; /* a big num. */
190 int retval = 0, i, end, bc;
192 SOCKHASH_LOCK();
193 if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0])
194 start = sysctl_local_port_range[0];
195 i = tcp_bhashfn(start);
196 end = i + TCP_BHTABLE_SIZE;
197 bc = binding_contour;
198 do {
199 struct sock *sk = tcp_bound_hash[i&(TCP_BHTABLE_SIZE-1)];
200 if(!sk) {
201 /* find the smallest value no smaller than start
202 * that has this hash value.
204 retval = tcp_bhashnext(start-1,i&(TCP_BHTABLE_SIZE-1));
206 /* Check for decreasing load. */
207 if (bc != 0)
208 binding_contour = 0;
209 goto done;
210 } else {
211 int j = 0;
212 do { sk = sk->bind_next; } while (++j < size && sk);
213 if (j < size) {
214 best = i&(TCP_BHTABLE_SIZE-1);
215 size = j;
216 if (bc && size <= bc)
217 goto verify;
220 } while(++i != end);
221 i = best;
223 /* Socket load is increasing, adjust our load average. */
224 binding_contour = size;
225 verify:
226 if (size < binding_contour)
227 binding_contour = size;
229 retval = tcp_bhashnext(start-1,i);
231 best = retval; /* mark the starting point to avoid infinite loops */
232 while(tcp_lport_inuse(retval)) {
233 retval = tcp_bhashnext(retval,i);
234 if (retval > sysctl_local_port_range[1]) /* Upper bound */
235 retval = tcp_bhashnext(sysctl_local_port_range[0],i);
236 if (retval == best) {
237 /* This hash chain is full. No answer. */
238 retval = 0;
239 break;
243 done:
244 start = (retval + 1);
245 SOCKHASH_UNLOCK();
247 return retval;
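/* Summary of the search above: scan every bind-hash chain once starting
 * at the chain for 'start'; an empty chain wins immediately, otherwise
 * the shortest chain found (tracked in binding_contour as the current
 * "load") is walked with tcp_bhashnext() until a port inside
 * sysctl_local_port_range that tcp_lport_inuse() does not find is hit,
 * wrapping around at most once.
 */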
250 static void tcp_v4_hash(struct sock *sk)
252 unsigned char state;
254 SOCKHASH_LOCK();
255 state = sk->state;
256 if(state != TCP_CLOSE || !sk->dead) {
257 struct sock **skp;
259 if(state == TCP_LISTEN)
260 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
261 else
262 skp = &tcp_established_hash[tcp_sk_hashfn(sk)];
264 if((sk->next = *skp) != NULL)
265 (*skp)->pprev = &sk->next;
266 *skp = sk;
267 sk->pprev = skp;
268 tcp_sk_bindify(sk);
270 SOCKHASH_UNLOCK();
273 static void tcp_v4_unhash(struct sock *sk)
275 SOCKHASH_LOCK();
276 if(sk->pprev) {
277 if(sk->next)
278 sk->next->pprev = sk->pprev;
279 *sk->pprev = sk->next;
280 sk->pprev = NULL;
281 tcp_sk_unbindify(sk);
283 SOCKHASH_UNLOCK();
286 static void tcp_v4_rehash(struct sock *sk)
288 unsigned char state;
290 SOCKHASH_LOCK();
291 state = sk->state;
292 if(sk->pprev) {
293 if(sk->next)
294 sk->next->pprev = sk->pprev;
295 *sk->pprev = sk->next;
296 sk->pprev = NULL;
297 tcp_sk_unbindify(sk);
299 if(state != TCP_CLOSE || !sk->dead) {
300 struct sock **skp;
302 if(state == TCP_LISTEN) {
303 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
304 } else {
305 int hash= tcp_sk_hashfn(sk);
306 if(state == TCP_TIME_WAIT)
307 hash += (TCP_HTABLE_SIZE/2);
308 skp = &tcp_established_hash[hash];
311 if((sk->next = *skp) != NULL)
312 (*skp)->pprev = &sk->next;
313 *skp = sk;
314 sk->pprev = skp;
315 tcp_sk_bindify(sk);
317 SOCKHASH_UNLOCK();
320 /* Don't inline this cruft. There are some nice properties to
321 * exploit here. The BSD API does not allow a listening TCP
322 * to specify the remote port nor the remote address for the
323 * connection. So always assume those are both wildcarded
324 * during the search since they can never be otherwise.
326 static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
328 struct sock *sk;
329 struct sock *result = NULL;
330 int score, hiscore;
332 hiscore=0;
333 for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
334 if(sk->num == hnum) {
335 __u32 rcv_saddr = sk->rcv_saddr;
337 score = 1;
338 if(rcv_saddr) {
339 if (rcv_saddr != daddr)
340 continue;
341 score++;
343 if (sk->bound_dev_if) {
344 if (sk->bound_dev_if != dif)
345 continue;
346 score++;
348 if (score == 3)
349 return sk;
350 if (score > hiscore) {
351 hiscore = score;
352 result = sk;
356 return result;
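/* Scoring above: 1 for a matching local port, +1 for a matching bound
 * address, +1 for a matching bound device. A score of 3 is an exact
 * match and returns at once; otherwise the highest-scoring (most
 * specific) wildcard listener is returned.
 */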
359 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
360 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
362 static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
363 u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
365 unsigned short hnum = ntohs(dport);
366 struct sock *sk;
367 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
369 /* Optimize here for direct hit, only listening connections can
370 * have wildcards anyway. It is assumed that this code only
371 * gets called from within NET_BH.
373 for(sk = tcp_established_hash[hash]; sk; sk = sk->next)
374 if(sk->daddr == saddr && /* remote address */
375 sk->dummy_th.dest == sport && /* remote port */
376 sk->num == hnum && /* local port */
377 sk->rcv_saddr == daddr && /* local address */
378 (!sk->bound_dev_if || sk->bound_dev_if == dif))
379 goto hit; /* You sunk my battleship! */
381 /* Must check for a TIME_WAIT'er before going to listener hash. */
382 for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
383 if(sk->daddr == saddr && /* remote address */
384 sk->dummy_th.dest == sport && /* remote port */
385 sk->num == hnum && /* local port */
386 sk->rcv_saddr == daddr && /* local address */
387 (!sk->bound_dev_if || sk->bound_dev_if == dif))
388 goto hit;
390 sk = tcp_v4_lookup_listener(daddr, hnum, dif);
391 hit:
392 return sk;
395 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
397 return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif);
400 #ifdef CONFIG_IP_TRANSPARENT_PROXY
401 #define secondlist(hpnum, sk, fpass) \
402 ({ struct sock *s1; if(!(sk) && (fpass)--) \
403 s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \
404 else \
405 s1 = (sk); \
406 s1; \
409 #define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
410 secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass))
412 #define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
413 secondlist((hpnum),(sk)->bind_next,(fpass))
415 static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
416 unsigned short rnum, unsigned long laddr,
417 struct device *dev, unsigned short pnum,
418 int dif)
420 struct sock *s, *result = NULL;
421 int badness = -1;
422 u32 paddr = 0;
423 unsigned short hnum = ntohs(num);
424 unsigned short hpnum = ntohs(pnum);
425 int firstpass = 1;
427 if(dev && dev->ip_ptr) {
428 struct in_device *idev = dev->ip_ptr;
430 if(idev->ifa_list)
431 paddr = idev->ifa_list->ifa_local;
434 /* This code must run only from NET_BH. */
435 for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
436 s != NULL;
437 s = tcp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
438 if(s->num == hnum || s->num == hpnum) {
439 int score = 0;
440 if(s->dead && (s->state == TCP_CLOSE))
441 continue;
442 if(s->rcv_saddr) {
443 if((s->num != hpnum || s->rcv_saddr != paddr) &&
444 (s->num != hnum || s->rcv_saddr != laddr))
445 continue;
446 score++;
448 if(s->daddr) {
449 if(s->daddr != raddr)
450 continue;
451 score++;
453 if(s->dummy_th.dest) {
454 if(s->dummy_th.dest != rnum)
455 continue;
456 score++;
458 if(s->bound_dev_if) {
459 if(s->bound_dev_if != dif)
460 continue;
461 score++;
463 if(score == 4 && s->num == hnum) {
464 result = s;
465 break;
466 } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
467 result = s;
468 badness = score;
472 return result;
475 #undef secondlist
476 #undef tcp_v4_proxy_loop_init
477 #undef tcp_v4_proxy_loop_next
479 #endif
481 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
483 return secure_tcp_sequence_number(sk->saddr, sk->daddr,
484 skb->h.th->dest,
485 skb->h.th->source);
489 * From tcp.c
493 * Check that a TCP address is unique, don't allow multiple
494 * connects to/from the same address
497 static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum)
499 int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum);
500 struct sock * sk;
502 /* Make sure we are allowed to connect here.
503 * But freeze the hash while we snoop around.
505 SOCKHASH_LOCK();
506 sk = tcp_established_hash[hashent];
507 for (; sk != NULL; sk = sk->next) {
508 if(sk->daddr == daddr && /* remote address */
509 sk->dummy_th.dest == dnum && /* remote port */
510 sk->num == snum && /* local port */
511 sk->saddr == saddr) { /* local address */
512 retval = 0;
513 goto out;
517 /* Must check TIME_WAIT'ers too. */
518 sk = tcp_established_hash[hashent + (TCP_HTABLE_SIZE/2)];
519 for (; sk != NULL; sk = sk->next) {
520 if(sk->daddr == daddr && /* remote address */
521 sk->dummy_th.dest == dnum && /* remote port */
522 sk->num == snum && /* local port */
523 sk->saddr == saddr) { /* local address */
524 retval = 0;
525 goto out;
528 out:
529 SOCKHASH_UNLOCK();
530 return retval;
535 * This will initiate an outgoing connection.
538 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
540 struct sk_buff *buff;
541 int tmp;
542 struct tcphdr *th;
543 struct rtable *rt;
544 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
545 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
547 if (sk->state != TCP_CLOSE)
548 return(-EISCONN);
550 /* Don't allow a double connect. */
551 if (sk->daddr)
552 return -EINVAL;
554 if (addr_len < sizeof(struct sockaddr_in))
555 return(-EINVAL);
557 if (usin->sin_family != AF_INET) {
558 static int complained;
559 if (usin->sin_family)
560 return(-EAFNOSUPPORT);
561 if (!complained++)
562 printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
565 if (sk->dst_cache) {
566 dst_release(sk->dst_cache);
567 sk->dst_cache = NULL;
570 tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr,
571 RT_TOS(sk->ip_tos)|(sk->localroute || 0), sk->bound_dev_if);
572 if (tmp < 0)
573 return tmp;
575 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
576 ip_rt_put(rt);
577 return -ENETUNREACH;
580 if (!tcp_unique_address(rt->rt_src, sk->num, rt->rt_dst,
581 usin->sin_port)) {
582 ip_rt_put(rt);
583 return -EADDRNOTAVAIL;
586 lock_sock(sk);
588 /* Do this early, so there is less state to unwind on failure. */
589 buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL);
590 if (buff == NULL) {
591 release_sock(sk);
592 ip_rt_put(rt);
593 return(-ENOBUFS);
596 sk->dst_cache = &rt->u.dst;
597 sk->daddr = rt->rt_dst;
598 if (!sk->saddr)
599 sk->saddr = rt->rt_src;
600 sk->rcv_saddr = sk->saddr;
602 if (sk->priority == 0)
603 sk->priority = rt->u.dst.priority;
605 sk->dummy_th.dest = usin->sin_port;
607 sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
608 sk->dummy_th.source,
609 usin->sin_port);
611 tp->snd_wnd = 0;
612 tp->snd_wl1 = 0;
613 tp->snd_wl2 = sk->write_seq;
614 tp->snd_una = sk->write_seq;
616 tp->rcv_nxt = 0;
618 sk->err = 0;
620 /* Put in the IP header and routing stuff. */
621 tmp = ip_build_header(buff, sk);
622 if (tmp < 0) {
623 /* Caller has done ip_rt_put(rt) and set sk->dst_cache
624 * to NULL. We must unwind the half built TCP socket
625 * state so that this failure does not create a "stillborn"
626 * sock (ie. future re-tries of connect() would fail).
628 sk->daddr = 0;
629 sk->saddr = sk->rcv_saddr = 0;
630 kfree_skb(buff, FREE_WRITE);
631 release_sock(sk);
632 return(-ENETUNREACH);
635 /* No failure conditions can result past this point. */
637 th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
638 buff->h.th = th;
640 memcpy(th,(void *)&(sk->dummy_th), sizeof(*th));
641 buff->seq = sk->write_seq++;
642 th->seq = htonl(buff->seq);
643 tp->snd_nxt = sk->write_seq;
644 buff->end_seq = sk->write_seq;
645 th->ack = 0;
646 th->syn = 1;
648 sk->mtu = rt->u.dst.pmtu;
649 if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
650 (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
651 rt->rt_flags&RTCF_NOPMTUDISC)) &&
652 rt->u.dst.pmtu > 576)
653 sk->mtu = 576;
655 if(sk->mtu < 64)
656 sk->mtu = 64; /* Sanity limit */
658 if (sk->user_mss)
659 sk->mss = sk->user_mss;
660 else
661 sk->mss = (sk->mtu - sizeof(struct iphdr) -
662 sizeof(struct tcphdr));
664 if (sk->mss < 1) {
665 printk(KERN_DEBUG "initial sk->mss below 1\n");
666 sk->mss = 1; /* Sanity limit */
669 tp->window_clamp = rt->u.dst.window;
670 tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
671 &tp->rcv_wnd,
672 &tp->window_clamp,
673 sysctl_tcp_window_scaling,
674 &tp->rcv_wscale);
675 th->window = htons(tp->rcv_wnd);
677 tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_sack,
678 sysctl_tcp_timestamps,
679 sysctl_tcp_window_scaling,tp->rcv_wscale);
680 buff->csum = 0;
681 th->doff = (sizeof(*th)+ tmp)>>2;
683 tcp_v4_send_check(sk, th, sizeof(struct tcphdr) + tmp, buff);
685 tcp_set_state(sk,TCP_SYN_SENT);
687 /* Socket identity change complete, no longer
688 * in TCP_CLOSE, so rehash.
690 tcp_v4_rehash(sk);
692 tp->rto = rt->u.dst.rtt;
694 tcp_init_xmit_timers(sk);
696 /* Now works the right way instead of a hacked initial setting. */
697 tp->retransmits = 0;
699 skb_queue_tail(&sk->write_queue, buff);
701 tp->packets_out++;
702 buff->when = jiffies;
704 ip_queue_xmit(skb_clone(buff, GFP_KERNEL));
706 /* Timer for repeating the SYN until an answer. */
707 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
708 tcp_statistics.TcpActiveOpens++;
709 tcp_statistics.TcpOutSegs++;
711 release_sock(sk);
712 return(0);
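/* Rough order of events above: resolve the route and reject
 * multicast/broadcast destinations, make sure the 4-tuple is unique,
 * allocate the SYN skb, fill in addresses, ports and the secure initial
 * sequence number, build the IP and TCP headers with the SYN options
 * (mss/timestamps/window scaling), move to TCP_SYN_SENT and rehash,
 * then queue and transmit the SYN and arm the retransmit timer.
 */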
715 static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
717 int retval = -EINVAL;
719 /* Do sanity checking for sendmsg/sendto/send. */
720 if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT))
721 goto out;
722 if (msg->msg_name) {
723 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
725 if (msg->msg_namelen < sizeof(*addr))
726 goto out;
727 if (addr->sin_family && addr->sin_family != AF_INET)
728 goto out;
729 retval = -ENOTCONN;
730 if(sk->state == TCP_CLOSE)
731 goto out;
732 retval = -EISCONN;
733 if (addr->sin_port != sk->dummy_th.dest)
734 goto out;
735 if (addr->sin_addr.s_addr != sk->daddr)
736 goto out;
739 lock_sock(sk);
740 retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov,
741 msg->msg_flags);
743 release_sock(sk);
745 out:
746 return retval;
751 * Do a linear search in the socket open_request list.
752 * This should be replaced with a global hash table.
754 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
755 struct iphdr *iph,
756 struct tcphdr *th,
757 struct open_request **prevp)
759 struct open_request *req, *prev;
760 __u16 rport = th->source;
762 /* Assumption: the socket is not in use,
763 * as we checked the user count in tcp_rcv and we're
764 * running from a soft interrupt.
766 prev = (struct open_request *) (&tp->syn_wait_queue);
767 for (req = prev->dl_next; req; req = req->dl_next) {
768 if (req->af.v4_req.rmt_addr == iph->saddr &&
769 req->af.v4_req.loc_addr == iph->daddr &&
770 req->rmt_port == rport) {
771 *prevp = prev;
772 return req;
774 prev = req;
776 return NULL;
781 * This routine does path mtu discovery as defined in RFC1191.
783 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
785 int new_mtu;
786 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
788 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
789 * sent out by Linux are always <576 bytes, so they should go through
790 * unfragmented).
792 if (sk->state == TCP_LISTEN)
793 return;
795 /* We don't check in the dst entry if pmtu discovery is forbidden
796 * on this route. We just assume that no packet-too-big packets
797 * are sent back when pmtu discovery is not active.
798 * There is a small race when the user changes this flag in the
799 * route, but I think that's acceptable.
801 if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) {
802 new_mtu = sk->dst_cache->pmtu -
803 (ip->ihl<<2) - tp->tcp_header_len;
804 if (new_mtu < sk->mss && new_mtu > 0) {
805 sk->mss = new_mtu;
806 /* Resend the TCP packet because it's
807 * clear that the old packet has been
808 * dropped. This is the new "fast" path mtu
809 * discovery.
811 if (!sk->sock_readers)
812 tcp_simple_retransmit(sk);
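/* The value computed above is the cached route's pmtu minus the IP
 * header (ihl<<2) and the current TCP header length, and becomes the
 * new mss. Retransmitting right away instead of waiting for the
 * retransmit timer is the "fast path mtu discovery" mentioned in the
 * changelog at the top of this file.
 */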
818 * This routine is called by the ICMP module when it gets some
819 * sort of error condition. If err < 0 then the socket should
820 * be closed and the error returned to the user. If err > 0
821 * it's just the icmp type << 8 | icmp code. After adjustment
822 * header points to the first 8 bytes of the tcp header. We need
823 * to find the appropriate port.
826 void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
828 struct iphdr *iph = (struct iphdr*)dp;
829 struct tcphdr *th;
830 struct tcp_opt *tp;
831 int type = skb->h.icmph->type;
832 int code = skb->h.icmph->code;
833 struct sock *sk;
834 int opening;
835 #ifdef ICMP_PARANOIA
836 __u32 seq;
837 #endif
839 if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
840 icmp_statistics.IcmpInErrors++;
841 return;
844 th = (struct tcphdr*)(dp+(iph->ihl<<2));
846 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
847 if (sk == NULL) {
848 icmp_statistics.IcmpInErrors++;
849 return;
852 tp = &sk->tp_pinfo.af_tcp;
853 #ifdef ICMP_PARANOIA
854 seq = ntohl(th->seq);
855 if (sk->state != TCP_LISTEN &&
856 !between(seq, tp->snd_una, max(tp->snd_una+32768,tp->snd_nxt))) {
857 if (net_ratelimit())
858 printk(KERN_DEBUG "icmp packet outside the tcp window:"
859 " s:%d %u,%u,%u\n",
860 (int)sk->state, seq, tp->snd_una, tp->snd_nxt);
861 return;
863 #endif
865 switch (type) {
866 case ICMP_SOURCE_QUENCH:
867 tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
868 tp->snd_cwnd = tp->snd_ssthresh;
869 tp->high_seq = tp->snd_nxt;
870 return;
871 case ICMP_PARAMETERPROB:
872 sk->err=EPROTO;
873 sk->error_report(sk);
874 break;
875 case ICMP_DEST_UNREACH:
876 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
877 do_pmtu_discovery(sk, iph);
878 return;
880 break;
883 /* If we've already connected we will keep trying
884 * until we time out, or the user gives up.
886 if (code > NR_ICMP_UNREACH)
887 return;
889 opening = 0;
890 switch (sk->state) {
891 struct open_request *req, *prev;
892 case TCP_LISTEN:
893 /* Prevent race conditions with accept() -
894 * ICMP is unreliable.
896 if (sk->sock_readers) {
897 /* XXX: add a counter here to profile this.
898 * If too many ICMPs get dropped on busy
899 * servers this needs to be solved differently.
901 return;
904 if (!th->syn && !th->ack)
905 return;
906 req = tcp_v4_search_req(tp, iph, th, &prev);
907 if (!req)
908 return;
909 #ifdef ICMP_PARANOIA
910 if (seq != req->snt_isn) {
911 if (net_ratelimit())
912 printk(KERN_DEBUG "icmp packet for openreq "
913 "with wrong seq number:%d:%d\n",
914 seq, req->snt_isn);
915 return;
917 #endif
918 if (req->sk) { /* not yet accept()ed */
919 sk = req->sk; /* report error in accept */
920 } else {
921 tcp_synq_unlink(tp, req, prev);
922 req->class->destructor(req);
923 tcp_openreq_free(req);
925 /* FALL THROUGH */
926 case TCP_SYN_SENT:
927 case TCP_SYN_RECV:
928 opening = 1;
929 break;
932 if(icmp_err_convert[code].fatal || opening) {
933 sk->err = icmp_err_convert[code].errno;
934 if (opening) {
935 tcp_statistics.TcpAttemptFails++;
936 if (sk->state != TCP_LISTEN)
937 tcp_set_state(sk,TCP_CLOSE);
938 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
940 } else /* Only an error on timeout */
941 sk->err_soft = icmp_err_convert[code].errno;
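/* Summary of the ICMP handling above: SOURCE_QUENCH shrinks the
 * congestion window via ssthresh, PARAMETERPROB reports EPROTO,
 * FRAG_NEEDED triggers path MTU discovery, and the remaining
 * unreachable codes are fatal only while a connection is being opened
 * (or when icmp_err_convert marks them fatal); otherwise they are just
 * recorded in err_soft.
 */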
944 /* This routine computes an IPv4 TCP checksum. */
945 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
946 struct sk_buff *skb)
948 th->check = 0;
949 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
950 csum_partial((char *)th, th->doff<<2, skb->csum));
954 * This routine will send an RST to the other tcp.
956 * Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
957 * for the reset?
958 * Answer: if a packet caused an RST, it is not for a socket
959 * existing in our system; if it is matched to a socket,
960 * it is just a duplicate segment or a bug in the other side's TCP.
961 * So we build the reply based only on the parameters
962 * that arrived with the segment.
963 * Exception: precedence violation. We do not implement it in any case.
966 static void tcp_v4_send_reset(struct sk_buff *skb)
968 struct tcphdr *th = skb->h.th;
969 struct sk_buff *skb1;
970 struct tcphdr *th1;
972 if (th->rst)
973 return;
975 skb1 = ip_reply(skb, sizeof(struct tcphdr));
976 if (skb1 == NULL)
977 return;
979 skb1->h.th = th1 = (struct tcphdr *)skb_put(skb1, sizeof(struct tcphdr));
980 memset(th1, 0, sizeof(*th1));
982 /* Swap the send and the receive. */
983 th1->dest = th->source;
984 th1->source = th->dest;
985 th1->doff = sizeof(*th1)/4;
986 th1->rst = 1;
988 if (th->ack)
989 th1->seq = th->ack_seq;
990 else {
991 th1->ack = 1;
992 if (!th->syn)
993 th1->ack_seq = th->seq;
994 else
995 th1->ack_seq = htonl(ntohl(th->seq)+1);
998 skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0);
999 th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr,
1000 skb1->nh.iph->daddr, skb1->csum);
1001 /* FIXME: should this carry an options packet? */
1002 ip_queue_xmit(skb1);
1003 tcp_statistics.TcpOutSegs++;
1004 tcp_statistics.TcpOutRsts++;
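/* The reply above follows RFC 793: if the offending segment carried an
 * ACK, the RST uses that ack_seq as its sequence number; otherwise a
 * RST|ACK is sent acknowledging the segment's sequence number (plus one
 * if it was a SYN).
 */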
1007 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1009 * Check whether a received TCP packet might be for one of our
1010 * connections.
1013 int tcp_chkaddr(struct sk_buff *skb)
1015 struct iphdr *iph = skb->nh.iph;
1016 struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
1017 struct sock *sk;
1019 sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest, skb->dev->ifindex);
1021 if (!sk)
1022 return 0;
1024 /* 0 means accept all LOCAL addresses here, not all the world... */
1026 if (sk->rcv_saddr == 0)
1027 return 0;
1029 return 1;
1031 #endif
1033 static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
1035 struct sk_buff * skb;
1036 struct tcphdr *th;
1037 int tmp;
1038 int mss;
1040 skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
1041 if (skb == NULL)
1042 return;
1044 if(ip_build_pkt(skb, sk, req->af.v4_req.loc_addr,
1045 req->af.v4_req.rmt_addr, req->af.v4_req.opt) < 0) {
1046 kfree_skb(skb, FREE_WRITE);
1047 return;
1050 mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
1051 if (sk->user_mss)
1052 mss = min(mss, sk->user_mss);
1053 skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr));
1055 /* Don't offer more than they did.
1056 * This way we don't have to memorize who said what.
1057 * FIXME: maybe this should be changed for better performance
1058 * with syncookies.
1060 req->mss = min(mss, req->mss);
1062 if (req->mss < 1) {
1063 printk(KERN_DEBUG "initial req->mss below 1\n");
1064 req->mss = 1;
1067 /* Yuck, make this header setup more efficient... -DaveM */
1068 memset(th, 0, sizeof(struct tcphdr));
1069 th->syn = 1;
1070 th->ack = 1;
1071 th->source = sk->dummy_th.source;
1072 th->dest = req->rmt_port;
1073 skb->seq = req->snt_isn;
1074 skb->end_seq = skb->seq + 1;
1075 th->seq = htonl(skb->seq);
1076 th->ack_seq = htonl(req->rcv_isn + 1);
1077 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1078 __u8 rcv_wscale;
1079 /* Set this up on the first call only */
1080 req->window_clamp = skb->dst->window;
1081 tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
1082 &req->rcv_wnd,
1083 &req->window_clamp,
1084 req->wscale_ok,
1085 &rcv_wscale);
1086 req->rcv_wscale = rcv_wscale;
1088 th->window = htons(req->rcv_wnd);
1090 /* XXX Partial csum of 4 byte quantity is itself! -DaveM
1091 * Yes, but it's a bit harder to special case now. It's
1092 * now computed inside the tcp_v4_send_check() to clean up
1093 * updating the options fields in the mainline send code.
1094 * If someone thinks this is really bad let me know and
1095 * I'll try to do it a different way. -- erics
1098 tmp = tcp_syn_build_options(skb, req->mss, req->sack_ok, req->tstamp_ok,
1099 req->wscale_ok,req->rcv_wscale);
1100 skb->csum = 0;
1101 th->doff = (sizeof(*th) + tmp)>>2;
1102 th->check = tcp_v4_check(th, sizeof(*th) + tmp,
1103 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1104 csum_partial((char *)th, sizeof(*th)+tmp, skb->csum));
1106 ip_queue_xmit(skb);
1107 tcp_statistics.TcpOutSegs++;
1110 static void tcp_v4_or_free(struct open_request *req)
1112 if(!req->sk && req->af.v4_req.opt)
1113 kfree_s(req->af.v4_req.opt,
1114 sizeof(struct ip_options) + req->af.v4_req.opt->optlen);
1117 static inline void syn_flood_warning(struct sk_buff *skb)
1119 static unsigned long warntime;
1121 if (jiffies - warntime > HZ*60) {
1122 warntime = jiffies;
1123 printk(KERN_INFO
1124 "possible SYN flooding on port %d. Sending cookies.\n",
1125 ntohs(skb->h.th->dest));
1129 int sysctl_max_syn_backlog = 1024;
1130 int sysctl_tcp_syn_taildrop = 1;
1132 struct or_calltable or_ipv4 = {
1133 tcp_v4_send_synack,
1134 tcp_v4_or_free,
1135 tcp_v4_send_reset
1138 #ifdef NEW_LISTEN
1139 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
1140 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
1141 #else
1142 #define BACKLOG(sk) ((sk)->ack_backlog)
1143 #define BACKLOGMAX(sk) ((sk)->max_ack_backlog)
1144 #endif
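/* With NEW_LISTEN the SYN queue is counted per socket in
 * tp->syn_backlog and limited by the global sysctl_max_syn_backlog;
 * the old semantics reuse the listen() backlog via ack_backlog and
 * max_ack_backlog.
 */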
1146 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
1147 __u32 isn)
1149 struct ip_options *opt = (struct ip_options *) ptr;
1150 struct tcp_opt tp;
1151 struct open_request *req;
1152 struct tcphdr *th = skb->h.th;
1153 __u32 saddr = skb->nh.iph->saddr;
1154 __u32 daddr = skb->nh.iph->daddr;
1155 #ifdef CONFIG_SYN_COOKIES
1156 int want_cookie = 0;
1157 #else
1158 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1159 #endif
1161 /* If the socket is dead, don't accept the connection. */
1162 if (sk->dead)
1163 goto dead;
1165 /* XXX: Check against a global syn pool counter. */
1166 if (BACKLOG(sk) > BACKLOGMAX(sk)) {
1167 #ifdef CONFIG_SYN_COOKIES
1168 if (sysctl_tcp_syncookies) {
1169 syn_flood_warning(skb);
1170 want_cookie = 1;
1171 } else
1172 #endif
1173 if (sysctl_tcp_syn_taildrop) {
1174 struct open_request *req;
1176 req = tcp_synq_unlink_tail(&sk->tp_pinfo.af_tcp);
1177 tcp_openreq_free(req);
1178 tcp_statistics.TcpAttemptFails++;
1179 } else {
1180 goto error;
1182 } else {
1183 if (isn == 0)
1184 isn = tcp_v4_init_sequence(sk, skb);
1185 BACKLOG(sk)++;
1188 req = tcp_openreq_alloc();
1189 if (req == NULL) {
1190 if (!want_cookie) BACKLOG(sk)--;
1191 goto error;
1194 req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
1196 req->rcv_isn = skb->seq;
1197 tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
1198 tp.in_mss = 536;
1199 tcp_parse_options(th,&tp,want_cookie);
1200 if (tp.saw_tstamp)
1201 req->ts_recent = tp.rcv_tsval;
1202 req->mss = tp.in_mss;
1203 req->tstamp_ok = tp.tstamp_ok;
1204 req->sack_ok = tp.sack_ok;
1205 req->snd_wscale = tp.snd_wscale;
1206 req->wscale_ok = tp.wscale_ok;
1207 req->rmt_port = th->source;
1208 req->af.v4_req.loc_addr = daddr;
1209 req->af.v4_req.rmt_addr = saddr;
1211 /* Note that we ignore the isn passed from the TIME_WAIT
1212 * state here. That's the price we pay for cookies.
1214 if (want_cookie)
1215 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1217 req->snt_isn = isn;
1219 /* IPv4 options */
1220 req->af.v4_req.opt = NULL;
1222 if (opt && opt->optlen) {
1223 int opt_size = sizeof(struct ip_options) + opt->optlen;
1225 req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
1226 if (req->af.v4_req.opt) {
1227 if (ip_options_echo(req->af.v4_req.opt, skb)) {
1228 kfree_s(req->af.v4_req.opt, opt_size);
1229 req->af.v4_req.opt = NULL;
1233 req->class = &or_ipv4;
1234 req->retrans = 0;
1235 req->sk = NULL;
1237 tcp_v4_send_synack(sk, req);
1239 if (want_cookie) {
1240 if (req->af.v4_req.opt)
1241 kfree(req->af.v4_req.opt);
1242 tcp_openreq_free(req);
1243 } else {
1244 req->expires = jiffies + TCP_TIMEOUT_INIT;
1245 tcp_inc_slow_timer(TCP_SLT_SYNACK);
1246 tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
1249 sk->data_ready(sk, 0);
1250 exit:
1251 return 0;
1253 dead:
1254 SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk);
1255 tcp_statistics.TcpAttemptFails++;
1256 return -ENOTCONN;
1257 error:
1258 tcp_statistics.TcpAttemptFails++;
1259 goto exit;
1262 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1263 struct open_request *req,
1264 struct dst_entry *dst)
1266 struct tcp_opt *newtp;
1267 struct sock *newsk;
1268 int snd_mss;
1270 #ifdef NEW_LISTEN
1271 if (sk->ack_backlog > sk->max_ack_backlog)
1272 goto exit; /* head drop */
1273 #endif
1274 newsk = sk_alloc(AF_INET, GFP_ATOMIC);
1275 if (!newsk)
1276 goto exit;
1277 #ifdef NEW_LISTEN
1278 sk->ack_backlog++;
1279 #endif
1280 memcpy(newsk, sk, sizeof(*newsk));
1282 /* Or else we die! -DaveM */
1283 newsk->sklist_next = NULL;
1285 newsk->opt = req->af.v4_req.opt;
1287 skb_queue_head_init(&newsk->write_queue);
1288 skb_queue_head_init(&newsk->receive_queue);
1289 skb_queue_head_init(&newsk->out_of_order_queue);
1290 skb_queue_head_init(&newsk->error_queue);
1292 /* Unused */
1293 newtp = &(newsk->tp_pinfo.af_tcp);
1294 newtp->send_head = NULL;
1295 newtp->retrans_head = NULL;
1297 newtp->pending = 0;
1299 skb_queue_head_init(&newsk->back_log);
1301 newsk->prot->init(newsk);
1303 newtp->snd_cwnd_cnt = 0;
1304 newtp->backoff = 0;
1305 newsk->proc = 0;
1306 newsk->done = 0;
1307 newsk->pair = NULL;
1308 atomic_set(&newsk->wmem_alloc, 0);
1309 atomic_set(&newsk->rmem_alloc, 0);
1310 newsk->localroute = sk->localroute;
1312 newsk->err = 0;
1313 newsk->shutdown = 0;
1314 newsk->ack_backlog = 0;
1316 newtp->fin_seq = req->rcv_isn;
1317 newsk->syn_seq = req->rcv_isn;
1318 newsk->state = TCP_SYN_RECV;
1319 newsk->timeout = 0;
1321 newsk->write_seq = req->snt_isn;
1323 newtp->snd_wnd = ntohs(skb->h.th->window);
1324 newtp->max_window = newtp->snd_wnd;
1325 newtp->snd_wl1 = req->rcv_isn;
1326 newtp->snd_wl2 = newsk->write_seq;
1327 newtp->snd_una = newsk->write_seq++;
1328 newtp->snd_nxt = newsk->write_seq;
1330 newsk->urg_data = 0;
1331 newtp->packets_out = 0;
1332 newtp->retransmits = 0;
1333 newsk->linger=0;
1334 newsk->destroy = 0;
1335 init_timer(&newsk->timer);
1336 newsk->timer.data = (unsigned long) newsk;
1337 newsk->timer.function = &net_timer;
1339 tcp_init_xmit_timers(newsk);
1341 newsk->dummy_th.source = sk->dummy_th.source;
1342 newsk->dummy_th.dest = req->rmt_port;
1343 newsk->sock_readers=0;
1345 newtp->last_ack_sent = newtp->rcv_nxt = req->rcv_isn + 1;
1346 newtp->rcv_wup = req->rcv_isn + 1;
1347 newsk->copied_seq = req->rcv_isn + 1;
1349 newsk->socket = NULL;
1351 newsk->daddr = req->af.v4_req.rmt_addr;
1352 newsk->saddr = req->af.v4_req.loc_addr;
1353 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1355 /* options / mss / route_cache */
1356 if (dst == NULL) {
1357 struct rtable *rt;
1359 if (ip_route_output(&rt,
1360 newsk->opt && newsk->opt->srr ?
1361 newsk->opt->faddr : newsk->daddr,
1362 newsk->saddr, newsk->ip_tos, 0)) {
1363 sk_free(newsk);
1364 return NULL;
1366 dst = &rt->u.dst;
1368 newsk->dst_cache = dst;
1370 snd_mss = dst->pmtu;
1372 /* FIXME: is mtu really the same as snd_mss? */
1373 newsk->mtu = snd_mss;
1374 /* FIXME: where does mtu get used after this? */
1375 /* sanity check */
1376 if (newsk->mtu < 64)
1377 newsk->mtu = 64;
1379 newtp->sack_ok = req->sack_ok;
1380 newtp->tstamp_ok = req->tstamp_ok;
1381 newtp->window_clamp = req->window_clamp;
1382 newtp->rcv_wnd = req->rcv_wnd;
1383 newtp->wscale_ok = req->wscale_ok;
1384 if (newtp->wscale_ok) {
1385 newtp->snd_wscale = req->snd_wscale;
1386 newtp->rcv_wscale = req->rcv_wscale;
1387 } else {
1388 newtp->snd_wscale = newtp->rcv_wscale = 0;
1389 newtp->window_clamp = min(newtp->window_clamp,65535);
1391 if (newtp->tstamp_ok) {
1392 newtp->ts_recent = req->ts_recent;
1393 newtp->ts_recent_stamp = jiffies;
1394 newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define constant! */
1395 newsk->dummy_th.doff += 3;
1396 } else {
1397 newtp->tcp_header_len = sizeof(struct tcphdr);
1400 snd_mss -= sizeof(struct iphdr) + sizeof(struct tcphdr);
1401 if (sk->user_mss)
1402 snd_mss = min(snd_mss, sk->user_mss);
1404 /* Make sure our mtu is adjusted for headers. */
1405 newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len;
1407 tcp_v4_hash(newsk);
1408 add_to_prot_sklist(newsk);
1409 return newsk;
1411 exit:
1412 if (dst)
1413 dst_release(dst);
1414 return NULL;
1417 static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
1419 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1420 struct open_request *req, *prev;
1422 req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev);
1423 if (!req)
1424 return;
1425 /* Sequence number check required by RFC793 */
1426 if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1))
1427 return;
1428 tcp_synq_unlink(tp, req, prev);
1429 req->class->destructor(req);
1430 tcp_openreq_free(req);
1433 /* Check for embryonic sockets (open_requests) We check packets with
1434 * only the SYN bit set against the open_request queue too: This
1435 * increases connection latency a bit, but is required to detect
1436 * retransmitted SYNs.
1438 static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1440 struct tcphdr *th = skb->h.th;
1441 u32 flg = ((u32 *)th)[3];
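/* ((u32 *)th)[3] is the fourth 32-bit word of the TCP header, i.e. data
 * offset, flags and window. Masking it with __constant_htonl(0x00040000)
 * tests the RST bit, and with __constant_htonl(0x00120000) the SYN and
 * ACK bits.
 */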
1443 /* Check for RST */
1444 if (flg & __constant_htonl(0x00040000)) {
1445 tcp_v4_rst_req(sk, skb);
1446 return NULL;
1449 /* Check for SYN|ACK */
1450 if (flg & __constant_htonl(0x00120000)) {
1451 struct open_request *req, *dummy;
1452 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1454 /* Find possible connection requests. */
1455 req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy);
1456 if (req) {
1457 sk = tcp_check_req(sk, skb, req);
1459 #ifdef CONFIG_SYN_COOKIES
1460 else {
1461 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1463 #endif
1465 return sk;
1468 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1470 #ifdef CONFIG_FILTER
1471 if (sk->filter)
1473 if (sk_filter(skb, sk->filter_data, sk->filter))
1474 return -EPERM; /* Toss packet */
1476 #endif /* CONFIG_FILTER */
1478 skb_set_owner_r(skb, sk);
1481 * socket locking is here for SMP purposes as backlog rcv
1482 * is currently called with bh processing disabled.
1484 lock_sock(sk);
1486 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1487 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1488 goto reset;
1489 release_sock(sk);
1490 return 0;
1494 if (sk->state == TCP_LISTEN) {
1495 struct sock *nsk;
1497 nsk = tcp_v4_hnd_req(sk, skb);
1498 if (!nsk)
1499 goto discard;
1500 lock_sock(nsk);
1501 release_sock(sk);
1502 sk = nsk;
1505 if (tcp_rcv_state_process(sk, skb, skb->h.th,
1506 &(IPCB(skb)->opt), skb->len))
1507 goto reset;
1508 release_sock(sk);
1509 return 0;
1511 reset:
1512 tcp_v4_send_reset(skb);
1513 discard:
1514 kfree_skb(skb, FREE_READ);
1515 /* Be careful here. If this function gets more complicated and
1516 * gcc suffers from register pressure on the x86, sk (in %ebx)
1517 * might be destroyed here. This current version compiles correctly,
1518 * but you have been warned.
1520 release_sock(sk);
1521 return 0;
1525 * From tcp_input.c
1528 int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
1530 struct tcphdr *th;
1531 struct sock *sk;
1533 if (skb->pkt_type!=PACKET_HOST)
1534 goto discard_it;
1536 th = skb->h.th;
1538 /* Pull up the IP header. */
1539 __skb_pull(skb, skb->h.raw - skb->data);
1541 /* Count it even if it's bad */
1542 tcp_statistics.TcpInSegs++;
1544 /* Try to use the device checksum if provided. */
1545 switch (skb->ip_summed) {
1546 case CHECKSUM_NONE:
1547 skb->csum = csum_partial((char *)th, len, 0);
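/* Deliberate fall through: after computing the partial checksum for
 * CHECKSUM_NONE we still verify it exactly as for CHECKSUM_HW;
 * CHECKSUM_UNNECESSARY skips the check entirely.
 */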
1548 case CHECKSUM_HW:
1549 if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
1550 printk(KERN_DEBUG "TCPv4 bad checksum from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, len=%d/%d/%d\n",
1551 NIPQUAD(skb->nh.iph->saddr), ntohs(th->source), NIPQUAD(skb->nh.iph->daddr),
1552 ntohs(th->dest), len, skb->len, ntohs(skb->nh.iph->tot_len));
1553 tcp_statistics.TcpInErrs++;
1554 goto discard_it;
1556 default:
1557 /* CHECKSUM_UNNECESSARY */
1560 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1561 if (IPCB(skb)->redirport)
1562 sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source,
1563 skb->nh.iph->daddr, skb->dev,
1564 IPCB(skb)->redirport, skb->dev->ifindex);
1565 else
1566 #endif
1567 sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source,
1568 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1569 if (!sk)
1570 goto no_tcp_socket;
1571 if(!ipsec_sk_policy(sk,skb))
1572 goto discard_it;
1574 skb->seq = ntohl(th->seq);
1575 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
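/* end_seq is the sequence number just past this segment: the payload
 * (len minus the header length) plus one each for SYN and FIN, which
 * occupy sequence space per RFC 793.
 */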
1576 skb->ack_seq = ntohl(th->ack_seq);
1578 skb->used = 0;
1580 if (!sk->sock_readers)
1581 return tcp_v4_do_rcv(sk, skb);
1583 __skb_queue_tail(&sk->back_log, skb);
1584 return 0;
1586 no_tcp_socket:
1587 tcp_v4_send_reset(skb);
1589 discard_it:
1590 /* Discard frame. */
1591 kfree_skb(skb, FREE_READ);
1592 return 0;
1595 int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb)
1597 return ip_build_header(skb, sk);
1600 int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb)
1602 struct rtable *rt;
1603 struct iphdr *iph;
1604 struct tcphdr *th;
1605 int size;
1606 int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
1608 /* Check route */
1610 rt = (struct rtable*)skb->dst;
1612 /* Force route checking if want_rewrite */
1613 if (want_rewrite) {
1614 int tmp;
1615 __u32 old_saddr = rt->rt_src;
1617 /* Query new route */
1618 tmp = ip_route_connect(&rt, rt->rt_dst, 0,
1619 RT_TOS(sk->ip_tos)|(sk->localroute||0),
1620 sk->bound_dev_if);
1622 /* Only useful if different source addrs */
1623 if (tmp == 0 || rt->rt_src != old_saddr ) {
1624 dst_release(skb->dst);
1625 skb->dst = &rt->u.dst;
1626 } else {
1627 want_rewrite = 0;
1628 dst_release(&rt->u.dst);
1630 } else
1631 if (rt->u.dst.obsolete) {
1632 int err;
1633 err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos, rt->key.oif);
1634 if (err) {
1635 sk->err_soft=-err;
1636 sk->error_report(skb->sk);
1637 return -1;
1639 dst_release(skb->dst);
1640 skb->dst = &rt->u.dst;
1643 /* Discard the surplus MAC header. */
1644 skb_pull(skb, skb->nh.raw-skb->data);
1646 iph = skb->nh.iph;
1647 th = skb->h.th;
1648 size = skb->tail - skb->h.raw;
1650 if (want_rewrite) {
1651 __u32 new_saddr = rt->rt_src;
1654 * Ouch!, this should not happen.
1656 if (!sk->saddr || !sk->rcv_saddr) {
1657 printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: saddr=%08lX rcv_saddr=%08lX\n",
1658 ntohl(sk->saddr),
1659 ntohl(sk->rcv_saddr));
1660 return 0;
1664 * Maybe we are in an skb chain loop and the socket address has
1665 * not yet been 'damaged'.
1668 if (new_saddr != sk->saddr) {
1669 if (sysctl_ip_dynaddr > 1) {
1670 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1671 NIPQUAD(sk->saddr),
1672 NIPQUAD(new_saddr));
1675 sk->saddr = new_saddr;
1676 sk->rcv_saddr = new_saddr;
1677 /* sk->prot->rehash(sk); */
1678 tcp_v4_rehash(sk);
1681 if (new_saddr != iph->saddr) {
1682 if (sysctl_ip_dynaddr > 1) {
1683 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting iph->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1684 NIPQUAD(iph->saddr),
1685 NIPQUAD(new_saddr));
1688 iph->saddr = new_saddr;
1689 ip_send_check(iph);
1694 return 0;
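/* The ip_dynaddr path above: while still in SYN_SENT, a changed
 * preferred source address on the re-resolved route is written back
 * into the socket and into the queued segment's IP header, the socket
 * is rehashed, and with sysctl_ip_dynaddr > 1 each rewrite is logged.
 */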
1697 static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
1699 return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1700 skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
1703 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1705 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1707 sin->sin_family = AF_INET;
1708 sin->sin_addr.s_addr = sk->daddr;
1709 sin->sin_port = sk->dummy_th.dest;
1712 struct tcp_func ipv4_specific = {
1713 tcp_v4_build_header,
1714 ip_queue_xmit,
1715 tcp_v4_send_check,
1716 tcp_v4_rebuild_header,
1717 tcp_v4_conn_request,
1718 tcp_v4_syn_recv_sock,
1719 tcp_v4_get_sock,
1720 ip_setsockopt,
1721 ip_getsockopt,
1722 v4_addr2sockaddr,
1723 sizeof(struct sockaddr_in)
1726 static int tcp_v4_init_sock(struct sock *sk)
1728 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1730 skb_queue_head_init(&sk->out_of_order_queue);
1731 tcp_init_xmit_timers(sk);
1733 tp->srtt = 0;
1734 tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
1735 tp->mdev = TCP_TIMEOUT_INIT;
1737 tp->ato = 0;
1738 tp->iat = (HZ/5) << 3;
1740 /* FIXME: tie this to sk->rcvbuf? (May be unnecessary) */
1741 /* tp->rcv_wnd = 8192; */
1742 tp->tstamp_ok = 0;
1743 tp->sack_ok = 0;
1744 tp->wscale_ok = 0;
1745 tp->in_mss = 536;
1746 tp->snd_wscale = 0;
1747 tp->sacks = 0;
1748 tp->saw_tstamp = 0;
1749 tp->syn_backlog = 0;
1752 * See draft-stevens-tcpca-spec-01 for discussion of the
1753 * initialization of these values.
1755 tp->snd_cwnd = 1;
1756 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1758 sk->priority = 1;
1759 sk->state = TCP_CLOSE;
1761 sk->max_ack_backlog = SOMAXCONN;
1763 sk->mtu = 576;
1764 sk->mss = 536;
1766 /* Speed up by setting some standard state for the dummy_th. */
1767 sk->dummy_th.ack=1;
1768 sk->dummy_th.doff=sizeof(struct tcphdr)>>2;
1770 /* Init SYN queue. */
1771 tcp_synq_init(tp);
1773 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1775 return 0;
1778 static int tcp_v4_destroy_sock(struct sock *sk)
1780 struct sk_buff *skb;
1782 tcp_clear_xmit_timers(sk);
1784 if (sk->keepopen)
1785 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1787 /* Clean up the write buffer. */
1788 while((skb = skb_dequeue(&sk->write_queue)) != NULL)
1789 kfree_skb(skb, FREE_WRITE);
1791 /* Clean up our, hopefully empty, out_of_order_queue. */
1792 while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL)
1793 kfree_skb(skb, FREE_READ);
1795 return 0;
1798 struct proto tcp_prot = {
1799 (struct sock *)&tcp_prot, /* sklist_next */
1800 (struct sock *)&tcp_prot, /* sklist_prev */
1801 tcp_close, /* close */
1802 tcp_v4_connect, /* connect */
1803 tcp_accept, /* accept */
1804 NULL, /* retransmit */
1805 tcp_write_wakeup, /* write_wakeup */
1806 tcp_read_wakeup, /* read_wakeup */
1807 tcp_poll, /* poll */
1808 tcp_ioctl, /* ioctl */
1809 tcp_v4_init_sock, /* init */
1810 tcp_v4_destroy_sock, /* destroy */
1811 tcp_shutdown, /* shutdown */
1812 tcp_setsockopt, /* setsockopt */
1813 tcp_getsockopt, /* getsockopt */
1814 tcp_v4_sendmsg, /* sendmsg */
1815 tcp_recvmsg, /* recvmsg */
1816 NULL, /* bind */
1817 tcp_v4_do_rcv, /* backlog_rcv */
1818 tcp_v4_hash, /* hash */
1819 tcp_v4_unhash, /* unhash */
1820 tcp_v4_rehash, /* rehash */
1821 tcp_good_socknum, /* good_socknum */
1822 tcp_v4_verify_bind, /* verify_bind */
1823 128, /* max_header */
1824 0, /* retransmits */
1825 "TCP", /* name */
1826 0, /* inuse */
1827 0 /* highestinuse */