/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

int sysctl_tcp_tw_recycle;
int sysctl_tcp_max_tw_buckets = NR_FILE*2;

int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_abort_on_overflow;
static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
        if (seq == s_win)
                return 1;
        if (after(end_seq, s_win) && before(seq, e_win))
                return 1;
        return (seq == e_win && seq == end_seq);
}
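/* Illustrative sketch, not part of the original file: tcp_in_window()
 * above asks whether any part of [seq, end_seq] overlaps the receive
 * window [s_win, e_win), using the wrap-safe before()/after() helpers
 * instead of plain comparisons.  The hypothetical wrapper below (the
 * name and parameters are the editor's, not a kernel API) shows the
 * call shape used later in this file, where the window is
 * [rcv_nxt, rcv_nxt + rcv_wnd).
 */
static inline int example_segment_acceptable(u32 seq, u32 end_seq,
                                             u32 rcv_nxt, u32 rcv_wnd)
{
        /* All arithmetic is modulo 2^32, which is why '<' would break
         * near sequence-number wrap.
         */
        return tcp_in_window(seq, end_seq, rcv_nxt, rcv_nxt + rcv_wnd);
}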
/* New-style handling of TIME_WAIT sockets. */

int tcp_tw_count;
/* Must be called with locally disabled BHs. */
static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
{
        struct tcp_ehash_bucket *ehead;
        struct tcp_bind_hashbucket *bhead;
        struct tcp_bind_bucket *tb;

        /* Unlink from established hashes. */
        ehead = &tcp_ehash[tw->tw_hashent];
        write_lock(&ehead->lock);
        if (hlist_unhashed(&tw->tw_node)) {
                write_unlock(&ehead->lock);
                return;
        }
        __hlist_del(&tw->tw_node);
        sk_node_init(&tw->tw_node);
        write_unlock(&ehead->lock);

        /* Disassociate with bind bucket. */
        bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
        spin_lock(&bhead->lock);
        tb = tw->tw_tb;
        __hlist_del(&tw->tw_bind_node);
        tw->tw_tb = NULL;
        tcp_bucket_destroy(tb);
        spin_unlock(&bhead->lock);

#ifdef INET_REFCNT_DEBUG
        if (atomic_read(&tw->tw_refcnt) != 1) {
                printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
                       atomic_read(&tw->tw_refcnt));
        }
#endif
        tcp_tw_put(tw);
}
/*
 * * Main purpose of TIME-WAIT state is to close connection gracefully,
 *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
 *   (and, probably, tail of data) and one or more our ACKs are lost.
 * * What is TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which results in wrong conclusion, that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   It is not quite correct. This timeout is calculated so that it exceeds
 *   maximal retransmission timeout enough to allow to lose one (or more)
 *   segments sent by peer and our ACKs. This time may be calculated from RTO.
 * * When TIME-WAIT socket receives RST, it means that another end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * Second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) TW bucket
 * is _not_ stateless. It means, that strictly speaking we must
 * spinlock it. I do not want! Well, probability of misbehaviour
 * is ridiculously low and, seems, we could use some mb() tricks
 * to avoid misread sequence numbers, states etc.  --ANK
 */
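/* Illustrative sketch, not part of the original file: the PAWS idea
 * referred to above boils down to a wrap-safe comparison of the peer's
 * TSval against the last timestamp we accepted; a segment whose TSval
 * is "older" than ts_recent is treated as an old duplicate.  The
 * hypothetical helper below (editor's name) only models that core
 * comparison; the real tcp_paws_check() also takes the age of
 * ts_recent_stamp and RST segments into account.
 */
static inline int example_paws_is_old(u32 rcv_tsval, u32 ts_recent)
{
        /* Signed 32-bit difference handles timestamp wrap-around. */
        return (s32)(rcv_tsval - ts_recent) < 0;
}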
enum tcp_tw_status
tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
                           struct tcphdr *th, unsigned len)
{
        struct tcp_opt tp;
        int paws_reject = 0;

        tp.saw_tstamp = 0;
        if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
                tcp_parse_options(skb, &tp, 0);

                if (tp.saw_tstamp) {
                        tp.ts_recent = tw->tw_ts_recent;
                        tp.ts_recent_stamp = tw->tw_ts_recent_stamp;
                        paws_reject = tcp_paws_check(&tp, th->rst);
                }
        }

        if (tw->tw_substate == TCP_FIN_WAIT2) {
                /* Just repeat all the checks of tcp_rcv_state_process() */

                /* Out of window, send ACK */
                if (paws_reject ||
                    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                                   tw->tw_rcv_nxt,
                                   tw->tw_rcv_nxt + tw->tw_rcv_wnd))
                        return TCP_TW_ACK;

                if (th->rst)
                        goto kill;

                if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
                        goto kill_with_rst;

                /* Dup ACK? */
                if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
                    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
                        tcp_tw_put(tw);
                        return TCP_TW_SUCCESS;
                }

                /* New data or FIN. If new data arrive after half-duplex close,
                 * reset.
                 */
                if (!th->fin ||
                    TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
kill_with_rst:
                        tcp_tw_deschedule(tw);
                        tcp_tw_put(tw);
                        return TCP_TW_RST;
                }

                /* FIN arrived, enter true time-wait state. */
                tw->tw_substate = TCP_TIME_WAIT;
                tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                if (tp.saw_tstamp) {
                        tw->tw_ts_recent_stamp = xtime.tv_sec;
                        tw->tw_ts_recent = tp.rcv_tsval;
                }

                /* I am shamed, but failed to make it more elegant.
                 * Yes, it is direct reference to IP, which is impossible
                 * to generalize to IPv6. Taking into account that IPv6
                 * does not understand recycling in any case, it is not
                 * a big problem in practice. --ANK */
                if (tw->tw_family == AF_INET &&
                    sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
                    tcp_v4_tw_remember_stamp(tw))
                        tcp_tw_schedule(tw, tw->tw_timeout);
                else
                        tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
                return TCP_TW_ACK;
        }

        /*
         *      Now real TIME-WAIT state.
         *
         *      RFC 1122:
         *      "When a connection is [...] on TIME-WAIT state [...]
         *      [a TCP] MAY accept a new SYN from the remote TCP to
         *      reopen the connection directly, if it:
         *
         *      (1)  assigns its initial sequence number for the new
         *      connection to be larger than the largest sequence
         *      number it used on the previous connection incarnation,
         *      and
         *
         *      (2)  returns to TIME-WAIT state if the SYN turns out
         *      to be an old duplicate".
         */

        if (!paws_reject &&
            (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
             (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
                /* In window segment, it may be only reset or bare ack. */

                if (th->rst) {
                        /* This is TIME_WAIT assassination, in two flavors.
                         * Oh well... nobody has a sufficient solution to this
                         * protocol bug yet.
                         */
                        if (sysctl_tcp_rfc1337 == 0) {
kill:
                                tcp_tw_deschedule(tw);
                                tcp_tw_put(tw);
                                return TCP_TW_SUCCESS;
                        }
                }
                tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

                if (tp.saw_tstamp) {
                        tw->tw_ts_recent = tp.rcv_tsval;
                        tw->tw_ts_recent_stamp = xtime.tv_sec;
                }

                tcp_tw_put(tw);
                return TCP_TW_SUCCESS;
        }

        /* Out of window segment.

           All the segments are ACKed immediately.

           The only exception is new SYN. We accept it, if it is
           not old duplicate and we are not in danger to be killed
           by delayed old duplicates. The RFC check, that it has a
           newer sequence number, works at rates <40Mbit/sec.
           However, if paws works, it is reliable AND even more,
           we even may relax silly seq space cutoff.

           RED-PEN: we violate main RFC requirement, if this SYN will appear
           old duplicate (i.e. we receive RST in reply to SYN-ACK),
           we must return socket to time-wait state. It is not good,
           but not fatal yet.
         */

        if (th->syn && !th->rst && !th->ack && !paws_reject &&
            (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
             (tp.saw_tstamp && (s32)(tw->tw_ts_recent - tp.rcv_tsval) < 0))) {
                u32 isn = tw->tw_snd_nxt + 65535 + 2;
                if (isn == 0)
                        isn++;
                TCP_SKB_CB(skb)->when = isn;
                return TCP_TW_SYN;
        }

        if (paws_reject)
                NET_INC_STATS_BH(PAWSEstabRejected);

        if (!th->rst) {
                /* In this case we must reset the TIMEWAIT timer.
                 *
                 * If it is ACKless SYN it may be both old duplicate
                 * and new good SYN with random sequence number <rcv_nxt.
                 * Do not reschedule in the last case.
                 */
                if (paws_reject || th->ack)
                        tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

                /* Send ACK. Note, we do not put the bucket,
                 * it will be released by caller.
                 */
                return TCP_TW_ACK;
        }
        tcp_tw_put(tw);
        return TCP_TW_SUCCESS;
}
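/* Illustrative sketch, not part of the original file: the RFC 1122
 * reopen rule quoted in tcp_timewait_state_process() above requires
 * the new incarnation's ISN to be larger than anything used by the
 * old one.  The code achieves that with tw_snd_nxt + 65535 + 2,
 * i.e. well beyond anything the previous connection could still have
 * in flight, and never hands back 0.  The hypothetical helper below
 * (editor's name, not a kernel API) restates only that arithmetic.
 */
static inline u32 example_reopen_isn(u32 tw_snd_nxt)
{
        u32 isn = tw_snd_nxt + 65535 + 2;

        if (isn == 0)
                isn++;
        return isn;
}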
/* Enter the time wait state. This is called with locally disabled BH.
 * Essentially we whip up a timewait bucket, copy the
 * relevant info into it from the SK, and mess with hash chains
 * and list linkage.
 */
static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
{
        struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
        struct tcp_bind_hashbucket *bhead;

        /* Step 1: Put TW into bind hash. Original socket stays there too.
           Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
           binding cache, even if it is closed.
         */
        bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
        spin_lock(&bhead->lock);
        tw->tw_tb = tcp_sk(sk)->bind_hash;
        BUG_TRAP(tcp_sk(sk)->bind_hash);
        tw_add_bind_node(tw, &tw->tw_tb->owners);
        spin_unlock(&bhead->lock);

        write_lock(&ehead->lock);

        /* Step 2: Remove SK from established hash. */
        if (__sk_del_node_init(sk))
                sock_prot_dec_use(sk->sk_prot);

        /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
        tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
        atomic_inc(&tw->tw_refcnt);

        write_unlock(&ehead->lock);
}
/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
        struct tcp_tw_bucket *tw = NULL;
        struct tcp_opt *tp = tcp_sk(sk);
        int recycle_ok = 0;

        if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp)
                recycle_ok = tp->af_specific->remember_stamp(sk);

        if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
                tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);

        if (tw != NULL) {
                struct inet_opt *inet = inet_sk(sk);
                int rto = (tp->rto << 2) - (tp->rto >> 1);

                /* Give us an identity. */
                tw->tw_daddr = inet->daddr;
                tw->tw_rcv_saddr = inet->rcv_saddr;
                tw->tw_bound_dev_if = sk->sk_bound_dev_if;
                tw->tw_num = inet->num;
                tw->tw_state = TCP_TIME_WAIT;
                tw->tw_substate = state;
                tw->tw_sport = inet->sport;
                tw->tw_dport = inet->dport;
                tw->tw_family = sk->sk_family;
                tw->tw_reuse = sk->sk_reuse;
                tw->tw_rcv_wscale = tp->rcv_wscale;
                atomic_set(&tw->tw_refcnt, 1);

                tw->tw_hashent = sk->sk_hashent;
                tw->tw_rcv_nxt = tp->rcv_nxt;
                tw->tw_snd_nxt = tp->snd_nxt;
                tw->tw_rcv_wnd = tcp_receive_window(tp);
                tw->tw_ts_recent = tp->ts_recent;
                tw->tw_ts_recent_stamp = tp->ts_recent_stamp;
                tw_dead_node_init(tw);

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                if (tw->tw_family == PF_INET6) {
                        struct ipv6_pinfo *np = inet6_sk(sk);

                        ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
                        ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
                }
#endif
                /* Linkage updates. */
                __tcp_tw_hashdance(sk, tw);

                /* Get the TIME_WAIT timeout firing. */
                if (timeo < rto)
                        timeo = rto;

                if (recycle_ok) {
                        tw->tw_timeout = rto;
                } else {
                        tw->tw_timeout = TCP_TIMEWAIT_LEN;
                        if (state == TCP_TIME_WAIT)
                                timeo = TCP_TIMEWAIT_LEN;
                }

                tcp_tw_schedule(tw, timeo);
                tcp_tw_put(tw);
        } else {
                /* Sorry, if we're out of memory, just CLOSE this
                 * socket up.  We've got bigger problems than
                 * non-graceful socket closings.
                 */
                if (net_ratelimit())
                        printk(KERN_INFO "TCP: time wait bucket table overflow\n");
        }

        tcp_update_metrics(sk);
        tcp_done(sk);
}
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
static int tcp_tw_death_row_slot;

static void tcp_twkill(unsigned long);

/* TIME_WAIT reaping mechanism. */
#define TCP_TWKILL_SLOTS        8       /* Please keep this a power of 2. */
#define TCP_TWKILL_PERIOD       (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)

#define TCP_TWKILL_QUOTA        100

static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
static void twkill_work(void *);
static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
static u32 twkill_thread_slots;
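/* Illustrative sketch, not part of the original file: the reaper timer
 * fires every TCP_TWKILL_PERIOD and empties one of the TCP_TWKILL_SLOTS
 * buckets, so a given bucket is revisited after a full TCP_TIMEWAIT_LEN.
 * With the usual TCP_TIMEWAIT_LEN of 60*HZ and 8 slots that is 7.5
 * seconds worth of jiffies per slot.  The hypothetical helper below
 * (editor's name) just restates the define above.
 */
static inline unsigned long example_twkill_period(unsigned long timewait_len,
                                                  unsigned int slots)
{
        /* e.g. (60 * HZ) / 8 == 7.5 seconds worth of jiffies */
        return timewait_len / slots;
}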
/* Returns non-zero if quota exceeded.  */
static int tcp_do_twkill_work(int slot, unsigned int quota)
{
        struct tcp_tw_bucket *tw;
        struct hlist_node *node, *safe;
        unsigned int killed;
        int ret;

        /* NOTE: compare this to previous version where lock
         * was released after detaching chain. It was racy,
         * because tw buckets are scheduled in not serialized context
         * in 2.3 (with netfilter), and with softnet it is common, because
         * soft irqs are not sequenced.
         */
        killed = 0;
        ret = 0;
        tw_for_each_inmate(tw, node, safe,
                           &tcp_tw_death_row[slot]) {
                __tw_del_dead_node(tw);
                spin_unlock(&tw_death_lock);
                tcp_timewait_kill(tw);
                tcp_tw_put(tw);
                killed++;
                spin_lock(&tw_death_lock);
                if (killed > quota) {
                        ret = 1;
                        break;
                }
        }

        tcp_tw_count -= killed;
        NET_ADD_STATS_BH(TimeWaited, killed);

        return ret;
}
static void tcp_twkill(unsigned long dummy)
{
        int need_timer, ret;

        spin_lock(&tw_death_lock);

        if (tcp_tw_count == 0)
                goto out;

        need_timer = 0;
        ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
        if (ret) {
                twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
                mb();
                schedule_work(&tcp_twkill_work);
                need_timer = 1;
        } else {
                /* We purged the entire slot, anything left?  */
                if (tcp_tw_count)
                        need_timer = 1;
        }
        tcp_tw_death_row_slot =
                ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
        if (need_timer)
                mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
out:
        spin_unlock(&tw_death_lock);
}
extern void twkill_slots_invalid(void);

static void twkill_work(void *dummy)
{
        int i;

        if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
                twkill_slots_invalid();

        while (twkill_thread_slots) {
                spin_lock_bh(&tw_death_lock);
                for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
                        if (!(twkill_thread_slots & (1 << i)))
                                continue;

                        while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
                                if (need_resched()) {
                                        spin_unlock_bh(&tw_death_lock);
                                        schedule();
                                        spin_lock_bh(&tw_death_lock);
                                }
                        }

                        twkill_thread_slots &= ~(1 << i);
                }
                spin_unlock_bh(&tw_death_lock);
        }
}
/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */

/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
        spin_lock(&tw_death_lock);
        if (tw_del_dead_node(tw)) {
                tcp_tw_put(tw);
                if (--tcp_tw_count == 0)
                        del_timer(&tcp_tw_timer);
        }
        spin_unlock(&tw_death_lock);
        tcp_timewait_kill(tw);
}
/* Short-time timewait calendar */

static int tcp_twcal_hand = -1;
static int tcp_twcal_jiffie;
static void tcp_twcal_tick(unsigned long);
static struct timer_list tcp_twcal_timer =
                TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
{
        struct hlist_head *list;
        int slot;

        /* timeout := RTO * 3.5
         *
         * 3.5 = 1+2+0.5 to wait for two retransmits.
         *
         * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
         * our ACK acking that FIN can be lost. If N subsequent retransmitted
         * FINs (or previous segments) are lost (probability of such event
         * is p^(N+1), where p is probability to lose single packet and
         * time to detect the loss is about RTO*(2^N - 1) with exponential
         * backoff). Normal timewait length is calculated so, that we
         * waited at least for one retransmitted FIN (maximal RTO is 120sec).
         * [ BTW Linux, following BSD, violates this requirement waiting
         *   only for 60sec, we should wait at least for 240 secs.
         *   Well, 240 consumes too much of resources 8)
         * ]
         * This interval is not reduced to catch old duplicates and
         * responses to our wandering segments living for two MSLs.
         * However, if we use PAWS to detect
         * old duplicates, we can reduce the interval to bounds required
         * by RTO, rather than MSL. So, if peer understands PAWS, we
         * kill tw bucket after 3.5*RTO (it is important that this number
         * is greater than TS tick!) and detect old duplicates with help
         * of PAWS.
         */
        slot = (timeo + (1 << TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;

        spin_lock(&tw_death_lock);

        /* Unlink it, if it was scheduled */
        if (tw_del_dead_node(tw))
                tcp_tw_count--;
        else
                atomic_inc(&tw->tw_refcnt);

        if (slot >= TCP_TW_RECYCLE_SLOTS) {
                /* Schedule to slow timer */
                if (timeo >= TCP_TIMEWAIT_LEN) {
                        slot = TCP_TWKILL_SLOTS - 1;
                } else {
                        slot = (timeo + TCP_TWKILL_PERIOD - 1) / TCP_TWKILL_PERIOD;
                        if (slot >= TCP_TWKILL_SLOTS)
                                slot = TCP_TWKILL_SLOTS - 1;
                }
                tw->tw_ttd = jiffies + timeo;
                slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
                list = &tcp_tw_death_row[slot];
        } else {
                tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);

                if (tcp_twcal_hand < 0) {
                        tcp_twcal_hand = 0;
                        tcp_twcal_jiffie = jiffies;
                        tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot << TCP_TW_RECYCLE_TICK);
                        add_timer(&tcp_twcal_timer);
                } else {
                        if (time_after(tcp_twcal_timer.expires, jiffies + (slot << TCP_TW_RECYCLE_TICK)))
                                mod_timer(&tcp_twcal_timer, jiffies + (slot << TCP_TW_RECYCLE_TICK));
                        slot = (tcp_twcal_hand + slot) & (TCP_TW_RECYCLE_SLOTS - 1);
                }
                list = &tcp_twcal_row[slot];
        }

        hlist_add_head(&tw->tw_death_node, list);

        if (tcp_tw_count++ == 0)
                mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
        spin_unlock(&tw_death_lock);
}
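/* Illustrative sketch, not part of the original file: the recycle slot
 * computed at the top of tcp_tw_schedule() above is simply a ceiling
 * division of the timeout (in jiffies) by the 2^TCP_TW_RECYCLE_TICK
 * granularity, done with shifts.  The hypothetical helper below
 * (editor's name) spells out the same arithmetic.
 */
static inline int example_recycle_slot(int timeo, int tick)
{
        /* slot = ceil(timeo / 2^tick) */
        return (timeo + (1 << tick) - 1) >> tick;
}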
void tcp_twcal_tick(unsigned long dummy)
{
        int n, slot;
        unsigned long j;
        unsigned long now = jiffies;
        int killed = 0;
        int adv = 0;

        spin_lock(&tw_death_lock);
        if (tcp_twcal_hand < 0)
                goto out;

        slot = tcp_twcal_hand;
        j = tcp_twcal_jiffie;

        for (n = 0; n < TCP_TW_RECYCLE_SLOTS; n++) {
                if (time_before_eq(j, now)) {
                        struct hlist_node *node, *safe;
                        struct tcp_tw_bucket *tw;

                        tw_for_each_inmate(tw, node, safe,
                                           &tcp_twcal_row[slot]) {
                                __tw_del_dead_node(tw);
                                tcp_timewait_kill(tw);
                                tcp_tw_put(tw);
                                killed++;
                        }
                } else {
                        if (!adv) {
                                adv = 1;
                                tcp_twcal_jiffie = j;
                                tcp_twcal_hand = slot;
                        }

                        if (!hlist_empty(&tcp_twcal_row[slot])) {
                                mod_timer(&tcp_twcal_timer, j);
                                goto out;
                        }
                }
                j += (1 << TCP_TW_RECYCLE_TICK);
                slot = (slot + 1) & (TCP_TW_RECYCLE_SLOTS - 1);
        }
        tcp_twcal_hand = -1;

out:
        if ((tcp_tw_count -= killed) == 0)
                del_timer(&tcp_tw_timer);
        NET_ADD_STATS_BH(TimeWaitKilled, killed);
        spin_unlock(&tw_death_lock);
}
/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid lots of memory writes here. tp of listening
 * socket contains all necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
{
        /* allocate the newsk from the same slab of the master sock,
         * if not, at sk_free time we'll try to free it from the wrong
         * slabcache (i.e. is it TCPv4 or v6?) -acme */
        struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0, sk->sk_slab);

        if (newsk != NULL) {
                struct tcp_opt *newtp;
                struct sk_filter *filter;

                memcpy(newsk, sk, sizeof(struct tcp_sock));
                newsk->sk_state = TCP_SYN_RECV;

                /* SANITY */
                sk_node_init(&newsk->sk_node);
                tcp_sk(newsk)->bind_hash = NULL;

                /* Clone the TCP header template */
                inet_sk(newsk)->dport = req->rmt_port;

                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                newsk->sk_dst_lock = RW_LOCK_UNLOCKED;
                atomic_set(&newsk->sk_rmem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                skb_queue_head_init(&newsk->sk_write_queue);
                atomic_set(&newsk->sk_omem_alloc, 0);
                newsk->sk_wmem_queued = 0;
                newsk->sk_forward_alloc = 0;

                sock_reset_flag(newsk, SOCK_DONE);
                newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
                newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
                newsk->sk_callback_lock = RW_LOCK_UNLOCKED;
                skb_queue_head_init(&newsk->sk_error_queue);
                newsk->sk_write_space = tcp_write_space;

                if ((filter = newsk->sk_filter) != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        return NULL;
                }

                /* Now setup tcp_opt */
                newtp = tcp_sk(newsk);
                newtp->pred_flags = 0;
                newtp->rcv_nxt = req->rcv_isn + 1;
                newtp->snd_nxt = req->snt_isn + 1;
                newtp->snd_una = req->snt_isn + 1;
                newtp->snd_sml = req->snt_isn + 1;

                tcp_prequeue_init(newtp);

                tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);

                newtp->retransmits = 0;
                newtp->backoff = 0;
                newtp->srtt = 0;
                newtp->mdev = TCP_TIMEOUT_INIT;
                newtp->rto = TCP_TIMEOUT_INIT;

                newtp->packets_out = 0;
                newtp->left_out = 0;
                newtp->retrans_out = 0;
                newtp->sacked_out = 0;
                newtp->fackets_out = 0;
                newtp->snd_ssthresh = 0x7fffffff;

                /* So many TCP implementations out there (incorrectly) count the
                 * initial SYN frame in their delayed-ACK and congestion control
                 * algorithms that we must have the following bandaid to talk
                 * efficiently to them.  -DaveM
                 */
                newtp->snd_cwnd = 2;
                newtp->snd_cwnd_cnt = 0;

                newtp->frto_counter = 0;
                newtp->frto_highmark = 0;

                newtp->ca_state = TCP_CA_Open;
                tcp_init_xmit_timers(newsk);
                skb_queue_head_init(&newtp->out_of_order_queue);
                newtp->send_head = NULL;
                newtp->rcv_wup = req->rcv_isn + 1;
                newtp->write_seq = req->snt_isn + 1;
                newtp->pushed_seq = newtp->write_seq;
                newtp->copied_seq = req->rcv_isn + 1;

                newtp->saw_tstamp = 0;

                newtp->dsack = 0;
                newtp->eff_sacks = 0;

                newtp->probes_out = 0;
                newtp->num_sacks = 0;
                newtp->urg_data = 0;
                newtp->listen_opt = NULL;
                newtp->accept_queue = newtp->accept_queue_tail = NULL;
                /* Deinitialize syn_wait_lock to trap illegal accesses. */
                memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));

                /* Back to base struct sock members. */
                newsk->sk_err = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);
#ifdef INET_REFCNT_DEBUG
                atomic_inc(&inet_sock_nr);
#endif
                atomic_inc(&tcp_sockets_allocated);

                if (sock_flag(newsk, SOCK_KEEPOPEN))
                        tcp_reset_keepalive_timer(newsk,
                                                  keepalive_time_when(newtp));
                newsk->sk_socket = NULL;
                newsk->sk_sleep = NULL;
                newsk->sk_owner = NULL;

                newtp->tstamp_ok = req->tstamp_ok;
                if ((newtp->sack_ok = req->sack_ok) != 0) {
                        if (sysctl_tcp_fack)
                                newtp->sack_ok |= 2;
                }
                newtp->window_clamp = req->window_clamp;
                newtp->rcv_ssthresh = req->rcv_wnd;
                newtp->rcv_wnd = req->rcv_wnd;
                newtp->wscale_ok = req->wscale_ok;
                if (newtp->wscale_ok) {
                        newtp->snd_wscale = req->snd_wscale;
                        newtp->rcv_wscale = req->rcv_wscale;
                } else {
                        newtp->snd_wscale = newtp->rcv_wscale = 0;
                        newtp->window_clamp = min(newtp->window_clamp, 65535U);
                }
                newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale;
                newtp->max_window = newtp->snd_wnd;

                if (newtp->tstamp_ok) {
                        newtp->ts_recent = req->ts_recent;
                        newtp->ts_recent_stamp = xtime.tv_sec;
                        newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
                } else {
                        newtp->ts_recent_stamp = 0;
                        newtp->tcp_header_len = sizeof(struct tcphdr);
                }
                if (skb->len >= TCP_MIN_RCVMSS + newtp->tcp_header_len)
                        newtp->ack.last_seg_size = skb->len - newtp->tcp_header_len;
                newtp->mss_clamp = req->mss;
                TCP_ECN_openreq_child(newtp, req);
                if (newtp->ecn_flags & TCP_ECN_OK)
                        newsk->sk_no_largesend = 1;

                TCP_INC_STATS_BH(TcpPassiveOpens);
        }
        return newsk;
}
/*
 *      Process an incoming packet for SYN_RECV sockets represented
 *      as an open_request.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                           struct open_request *req,
                           struct open_request **prev)
{
        struct tcphdr *th = skb->h.th;
        struct tcp_opt *tp = tcp_sk(sk);
        u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
        int paws_reject = 0;
        struct tcp_opt ttp;
        struct sock *child;

        ttp.saw_tstamp = 0;
        if (th->doff > (sizeof(struct tcphdr) >> 2)) {
                tcp_parse_options(skb, &ttp, 0);

                if (ttp.saw_tstamp) {
                        ttp.ts_recent = req->ts_recent;
                        /* We do not store true stamp, but it is not required,
                         * it can be estimated (approximately)
                         * from another data.
                         */
                        ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
                        paws_reject = tcp_paws_check(&ttp, th->rst);
                }
        }

        /* Check for pure retransmitted SYN. */
        if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
            flg == TCP_FLAG_SYN &&
            !paws_reject) {
                /*
                 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
                 * this case on figure 6 and figure 8, but formal
                 * protocol description says NOTHING.
                 * To be more exact, it says that we should send ACK,
                 * because this segment (at least, if it has no data)
                 * is out of window.
                 *
                 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
                 *  describe SYN-RECV state. All the description
                 *  is wrong, we cannot believe it and should
                 *  rely only on common sense and implementation
                 *  experience.
                 *
                 * Enforce "SYN-ACK" according to figure 8, figure 6
                 * of RFC793, fixed by RFC1122.
                 */
                req->class->rtx_syn_ack(sk, req, NULL);
                return NULL;
        }

        /* Further reproduces section "SEGMENT ARRIVES"
           for state SYN-RECEIVED of RFC793.
           It is broken, however, it fails only
           when SYNs are crossed.

           You would think that SYN crossing is impossible here, since
           we should have a SYN_SENT socket (from connect()) on our end,
           but this is not true if the crossed SYNs were sent to both
           ends by a malicious third party.  We must defend against this,
           and to do that we first verify the ACK (as per RFC793, page
           36) and reset if it is invalid.  Is this a true full defense?
           To convince ourselves, let us consider a way in which the ACK
           test can still pass in this 'malicious crossed SYNs' case.
           Malicious sender sends identical SYNs (and thus identical sequence
           numbers) to both A and B:

           A: gets SYN, seq=7
           B: gets SYN, seq=7

           By our good fortune, both A and B select the same initial
           send sequence number of seven :-)

           A: sends SYN|ACK, seq=7, ack_seq=8
           B: sends SYN|ACK, seq=7, ack_seq=8

           So we are now A eating this SYN|ACK, ACK test passes.  So
           does sequence test, SYN is truncated, and thus we consider
           it a bare ACK.

           If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
           we create an established connection.  Both ends (listening sockets)
           accept the new incoming connection and try to talk to each other. 8-)

           Note: This case is both harmless, and rare.  Possibility is about the
           same as us discovering intelligent life on another planet tomorrow.

           But generally, we should (RFC lies!) accept ACK
           from SYNACK both here and in tcp_rcv_state_process().
           tcp_rcv_state_process() does not, hence, we do not too.

           Note that the case is absolutely generic:
           we cannot optimize anything here without
           violating protocol. All the checks must be made
           before attempt to create socket.
         */

        /* RFC793 page 36: "If the connection is in any non-synchronized state ...
         * and the incoming segment acknowledges something not yet
         * sent (the segment carries an unacceptable ACK) ...
         * a reset is sent."
         *
         * Invalid ACK: reset will be sent by listening socket
         */
        if ((flg & TCP_FLAG_ACK) &&
            (TCP_SKB_CB(skb)->ack_seq != req->snt_isn + 1))
                return sk;

        /* Also, it would not be a bad idea to check rcv_tsecr, which
         * is essentially ACK extension and too early or too late values
         * should cause reset in unsynchronized states.
         */

        /* RFC793: "first check sequence number". */

        if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                                          req->rcv_isn + 1, req->rcv_isn + 1 + req->rcv_wnd)) {
                /* Out of window: send ACK and drop. */
                if (!(flg & TCP_FLAG_RST))
                        req->class->send_ack(skb, req);
                if (paws_reject)
                        NET_INC_STATS_BH(PAWSEstabRejected);
                return NULL;
        }

        /* In sequence, PAWS is OK. */

        if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn + 1))
                req->ts_recent = ttp.rcv_tsval;

        if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
                /* Truncate SYN, it is out of window starting
                   at req->rcv_isn+1. */
                flg &= ~TCP_FLAG_SYN;
        }

        /* RFC793: "second check the RST bit" and
         *         "fourth, check the SYN bit"
         */
        if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
                goto embryonic_reset;

        /* ACK sequence verified above, just make sure ACK is
         * set.  If ACK not set, just silently drop the packet.
         */
        if (!(flg & TCP_FLAG_ACK))
                return NULL;

        /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
        if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn + 1) {
                req->acked = 1;
                return NULL;
        }

        /* OK, ACK is valid, create big socket and
         * feed this segment to it. It will repeat all
         * the tests. THIS SEGMENT MUST MOVE SOCKET TO
         * ESTABLISHED STATE. If it will be dropped after
         * socket is created, wait for troubles.
         */
        child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
        if (child == NULL)
                goto listen_overflow;

        sk_set_owner(child, sk->sk_owner);
        tcp_synq_unlink(tp, req, prev);
        tcp_synq_removed(sk, req);

        tcp_acceptq_queue(sk, req, child);
        return child;

listen_overflow:
        if (!sysctl_tcp_abort_on_overflow) {
                req->acked = 1;
                return NULL;
        }

embryonic_reset:
        NET_INC_STATS_BH(EmbryonicRsts);
        if (!(flg & TCP_FLAG_RST))
                req->class->send_reset(skb);

        tcp_synq_drop(sk, req, prev);
        return NULL;
}
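/* Illustrative sketch, not part of the original file: tcp_check_req()
 * above does not store the time at which req->ts_recent was learned,
 * so it reconstructs an approximate ts_recent_stamp from the SYN-ACK
 * retransmit count, on the assumption of exponential backoff starting
 * at TCP_TIMEOUT_INIT.  The hypothetical helper below (editor's name)
 * isolates that estimate.
 */
static inline long example_req_ts_recent_stamp(long now_sec, int retrans)
{
        /* Roughly: each retransmission doubles the assumed elapsed time. */
        return now_sec - ((TCP_TIMEOUT_INIT / HZ) << retrans);
}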
/*
 * Queue segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
                      struct sk_buff *skb)
{
        int ret = 0;
        int state = child->sk_state;

        if (!sock_owned_by_user(child)) {
                ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);

                /* Wakeup parent, send SIGIO */
                if (state == TCP_SYN_RECV && child->sk_state != state)
                        parent->sk_data_ready(parent, 0);
        } else {
                /* Alas, it is possible again, because we do lookup
                 * in main socket hash table and lock on listening
                 * socket does not protect us more.
                 */
                sk_add_backlog(child, skb);
        }

        bh_unlock_sock(child);
        sock_put(child);
        return ret;
}