Allow coexistence of N build and AC build.
[tomato.git] release/src-rt-6.x/linux/linux-2.6/net/netfilter/nf_conntrack_core.c
blob c2937a1c9340c04b45d0b1b9a4a65b5dd763ec53
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
14 #include <linux/types.h>
15 #include <linux/netfilter.h>
16 #include <linux/module.h>
17 #include <linux/skbuff.h>
18 #include <linux/proc_fs.h>
19 #include <linux/vmalloc.h>
20 #include <linux/stddef.h>
21 #include <linux/slab.h>
22 #include <linux/random.h>
23 #include <linux/jhash.h>
24 #include <linux/err.h>
25 #include <linux/percpu.h>
26 #include <linux/moduleparam.h>
27 #include <linux/notifier.h>
28 #include <linux/kernel.h>
29 #include <linux/netdevice.h>
30 #include <linux/socket.h>
31 #include <linux/mm.h>
32 //#ifdef CONFIG_BCM_NAT
33 #include <net/ip.h>
34 //#endif
36 #include <net/netfilter/nf_conntrack.h>
37 #include <net/netfilter/nf_conntrack_l3proto.h>
38 #include <net/netfilter/nf_conntrack_l4proto.h>
39 #include <net/netfilter/nf_conntrack_expect.h>
40 #include <net/netfilter/nf_conntrack_helper.h>
41 #include <net/netfilter/nf_conntrack_core.h>
43 #define NF_CONNTRACK_VERSION "0.5.0"
45 #if 0
46 #define DEBUGP printk
47 #else
48 #define DEBUGP(format, args...)
49 #endif
51 DEFINE_RWLOCK(nf_conntrack_lock);
52 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
54 /* nf_conntrack_standalone needs this */
55 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
56 EXPORT_SYMBOL_GPL(nf_conntrack_count);
58 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack);
59 EXPORT_SYMBOL_GPL(nf_conntrack_destroyed);
61 unsigned int nf_conntrack_htable_size __read_mostly;
62 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
64 int nf_conntrack_max __read_mostly;
65 EXPORT_SYMBOL_GPL(nf_conntrack_max);
67 struct list_head *nf_conntrack_hash __read_mostly;
68 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
70 struct nf_conn nf_conntrack_untracked __read_mostly;
71 EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
73 unsigned int nf_ct_log_invalid __read_mostly;
74 LIST_HEAD(unconfirmed);
75 static int nf_conntrack_vmalloc __read_mostly;
77 static unsigned int nf_conntrack_next_id;
79 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
80 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
82 #ifdef HNDCTF
83 extern int ip_conntrack_ipct_delete(struct nf_conn *ct, int ct_timeout);
84 #endif /* HNDCTF */
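/*
 * HNDCTF appears to refer to Broadcom's HND Cut-Through Forwarding offload
 * (an assumption based on the vendor tree, not stated in this file);
 * ip_conntrack_ipct_delete() is expected to flush the matching flow-cache
 * entry before the conntrack itself is torn down or times out.
 */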
86 #if defined(CONFIG_BCM_NAT) || defined(CONFIG_BCM_NAT_MODULE)
87 extern int ipv4_conntrack_fastnat;
89 typedef int (*bcmNatBindHook)(struct nf_conn *ct,
90 enum ip_conntrack_info ctinfo,
91 struct sk_buff *skb,
92 struct nf_conntrack_l3proto *l3proto,
93 struct nf_conntrack_l4proto *l4proto);
94 typedef int (*bcmNatHitHook)(struct sk_buff *skb);
96 bcmNatBindHook bcm_nat_bind_hook = NULL;
97 bcmNatHitHook bcm_nat_hit_hook = NULL;
98 #ifdef CONFIG_BCM_NAT_MODULE
99 EXPORT_SYMBOL(bcm_nat_hit_hook);
100 EXPORT_SYMBOL(bcm_nat_bind_hook);
101 #endif
102 #endif
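/*
 * The bcm_nat_bind_hook/bcm_nat_hit_hook pointers are presumably filled in
 * by the optional Broadcom fastnat module at load time (hence the
 * EXPORT_SYMBOLs in the modular build); while they are NULL the code below
 * simply falls back to the ordinary conntrack/NAT path.
 */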
105 * This scheme offers various sizes of "struct nf_conn" depending on the
106 * enabled features (helper, NAT, ...)
109 #define NF_CT_FEATURES_NAMELEN 256
110 static struct {
111 /* name of slab cache. printed in /proc/slabinfo */
112 char *name;
114 /* size of slab cache */
115 size_t size;
117 /* slab cache pointer */
118 struct kmem_cache *cachep;
120 /* allocated slab cache + modules which uses this slab cache */
121 int use;
123 } nf_ct_cache[NF_CT_F_NUM];
125 /* protect members of nf_ct_cache except of "use" */
126 DEFINE_RWLOCK(nf_ct_cache_lock);
128 /* This avoids calling kmem_cache_create() with same name simultaneously */
129 static DEFINE_MUTEX(nf_ct_cache_mutex);
131 static unsigned int nf_conntrack_hash_rnd __read_mostly;
133 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
134 unsigned int size, unsigned int rnd)
136 unsigned int n;
137 u_int32_t h;
139 /* The direction must be ignored, so we hash everything up to the
140 * destination ports (which is a multiple of 4) and treat the last
141 * three bytes manually.
143 n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
144 h = jhash2((u32 *)tuple, n,
145 rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
146 tuple->dst.protonum));
148 return ((u64)h * size) >> 32;
151 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
153 return __hash_conntrack(tuple, nf_conntrack_htable_size,
154 nf_conntrack_hash_rnd);
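/*
 * How the hash works: jhash2() runs over the source part of the tuple plus
 * the destination address (n 32-bit words), with the destination port and
 * protocol number folded into the seed, and the 32-bit result is mapped
 * onto [0, size) with a multiply-and-shift rather than a modulo.  The
 * random seed (nf_conntrack_hash_rnd) makes bucket placement unpredictable
 * to remote hosts, which hampers hash-flooding attacks.
 */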
157 int nf_conntrack_register_cache(u_int32_t features, const char *name,
158 size_t size)
160 int ret = 0;
161 char *cache_name;
162 struct kmem_cache *cachep;
164 DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
165 features, name, size);
167 if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
168 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
169 features);
170 return -EINVAL;
173 mutex_lock(&nf_ct_cache_mutex);
175 write_lock_bh(&nf_ct_cache_lock);
176 /* e.g.: multiple helpers are loaded */
177 if (nf_ct_cache[features].use > 0) {
178 DEBUGP("nf_conntrack_register_cache: already registered.\n");
179 if ((!strncmp(nf_ct_cache[features].name, name,
180 NF_CT_FEATURES_NAMELEN))
181 && nf_ct_cache[features].size == size) {
182 DEBUGP("nf_conntrack_register_cache: reusing.\n");
183 nf_ct_cache[features].use++;
184 ret = 0;
185 } else
186 ret = -EBUSY;
188 write_unlock_bh(&nf_ct_cache_lock);
189 mutex_unlock(&nf_ct_cache_mutex);
190 return ret;
192 write_unlock_bh(&nf_ct_cache_lock);
195 * The memory space for the name of the slab cache must stay alive until
196 * the cache is destroyed.
198 cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
199 if (cache_name == NULL) {
200 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
201 ret = -ENOMEM;
202 goto out_up_mutex;
205 if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
206 >= NF_CT_FEATURES_NAMELEN) {
207 printk("nf_conntrack_register_cache: name too long\n");
208 ret = -EINVAL;
209 goto out_free_name;
212 cachep = kmem_cache_create(cache_name, size, 0, 0,
213 NULL, NULL);
214 if (!cachep) {
215 printk("nf_conntrack_register_cache: Can't create slab cache "
216 "for the features = 0x%x\n", features);
217 ret = -ENOMEM;
218 goto out_free_name;
221 write_lock_bh(&nf_ct_cache_lock);
222 nf_ct_cache[features].use = 1;
223 nf_ct_cache[features].size = size;
224 nf_ct_cache[features].cachep = cachep;
225 nf_ct_cache[features].name = cache_name;
226 write_unlock_bh(&nf_ct_cache_lock);
228 goto out_up_mutex;
230 out_free_name:
231 kfree(cache_name);
232 out_up_mutex:
233 mutex_unlock(&nf_ct_cache_mutex);
234 return ret;
236 EXPORT_SYMBOL_GPL(nf_conntrack_register_cache);
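/*
 * Rough usage sketch, not taken from this tree: an extension that needs
 * room for a helper would register a larger cache for its feature bits.
 * The size shown is an approximation; real callers may add alignment or
 * further private data.
 */
#if 0
static int example_register_help_cache(void)
{
	return nf_conntrack_register_cache(NF_CT_F_HELP,
					   "nf_conntrack:help",
					   sizeof(struct nf_conn)
					   + sizeof(struct nf_conn_help));
}
#endif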
238 /* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
239 void nf_conntrack_unregister_cache(u_int32_t features)
241 struct kmem_cache *cachep;
242 char *name;
245 * This ensures that kmem_cache_create() isn't called before the slab
246 * cache has been destroyed.
248 DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
249 mutex_lock(&nf_ct_cache_mutex);
251 write_lock_bh(&nf_ct_cache_lock);
252 if (--nf_ct_cache[features].use > 0) {
253 write_unlock_bh(&nf_ct_cache_lock);
254 mutex_unlock(&nf_ct_cache_mutex);
255 return;
257 cachep = nf_ct_cache[features].cachep;
258 name = nf_ct_cache[features].name;
259 nf_ct_cache[features].cachep = NULL;
260 nf_ct_cache[features].name = NULL;
261 nf_ct_cache[features].size = 0;
262 write_unlock_bh(&nf_ct_cache_lock);
264 synchronize_net();
266 kmem_cache_destroy(cachep);
267 kfree(name);
269 mutex_unlock(&nf_ct_cache_mutex);
271 EXPORT_SYMBOL_GPL(nf_conntrack_unregister_cache);
274 nf_ct_get_tuple(const struct sk_buff *skb,
275 unsigned int nhoff,
276 unsigned int dataoff,
277 u_int16_t l3num,
278 u_int8_t protonum,
279 struct nf_conntrack_tuple *tuple,
280 const struct nf_conntrack_l3proto *l3proto,
281 const struct nf_conntrack_l4proto *l4proto)
283 memset(tuple, 0, sizeof(*tuple));
285 tuple->src.l3num = l3num;
286 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
287 return 0;
288 tuple->dst.protonum = protonum;
289 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
291 return l4proto->pkt_to_tuple(skb, dataoff, tuple);
293 EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
296 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
297 const struct nf_conntrack_tuple *orig,
298 const struct nf_conntrack_l3proto *l3proto,
299 const struct nf_conntrack_l4proto *l4proto)
301 memset(inverse, 0, sizeof(*inverse));
303 inverse->src.l3num = orig->src.l3num;
304 if (l3proto->invert_tuple(inverse, orig) == 0)
305 return 0;
307 inverse->dst.dir = !orig->dst.dir;
309 inverse->dst.protonum = orig->dst.protonum;
310 return l4proto->invert_tuple(inverse, orig);
312 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
314 #if defined(CONFIG_BCM_NAT) || defined(CONFIG_BCM_NAT_MODULE)
315 #ifndef CONFIG_BCM_NAT_MODULE
316 inline
317 #endif
318 int bcm_nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
319 const struct nf_conntrack_tuple *orig,
320 const struct nf_conntrack_l3proto *l3proto,
321 const struct nf_conntrack_l4proto *l4proto)
323 return nf_ct_invert_tuple(inverse, orig, l3proto,l4proto);
325 #ifdef CONFIG_BCM_NAT_MODULE
326 EXPORT_SYMBOL(bcm_nf_ct_invert_tuple);
327 #endif
328 #endif
330 static void
331 clean_from_lists(struct nf_conn *ct)
333 DEBUGP("clean_from_lists(%p)\n", ct);
334 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
335 list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
337 /* Destroy all pending expectations */
338 nf_ct_remove_expectations(ct);
341 static void
342 destroy_conntrack(struct nf_conntrack *nfct)
344 struct nf_conn *ct = (struct nf_conn *)nfct;
345 struct nf_conntrack_l4proto *l4proto;
346 typeof(nf_conntrack_destroyed) destroyed;
348 DEBUGP("destroy_conntrack(%p)\n", ct);
349 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
350 NF_CT_ASSERT(!timer_pending(&ct->timeout));
352 #ifdef HNDCTF
353 ip_conntrack_ipct_delete(ct, 0);
354 #endif /* HNDCTF*/
356 nf_conntrack_event(IPCT_DESTROY, ct);
357 set_bit(IPS_DYING_BIT, &ct->status);
359 /* To make sure we don't get any weird locking issues here:
360 * destroy_conntrack() MUST NOT be called with a write lock
361 * to nf_conntrack_lock!!! -HW */
362 rcu_read_lock();
363 l4proto = __nf_ct_l4proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
364 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
365 if (l4proto && l4proto->destroy)
366 l4proto->destroy(ct);
368 destroyed = rcu_dereference(nf_conntrack_destroyed);
369 if (destroyed)
370 destroyed(ct);
372 rcu_read_unlock();
374 write_lock_bh(&nf_conntrack_lock);
375 /* Expectations will have been removed in clean_from_lists,
376 * except TFTP can create an expectation on the first packet,
377 * before connection is in the list, so we need to clean here,
378 * too. */
379 nf_ct_remove_expectations(ct);
381 #if defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE)
382 if(ct->layer7.app_proto)
383 kfree(ct->layer7.app_proto);
384 if(ct->layer7.app_data)
385 kfree(ct->layer7.app_data);
386 #endif
389 /* We overload first tuple to link into unconfirmed list. */
390 if (!nf_ct_is_confirmed(ct)) {
391 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
392 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
395 NF_CT_STAT_INC(delete);
396 write_unlock_bh(&nf_conntrack_lock);
398 if (ct->master)
399 nf_ct_put(ct->master);
401 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
402 nf_conntrack_free(ct);
405 static void death_by_timeout(unsigned long ul_conntrack)
407 struct nf_conn *ct = (void *)ul_conntrack;
408 struct nf_conn_help *help = nfct_help(ct);
409 struct nf_conntrack_helper *helper;
411 #ifdef HNDCTF
412 /* If a negative error is returned it means the entry hasn't
413 * timed out yet.
415 if (ip_conntrack_ipct_delete(ct, jiffies >= ct->timeout.expires ? 1 : 0) != 0)
416 return;
417 #endif /* HNDCTF */
419 if (help) {
420 rcu_read_lock();
421 helper = rcu_dereference(help->helper);
422 if (helper && helper->destroy)
423 helper->destroy(ct);
424 rcu_read_unlock();
427 write_lock_bh(&nf_conntrack_lock);
428 /* Inside lock so preempt is disabled on module removal path.
429 * Otherwise we can get spurious warnings. */
430 NF_CT_STAT_INC(delete_list);
431 clean_from_lists(ct);
432 write_unlock_bh(&nf_conntrack_lock);
433 nf_ct_put(ct);
436 struct nf_conntrack_tuple_hash *
437 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple)
439 struct nf_conntrack_tuple_hash *h;
440 unsigned int hash = hash_conntrack(tuple);
442 list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
443 if (nf_ct_tuple_equal(tuple, &h->tuple)) {
444 NF_CT_STAT_INC(found);
445 return h;
447 NF_CT_STAT_INC(searched);
450 return NULL;
452 EXPORT_SYMBOL_GPL(__nf_conntrack_find);
454 /* Find a connection corresponding to a tuple. */
455 struct nf_conntrack_tuple_hash *
456 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
457 const struct nf_conn *ignored_conntrack)
459 struct nf_conntrack_tuple_hash *h;
460 struct nf_conn *ct;
462 read_lock_bh(&nf_conntrack_lock);
463 h = __nf_conntrack_find(tuple);
464 if (h) {
465 ct = nf_ct_tuplehash_to_ctrack(h);
466 if (unlikely(nf_ct_is_dying(ct) ||
467 !atomic_inc_not_zero(&ct->ct_general.use)))
468 h = NULL;
470 read_unlock_bh(&nf_conntrack_lock);
472 return h;
474 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
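/*
 * nf_conntrack_find_get() returns a referenced tuple hash: the lookup runs
 * under the read lock and the reference is taken with atomic_inc_not_zero(),
 * so an entry that is dying or already at refcount zero is treated as not
 * found.  Callers drop the reference with nf_ct_put() when they are done.
 */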
476 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
477 unsigned int hash,
478 unsigned int repl_hash)
480 ct->id = ++nf_conntrack_next_id;
481 list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
482 &nf_conntrack_hash[hash]);
483 list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
484 &nf_conntrack_hash[repl_hash]);
487 void nf_conntrack_hash_insert(struct nf_conn *ct)
489 unsigned int hash, repl_hash;
491 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
492 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
494 write_lock_bh(&nf_conntrack_lock);
495 __nf_conntrack_hash_insert(ct, hash, repl_hash);
496 write_unlock_bh(&nf_conntrack_lock);
498 EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
500 /* Confirm a connection given skb; places it in hash table */
502 __nf_conntrack_confirm(struct sk_buff *skb)
504 unsigned int hash, repl_hash;
505 struct nf_conntrack_tuple_hash *h;
506 struct nf_conn *ct;
507 struct nf_conn_help *help;
508 enum ip_conntrack_info ctinfo;
510 ct = nf_ct_get(skb, &ctinfo);
512 /* ipt_REJECT uses nf_conntrack_attach to attach related
513 ICMP/TCP RST packets in other direction. Actual packet
514 which created connection will be IP_CT_NEW or for an
515 expected connection, IP_CT_RELATED. */
516 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
517 return NF_ACCEPT;
519 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
520 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
522 /* We're not in hash table, and we refuse to set up related
523 connections for unconfirmed conns. But packet copies and
524 REJECT will give spurious warnings here. */
525 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
527 /* No external references means no one else could have
528 confirmed us. */
529 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
530 DEBUGP("Confirming conntrack %p\n", ct);
532 write_lock_bh(&nf_conntrack_lock);
534 /* We have to check the DYING flag inside the lock to prevent
535 a race against nf_ct_get_next_corpse() possibly called from
536 user context, else we insert an already 'dead' hash, blocking
537 further use of that particular connection -JM */
539 if (unlikely(nf_ct_is_dying(ct))) {
540 write_unlock_bh(&nf_conntrack_lock);
541 return NF_ACCEPT;
544 /* See if there's one in the list already, including reverse:
545 NAT could have grabbed it without realizing, since we're
546 not in the hash. If there is, we lost the race. */
547 list_for_each_entry(h, &nf_conntrack_hash[hash], list)
548 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
549 &h->tuple))
550 goto out;
551 list_for_each_entry(h, &nf_conntrack_hash[repl_hash], list)
552 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
553 &h->tuple))
554 goto out;
556 /* Remove from unconfirmed list */
557 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
559 __nf_conntrack_hash_insert(ct, hash, repl_hash);
560 /* Timer relative to confirmation time, not original
561 setting time, otherwise we'd get timer wrap in
562 weird delay cases. */
563 ct->timeout.expires += jiffies;
564 add_timer(&ct->timeout);
565 atomic_inc(&ct->ct_general.use);
566 set_bit(IPS_CONFIRMED_BIT, &ct->status);
567 NF_CT_STAT_INC(insert);
568 write_unlock_bh(&nf_conntrack_lock);
569 help = nfct_help(ct);
570 if (help && help->helper)
571 nf_conntrack_event_cache(IPCT_HELPER, skb);
572 #ifdef CONFIG_NF_NAT_NEEDED
573 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
574 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
575 nf_conntrack_event_cache(IPCT_NATINFO, skb);
576 #endif
577 nf_conntrack_event_cache(master_ct(ct) ?
578 IPCT_RELATED : IPCT_NEW, skb);
579 return NF_ACCEPT;
581 out:
582 NF_CT_STAT_INC(insert_failed);
583 write_unlock_bh(&nf_conntrack_lock);
584 return NF_DROP;
586 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
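/*
 * Note on the confirm step: the conntrack was created outside the hash, so
 * between allocation and confirmation NAT may have inserted a clashing
 * entry.  That is why both the original and the reply chain are re-checked
 * under the write lock before the entry is moved off the unconfirmed list,
 * its timeout timer is started (relative to confirmation time) and
 * IPS_CONFIRMED is set.  Losing that race results in NF_DROP.
 */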
588 /* Returns true if a connection corresponds to the tuple (required
589 for NAT). */
591 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
592 const struct nf_conn *ignored_conntrack)
594 struct nf_conntrack_tuple_hash *h;
595 unsigned int hash = hash_conntrack(tuple);
597 read_lock_bh(&nf_conntrack_lock);
598 list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
599 if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
600 nf_ct_tuple_equal(tuple, &h->tuple)) {
601 NF_CT_STAT_INC(found);
602 read_unlock_bh(&nf_conntrack_lock);
603 return 1;
605 NF_CT_STAT_INC(searched);
607 read_unlock_bh(&nf_conntrack_lock);
609 return 0;
611 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
613 /* There's a small race here where we may free a just-assured
614 connection. Too bad: we're in trouble anyway. */
615 static noinline int early_drop(struct list_head *chain)
617 /* Traverse backwards: gives us oldest, which is roughly LRU */
618 struct nf_conntrack_tuple_hash *h;
619 struct nf_conn *ct = NULL, *tmp;
620 int dropped = 0;
622 read_lock_bh(&nf_conntrack_lock);
623 list_for_each_entry_reverse(h, chain, list) {
624 tmp = nf_ct_tuplehash_to_ctrack(h);
625 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
626 ct = tmp;
627 atomic_inc(&ct->ct_general.use);
628 break;
631 read_unlock_bh(&nf_conntrack_lock);
633 if (!ct)
634 return dropped;
636 #ifdef HNDCTF
637 ip_conntrack_ipct_delete(ct, 0);
638 #endif /* HNDCTF */
640 if (del_timer(&ct->timeout)) {
641 death_by_timeout((unsigned long)ct);
642 dropped = 1;
643 NF_CT_STAT_INC_ATOMIC(early_drop);
645 nf_ct_put(ct);
646 return dropped;
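/*
 * Table-full policy: early_drop() walks one hash chain backwards (roughly
 * oldest first) and evicts the first entry that is not ASSURED by firing
 * its timeout immediately.  ASSURED connections are never dropped this
 * way, so a table full of assured entries makes new allocations fail.
 */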
649 static struct nf_conn *
650 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
651 const struct nf_conntrack_tuple *repl,
652 const struct nf_conntrack_l3proto *l3proto,
653 u_int32_t features)
655 struct nf_conn *conntrack = NULL;
656 struct nf_conntrack_helper *helper;
658 if (unlikely(!nf_conntrack_hash_rnd)) {
659 unsigned int rand;
662 * Why not initialize nf_conntrack_hash_rnd in an init() function?
663 * Because there isn't enough entropy while the system is initializing,
664 * so we initialize it as late as possible.
666 do {
667 get_random_bytes(&rand, sizeof(rand));
668 } while (!rand);
669 cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
672 /* We don't want any race condition at early drop stage */
673 atomic_inc(&nf_conntrack_count);
675 if (nf_conntrack_max &&
676 unlikely(atomic_read(&nf_conntrack_count) > nf_conntrack_max)) {
677 unsigned int hash = hash_conntrack(orig);
678 /* Try dropping from this hash chain. */
679 if (!early_drop(&nf_conntrack_hash[hash])) {
680 atomic_dec(&nf_conntrack_count);
681 if (net_ratelimit())
682 printk(KERN_WARNING
683 "nf_conntrack: table full, dropping"
684 " packet.\n");
685 return ERR_PTR(-ENOMEM);
689 /* find features needed by this conntrack. */
690 features |= l3proto->get_features(orig);
692 /* FIXME: protect helper list per RCU */
693 read_lock_bh(&nf_conntrack_lock);
694 helper = __nf_ct_helper_find(repl);
695 /* NAT might want to assign a helper later */
696 if (helper || features & NF_CT_F_NAT)
697 features |= NF_CT_F_HELP;
698 read_unlock_bh(&nf_conntrack_lock);
700 DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
702 read_lock_bh(&nf_ct_cache_lock);
704 if (unlikely(!nf_ct_cache[features].use)) {
705 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
706 features);
707 goto out;
710 conntrack = kmem_cache_zalloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
711 if (conntrack == NULL) {
712 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
713 goto out;
716 conntrack->features = features;
717 atomic_set(&conntrack->ct_general.use, 1);
718 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
719 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
720 /* Don't set timer yet: wait for confirmation */
721 setup_timer(&conntrack->timeout, death_by_timeout,
722 (unsigned long)conntrack);
723 read_unlock_bh(&nf_ct_cache_lock);
725 return conntrack;
726 out:
727 read_unlock_bh(&nf_ct_cache_lock);
728 atomic_dec(&nf_conntrack_count);
729 return conntrack;
732 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
733 const struct nf_conntrack_tuple *repl)
735 struct nf_conntrack_l3proto *l3proto;
736 struct nf_conn *ct;
738 rcu_read_lock();
739 l3proto = __nf_ct_l3proto_find(orig->src.l3num);
740 ct = __nf_conntrack_alloc(orig, repl, l3proto, 0);
741 rcu_read_unlock();
743 return ct;
745 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
747 void nf_conntrack_free(struct nf_conn *conntrack)
749 u_int32_t features = conntrack->features;
750 NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
751 DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
752 conntrack);
753 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
754 atomic_dec(&nf_conntrack_count);
756 EXPORT_SYMBOL_GPL(nf_conntrack_free);
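/*
 * Allocation summary: __nf_conntrack_alloc() seeds nf_conntrack_hash_rnd
 * lazily, enforces nf_conntrack_max (trying early_drop() on the target
 * chain first), then picks the per-feature slab cache and returns a zeroed
 * entry with refcount 1 and a prepared -- but not yet armed -- timeout
 * timer.  nf_conntrack_free() is its exact counterpart.
 */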
758 /* Allocate a new conntrack: we return -ENOMEM if classification
759 failed due to stress. Otherwise it really is unclassifiable. */
760 static struct nf_conntrack_tuple_hash *
761 init_conntrack(const struct nf_conntrack_tuple *tuple,
762 struct nf_conntrack_l3proto *l3proto,
763 struct nf_conntrack_l4proto *l4proto,
764 struct sk_buff *skb,
765 unsigned int dataoff)
767 struct nf_conn *conntrack;
768 struct nf_conn_help *help;
769 struct nf_conntrack_tuple repl_tuple;
770 struct nf_conntrack_expect *exp;
771 u_int32_t features = 0;
773 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
774 DEBUGP("Can't invert tuple.\n");
775 return NULL;
778 read_lock_bh(&nf_conntrack_lock);
779 exp = __nf_conntrack_expect_find(tuple);
780 if (exp && exp->helper)
781 features = NF_CT_F_HELP;
782 read_unlock_bh(&nf_conntrack_lock);
784 conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto, features);
785 if (conntrack == NULL || IS_ERR(conntrack)) {
786 DEBUGP("Can't allocate conntrack.\n");
787 return (struct nf_conntrack_tuple_hash *)conntrack;
790 if (!l4proto->new(conntrack, skb, dataoff)) {
791 nf_conntrack_free(conntrack);
792 DEBUGP("init conntrack: can't track with proto module\n");
793 return NULL;
796 write_lock_bh(&nf_conntrack_lock);
798 exp = find_expectation(tuple);
800 help = nfct_help(conntrack);
801 if (exp) {
802 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
803 conntrack, exp);
804 /* Welcome, Mr. Bond. We've been expecting you... */
805 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
806 conntrack->master = exp->master;
807 if (exp->helper)
808 rcu_assign_pointer(help->helper, exp->helper);
809 #ifdef CONFIG_NF_CONNTRACK_MARK
810 conntrack->mark = exp->master->mark;
811 #endif
812 #ifdef CONFIG_NF_CONNTRACK_SECMARK
813 conntrack->secmark = exp->master->secmark;
814 #endif
815 nf_conntrack_get(&conntrack->master->ct_general);
816 NF_CT_STAT_INC(expect_new);
817 } else {
818 if (help) {
819 /* not in hash table yet, so not strictly necessary */
820 rcu_assign_pointer(help->helper,
821 __nf_ct_helper_find(&repl_tuple));
823 NF_CT_STAT_INC(new);
826 /* Overload tuple linked list to put us in unconfirmed list. */
827 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
829 write_unlock_bh(&nf_conntrack_lock);
831 if (exp) {
832 if (exp->expectfn)
833 exp->expectfn(conntrack, exp);
834 nf_conntrack_expect_put(exp);
837 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
840 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
841 static inline struct nf_conn *
842 resolve_normal_ct(struct sk_buff *skb,
843 unsigned int dataoff,
844 u_int16_t l3num,
845 u_int8_t protonum,
846 struct nf_conntrack_l3proto *l3proto,
847 struct nf_conntrack_l4proto *l4proto,
848 int *set_reply,
849 enum ip_conntrack_info *ctinfo)
851 struct nf_conntrack_tuple tuple;
852 struct nf_conntrack_tuple_hash *h;
853 struct nf_conn *ct;
855 if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
856 dataoff, l3num, protonum, &tuple, l3proto,
857 l4proto)) {
858 DEBUGP("resolve_normal_ct: Can't get tuple\n");
859 return NULL;
862 /* look for tuple match */
863 h = nf_conntrack_find_get(&tuple, NULL);
864 if (!h) {
865 h = init_conntrack(&tuple, l3proto, l4proto, skb, dataoff);
866 if (!h)
867 return NULL;
868 if (IS_ERR(h))
869 return (void *)h;
871 ct = nf_ct_tuplehash_to_ctrack(h);
873 /* It exists; we have (non-exclusive) reference. */
874 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
875 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
876 /* Please set the reply bit if this packet is OK */
877 *set_reply = 1;
878 } else {
879 /* Once we've had two way comms, always ESTABLISHED. */
880 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
881 DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
882 *ctinfo = IP_CT_ESTABLISHED;
883 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
884 DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
885 *ctinfo = IP_CT_RELATED;
886 } else {
887 DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
888 *ctinfo = IP_CT_NEW;
890 *set_reply = 0;
892 skb->nfct = &ct->ct_general;
893 skb->nfctinfo = *ctinfo;
894 return ct;
897 #if defined(CONFIG_BCM_NAT) || defined(CONFIG_BCM_NAT_MODULE)
898 extern int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user);
899 #endif
901 unsigned int
902 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb)
904 struct nf_conn *ct;
905 enum ip_conntrack_info ctinfo;
906 struct nf_conntrack_l3proto *l3proto;
907 struct nf_conntrack_l4proto *l4proto;
908 unsigned int dataoff;
909 u_int8_t protonum;
910 int set_reply = 0;
911 int ret;
912 #if defined(CONFIG_BCM_NAT) || defined(CONFIG_BCM_NAT_MODULE)
913 struct nf_conn_nat *nat = NULL;
914 #endif
916 /* Previously seen (loopback or untracked)? Ignore. */
917 if (skb->nfct) {
918 NF_CT_STAT_INC_ATOMIC(ignore);
919 return NF_ACCEPT;
922 /* rcu_read_lock()ed by nf_hook_slow */
923 l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
925 if ((ret = l3proto->prepare(skb, hooknum, &dataoff, &protonum)) <= 0) {
926 DEBUGP("not prepared to track yet or error occurred\n");
927 return -ret;
930 #if defined(CONFIG_BCM_NAT) || defined(CONFIG_BCM_NAT_MODULE)
931 if (pf == PF_INET && ipv4_conntrack_fastnat) {
932 /* Gather fragments. */
933 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
934 if (nf_ct_ipv4_gather_frags(skb,
935 hooknum == NF_IP_PRE_ROUTING ?
936 IP_DEFRAG_CONNTRACK_IN :
937 IP_DEFRAG_CONNTRACK_OUT))
938 return NF_STOLEN;
941 #endif
943 l4proto = __nf_ct_l4proto_find((u_int16_t)pf, protonum);
945 /* It may be a special packet, error, unclean...
946 * the inverse of the return code tells the netfilter
947 * core what to do with the packet. */
948 if (l4proto->error != NULL &&
949 (ret = l4proto->error(skb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
950 NF_CT_STAT_INC_ATOMIC(error);
951 NF_CT_STAT_INC_ATOMIC(invalid);
952 return -ret;
955 ct = resolve_normal_ct(skb, dataoff, pf, protonum, l3proto, l4proto,
956 &set_reply, &ctinfo);
957 if (!ct) {
958 /* Not valid part of a connection */
959 NF_CT_STAT_INC_ATOMIC(invalid);
960 return NF_ACCEPT;
963 if (IS_ERR(ct)) {
964 /* Too stressed to deal. */
965 NF_CT_STAT_INC_ATOMIC(drop);
966 return NF_DROP;
969 NF_CT_ASSERT(skb->nfct);
971 ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
972 if (ret <= 0) {
973 /* Invalid: inverse of the return code tells
974 * the netfilter core what to do */
975 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
976 nf_conntrack_put(skb->nfct);
977 skb->nfct = NULL;
978 NF_CT_STAT_INC_ATOMIC(invalid);
979 if (ret == -NF_DROP)
980 NF_CT_STAT_INC_ATOMIC(drop);
981 return -ret;
984 #if defined(CONFIG_BCM_NAT) || defined(CONFIG_BCM_NAT_MODULE)
985 if (pf == PF_INET)
986 nat = nfct_nat(ct);
988 if (nat && hooknum == NF_IP_PRE_ROUTING &&
989 ipv4_conntrack_fastnat && bcm_nat_bind_hook) {
990 struct nf_conn_help *help = nfct_help(ct);
992 if (!(nat->info.nat_type & BCM_FASTNAT_DENY) &&
993 !help->helper &&
994 (ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_IS_REPLY) &&
995 (protonum == IPPROTO_TCP || protonum == IPPROTO_UDP)) {
996 struct nf_conntrack_tuple *t1, *t2;
998 t1 = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
999 t2 = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
1000 if (!(t1->dst.u3.ip == t2->src.u3.ip &&
1001 t1->src.u3.ip == t2->dst.u3.ip &&
1002 t1->dst.u.all == t2->src.u.all &&
1003 t1->src.u.all == t2->dst.u.all)) {
1004 ret = bcm_nat_bind_hook(ct, ctinfo, skb, l3proto, l4proto);
1008 #endif
1010 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1011 #if defined(CONFIG_BCM_NAT) || defined(CONFIG_BCM_NAT_MODULE)
1012 if (nat && hooknum == NF_IP_LOCAL_OUT)
1013 nat->info.nat_type |= BCM_FASTNAT_DENY;
1014 #endif
1015 nf_conntrack_event_cache(IPCT_STATUS, skb);
1017 return ret;
1019 EXPORT_SYMBOL_GPL(nf_conntrack_in);
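/*
 * Broadcom fastnat note (a best-effort reading of the vendor patch, not a
 * guarantee): after the L4 tracker has accepted the packet, established
 * TCP/UDP flows seen on PRE_ROUTING are handed to bcm_nat_bind_hook() so
 * later packets can bypass the slow path -- but only if no helper is
 * attached, the flow is not marked BCM_FASTNAT_DENY, and NAT has actually
 * rewritten the tuples (a flow whose reply tuple is a plain mirror of the
 * original is skipped).
 */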
1021 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1022 const struct nf_conntrack_tuple *orig)
1024 int ret;
1026 rcu_read_lock();
1027 ret = nf_ct_invert_tuple(inverse, orig,
1028 __nf_ct_l3proto_find(orig->src.l3num),
1029 __nf_ct_l4proto_find(orig->src.l3num,
1030 orig->dst.protonum));
1031 rcu_read_unlock();
1032 return ret;
1034 EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
1036 static const u8 expecting_none[NF_CT_MAX_EXPECT_CLASSES] = { 0 };
1037 static inline int nfct_help_expecting(struct nf_conn_help *help)
1039 return (memcmp(&(help->expecting), &expecting_none, sizeof(help->expecting)) != 0);
1042 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1043 implicitly racy: see __nf_conntrack_confirm */
1044 void nf_conntrack_alter_reply(struct nf_conn *ct,
1045 const struct nf_conntrack_tuple *newreply)
1047 struct nf_conn_help *help = nfct_help(ct);
1049 write_lock_bh(&nf_conntrack_lock);
1050 /* Should be unconfirmed, so not in hash table yet */
1051 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
1053 DEBUGP("Altering reply tuple of %p to ", ct);
1054 NF_CT_DUMP_TUPLE(newreply);
1056 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1057 if (!ct->master && help && nfct_help_expecting(help) == 0) {
1058 struct nf_conntrack_helper *helper;
1059 helper = __nf_ct_helper_find(newreply);
1060 if (helper)
1061 memset(&help->help, 0, sizeof(help->help));
1062 /* not in hash table yet, so not strictly necessary */
1063 rcu_assign_pointer(help->helper, helper);
1065 write_unlock_bh(&nf_conntrack_lock);
1067 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
1069 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1070 void __nf_ct_refresh_acct(struct nf_conn *ct,
1071 enum ip_conntrack_info ctinfo,
1072 const struct sk_buff *skb,
1073 unsigned long extra_jiffies,
1074 int do_acct)
1076 int event = 0;
1078 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1079 NF_CT_ASSERT(skb);
1081 write_lock_bh(&nf_conntrack_lock);
1083 /* Only update if this is not a fixed timeout */
1084 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1085 goto acct;
1087 /* If not in hash table, timer will not be active yet */
1088 if (!nf_ct_is_confirmed(ct)) {
1089 #ifdef HNDCTF
1090 ct->expire_jiffies = extra_jiffies;
1091 #endif /* HNDCTF */
1092 ct->timeout.expires = extra_jiffies;
1093 event = IPCT_REFRESH;
1094 } else {
1095 unsigned long newtime = jiffies + extra_jiffies;
1097 /* Only update the timeout if the new timeout is at least
1098 HZ jiffies from the old timeout. Need del_timer for race
1099 avoidance (may already be dying). */
1100 if (newtime - ct->timeout.expires >= HZ
1101 && del_timer(&ct->timeout)) {
1102 #ifdef HNDCTF
1103 ct->expire_jiffies = extra_jiffies;
1104 #endif /* HNDCTF */
1105 ct->timeout.expires = newtime;
1106 add_timer(&ct->timeout);
1107 event = IPCT_REFRESH;
1111 acct:
1112 #ifdef CONFIG_NF_CT_ACCT
1113 if (do_acct) {
1114 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1115 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1116 skb->len - skb_network_offset(skb);
1118 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1119 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1120 event |= IPCT_COUNTER_FILLING;
1122 #endif
1124 write_unlock_bh(&nf_conntrack_lock);
1126 /* must be unlocked when calling event cache */
1127 if (event)
1128 nf_conntrack_event_cache(event, skb);
1130 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
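/*
 * Timeout handling: for an unconfirmed entry the new timeout is stored
 * relative (it becomes absolute in __nf_conntrack_confirm()); for a
 * confirmed one the running timer is only re-armed when the expiry moves
 * by at least HZ jiffies, which keeps timer churn low.  With
 * CONFIG_NF_CT_ACCT the per-direction packet/byte counters are bumped and
 * IPCT_COUNTER_FILLING is signalled once a counter reaches its top bit.
 */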
1132 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
1134 #include <linux/netfilter/nfnetlink.h>
1135 #include <linux/netfilter/nfnetlink_conntrack.h>
1136 #include <linux/mutex.h>
1139 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1140 * in ip_conntrack_core, since we don't want the protocols to autoload
1141 * or depend on ctnetlink */
1142 int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1143 const struct nf_conntrack_tuple *tuple)
1145 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1146 &tuple->src.u.tcp.port);
1147 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1148 &tuple->dst.u.tcp.port);
1149 return 0;
1151 nfattr_failure:
1152 return -1;
1154 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nfattr);
1156 static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1157 [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
1158 [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t)
1161 int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1162 struct nf_conntrack_tuple *t)
1164 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1165 return -EINVAL;
1167 if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1168 return -EINVAL;
1170 t->src.u.tcp.port = *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1171 t->dst.u.tcp.port = *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1173 return 0;
1175 EXPORT_SYMBOL_GPL(nf_ct_port_nfattr_to_tuple);
1176 #endif
1178 /* Used by ipt_REJECT and ip6t_REJECT. */
1179 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1181 struct nf_conn *ct;
1182 enum ip_conntrack_info ctinfo;
1184 /* This ICMP is in the reverse direction to the packet that caused it */
1185 ct = nf_ct_get(skb, &ctinfo);
1186 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1187 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1188 else
1189 ctinfo = IP_CT_RELATED;
1191 /* Attach to new skbuff, and increment count */
1192 nskb->nfct = &ct->ct_general;
1193 nskb->nfctinfo = ctinfo;
1194 nf_conntrack_get(nskb->nfct);
1196 EXPORT_SYMBOL_GPL(__nf_conntrack_attach);
1198 static inline int
1199 do_iter(const struct nf_conntrack_tuple_hash *i,
1200 int (*iter)(struct nf_conn *i, void *data),
1201 void *data)
1203 return iter(nf_ct_tuplehash_to_ctrack(i), data);
1206 /* Bring out ya dead! */
1207 static struct nf_conn *
1208 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1209 void *data, unsigned int *bucket)
1211 struct nf_conntrack_tuple_hash *h;
1212 struct nf_conn *ct;
1214 write_lock_bh(&nf_conntrack_lock);
1215 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1216 list_for_each_entry(h, &nf_conntrack_hash[*bucket], list) {
1217 ct = nf_ct_tuplehash_to_ctrack(h);
1218 if (iter(ct, data))
1219 goto found;
1222 list_for_each_entry(h, &unconfirmed, list) {
1223 ct = nf_ct_tuplehash_to_ctrack(h);
1224 if (iter(ct, data))
1225 set_bit(IPS_DYING_BIT, &ct->status);
1227 write_unlock_bh(&nf_conntrack_lock);
1228 return NULL;
1229 found:
1230 atomic_inc(&ct->ct_general.use);
1231 write_unlock_bh(&nf_conntrack_lock);
1232 return ct;
1235 void
1236 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1238 struct nf_conn *ct;
1239 unsigned int bucket = 0;
1241 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1242 #ifdef HNDCTF
1243 ip_conntrack_ipct_delete(ct, 0);
1244 #endif /* HNDCTF */
1245 /* Time to push up daisies... */
1246 if (del_timer(&ct->timeout))
1247 death_by_timeout((unsigned long)ct);
1248 /* ... else the timer will get him soon. */
1250 nf_ct_put(ct);
1253 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
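/*
 * nf_ct_iterate_cleanup() repeatedly pulls one matching, referenced entry
 * out of the table (unconfirmed matches are only marked dying) and kills
 * it by firing its timer, so the iterator callback only has to decide
 * whether a given conntrack should go.
 */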
1255 static int kill_all(struct nf_conn *i, void *data)
1257 return 1;
1260 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1262 if (vmalloced)
1263 vfree(hash);
1264 else
1265 free_pages((unsigned long)hash,
1266 get_order(sizeof(struct list_head) * size));
1269 void nf_conntrack_flush(void)
1271 nf_ct_iterate_cleanup(kill_all, NULL);
1273 EXPORT_SYMBOL_GPL(nf_conntrack_flush);
1275 /* Mishearing the voices in his head, our hero wonders how he's
1276 supposed to kill the mall. */
1277 void nf_conntrack_cleanup(void)
1279 int i;
1281 rcu_assign_pointer(ip_ct_attach, NULL);
1283 /* This makes sure all current packets have passed through
1284 netfilter framework. Roll on, two-stage module
1285 delete... */
1286 synchronize_net();
1288 nf_ct_event_cache_flush();
1289 i_see_dead_people:
1290 nf_conntrack_flush();
1291 if (atomic_read(&nf_conntrack_count) != 0) {
1292 schedule();
1293 goto i_see_dead_people;
1295 /* wait until all references to nf_conntrack_untracked are dropped */
1296 while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1297 schedule();
1299 rcu_assign_pointer(nf_ct_destroy, NULL);
1301 for (i = 0; i < NF_CT_F_NUM; i++) {
1302 if (nf_ct_cache[i].use == 0)
1303 continue;
1305 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1306 nf_ct_cache[i].use = 1;
1307 nf_conntrack_unregister_cache(i);
1309 kmem_cache_destroy(nf_conntrack_expect_cachep);
1310 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1311 nf_conntrack_htable_size);
1313 nf_conntrack_proto_fini();
1316 static struct list_head *alloc_hashtable(int *sizep, int *vmalloced)
1318 struct list_head *hash;
1319 unsigned int size, i;
1321 *vmalloced = 0;
1323 size = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct list_head));
1324 hash = (void*)__get_free_pages(GFP_KERNEL|__GFP_NOWARN,
1325 get_order(sizeof(struct list_head)
1326 * size));
1327 if (!hash) {
1328 *vmalloced = 1;
1329 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1330 hash = vmalloc(sizeof(struct list_head) * size);
1333 if (hash)
1334 for (i = 0; i < size; i++)
1335 INIT_LIST_HEAD(&hash[i]);
1337 return hash;
1340 int set_hashsize(const char *val, struct kernel_param *kp)
1342 int i, bucket, hashsize, vmalloced;
1343 int old_vmalloced, old_size;
1344 int rnd;
1345 struct list_head *hash, *old_hash;
1346 struct nf_conntrack_tuple_hash *h;
1348 /* On boot, we can set this without any fancy locking. */
1349 if (!nf_conntrack_htable_size)
1350 return param_set_uint(val, kp);
1352 hashsize = simple_strtol(val, NULL, 0);
1353 if (!hashsize)
1354 return -EINVAL;
1356 hash = alloc_hashtable(&hashsize, &vmalloced);
1357 if (!hash)
1358 return -ENOMEM;
1360 /* We have to rehash for the new table anyway, so we can also
1361 * use a new random seed */
1362 get_random_bytes(&rnd, 4);
1364 write_lock_bh(&nf_conntrack_lock);
1365 for (i = 0; i < nf_conntrack_htable_size; i++) {
1366 while (!list_empty(&nf_conntrack_hash[i])) {
1367 h = list_entry(nf_conntrack_hash[i].next,
1368 struct nf_conntrack_tuple_hash, list);
1369 list_del(&h->list);
1370 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1371 list_add_tail(&h->list, &hash[bucket]);
1374 old_size = nf_conntrack_htable_size;
1375 old_vmalloced = nf_conntrack_vmalloc;
1376 old_hash = nf_conntrack_hash;
1378 nf_conntrack_htable_size = hashsize;
1379 nf_conntrack_vmalloc = vmalloced;
1380 nf_conntrack_hash = hash;
1381 nf_conntrack_hash_rnd = rnd;
1382 write_unlock_bh(&nf_conntrack_lock);
1384 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1385 return 0;
1388 module_param_call(hashsize, set_hashsize, param_get_uint,
1389 &nf_conntrack_htable_size, 0600);
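/*
 * The hash table can be resized at runtime through the "hashsize" module
 * parameter; a typical invocation (assuming sysfs and the standard
 * parameter path for nf_conntrack) is:
 *
 *	echo 32768 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * set_hashsize() then allocates the new table, rehashes every entry under
 * the write lock with a fresh random seed, and frees the old table.
 */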
1391 s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
1392 enum ip_conntrack_dir dir,
1393 u32 seq);
1394 EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
1396 int __init nf_conntrack_init(void)
1398 int ret;
1400 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1401 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1402 if (!nf_conntrack_htable_size) {
1403 nf_conntrack_htable_size
1404 = (((num_physpages << PAGE_SHIFT) / 16384)
1405 / sizeof(struct list_head));
1406 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1407 nf_conntrack_htable_size = 8192;
1408 if (nf_conntrack_htable_size < 16)
1409 nf_conntrack_htable_size = 16;
1412 nf_conntrack_hash = alloc_hashtable(&nf_conntrack_htable_size,
1413 &nf_conntrack_vmalloc);
1414 if (!nf_conntrack_hash) {
1415 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1416 goto err_out;
1419 nf_conntrack_max = 8 * nf_conntrack_htable_size;
1421 printk("nf_conntrack version %s (%u buckets, %d max)\n",
1422 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1423 nf_conntrack_max);
1425 ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1426 sizeof(struct nf_conn));
1427 if (ret < 0) {
1428 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1429 goto err_free_hash;
1432 nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1433 sizeof(struct nf_conntrack_expect),
1434 0, 0, NULL, NULL);
1435 if (!nf_conntrack_expect_cachep) {
1436 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1437 goto err_free_conntrack_slab;
1440 ret = nf_conntrack_proto_init();
1441 if (ret < 0)
1442 goto out_free_expect_slab;
1444 /* For use by REJECT target */
1445 rcu_assign_pointer(ip_ct_attach, __nf_conntrack_attach);
1446 rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
1448 /* How to get NAT offsets */
1449 rcu_assign_pointer(nf_ct_nat_offset, NULL);
1451 /* Set up fake conntrack:
1452 - to never be deleted, not in any hashes */
1453 atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1454 /* - and make it look like a confirmed connection */
1455 set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1457 return ret;
1459 out_free_expect_slab:
1460 kmem_cache_destroy(nf_conntrack_expect_cachep);
1461 err_free_conntrack_slab:
1462 nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1463 err_free_hash:
1464 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1465 nf_conntrack_htable_size);
1466 err_out:
1467 return -ENOMEM;