MOXA linux-2.6.x / linux-2.6.19-uc1 from UC-7110-LX-BOOTLOADER-1.9_VERSION-4.2.tgz
[linux-2.6.19-moxart.git] / net / ipv4 / netfilter / ip_conntrack_core.c
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42    registrations, conntrack timers. */
43 #define ASSERT_READ_LOCK(x)
44 #define ASSERT_WRITE_LOCK(x)
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #define IP_CONNTRACK_VERSION "2.4"
53 #if 0
54 #define DEBUGP printk
55 #else
56 #define DEBUGP(format, args...)
57 #endif
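/* Note: with the "#if 0" above, DEBUGP compiles away to nothing; flipping it
 * to "#if 1" turns every DEBUGP into a plain printk for debugging. */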
59 DEFINE_RWLOCK(ip_conntrack_lock);
61 /* ip_conntrack_standalone needs this */
62 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
64 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
65 LIST_HEAD(ip_conntrack_expect_list);
66 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
67 static LIST_HEAD(helpers);
68 unsigned int ip_conntrack_htable_size __read_mostly = 0;
69 int ip_conntrack_max __read_mostly;
70 struct list_head *ip_conntrack_hash __read_mostly;
71 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
72 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
73 struct ip_conntrack ip_conntrack_untracked;
74 unsigned int ip_ct_log_invalid __read_mostly;
75 static LIST_HEAD(unconfirmed);
76 static int ip_conntrack_vmalloc __read_mostly;
78 static unsigned int ip_conntrack_next_id;
79 static unsigned int ip_conntrack_expect_next_id;
80 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
81 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
82 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
84 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
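/* Each CPU keeps a one-slot event cache: events for the conntrack currently
 * being processed are OR'ed into ecache->events and only pushed through the
 * notifier chain when the slot is reused for another conntrack, when the
 * packet is done with (ip_ct_deliver_cached_events) or when the cache is
 * flushed at cleanup time.  This batches several state changes caused by one
 * packet into a single notification. */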
86 /* deliver cached events and clear cache entry - must be called with locally
87 * disabled softirqs */
88 static inline void
89 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
91 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
92 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
93 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
94 ecache->ct);
95 ecache->events = 0;
96 ip_conntrack_put(ecache->ct);
97 ecache->ct = NULL;
100 /* Deliver all cached events for a particular conntrack. This is called
101 * by code prior to async packet handling or freeing the skb */
102 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
104 struct ip_conntrack_ecache *ecache;
106 local_bh_disable();
107 ecache = &__get_cpu_var(ip_conntrack_ecache);
108 if (ecache->ct == ct)
109 __ip_ct_deliver_cached_events(ecache);
110 local_bh_enable();
113 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
115 struct ip_conntrack_ecache *ecache;
117 /* take care of delivering potentially old events */
118 ecache = &__get_cpu_var(ip_conntrack_ecache);
119 BUG_ON(ecache->ct == ct);
120 if (ecache->ct)
121 __ip_ct_deliver_cached_events(ecache);
122 /* initialize for this conntrack/packet */
123 ecache->ct = ct;
124 nf_conntrack_get(&ct->ct_general);
127 /* flush the event cache - touches other CPU's data and must not be called while
128 * packets are still passing through the code */
129 static void ip_ct_event_cache_flush(void)
131 struct ip_conntrack_ecache *ecache;
132 int cpu;
134 for_each_possible_cpu(cpu) {
135 ecache = &per_cpu(ip_conntrack_ecache, cpu);
136 if (ecache->ct)
137 ip_conntrack_put(ecache->ct);
140 #else
141 static inline void ip_ct_event_cache_flush(void) {}
142 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
144 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
146 static int ip_conntrack_hash_rnd_initted;
147 static unsigned int ip_conntrack_hash_rnd;
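/* The hash folds both addresses, both ports and the protocol number into one
 * jhash_3words() call: dst.ip is XOR'ed with the protocol number and the two
 * 16-bit port/id fields are packed into a single 32-bit word.  The random
 * seed ip_conntrack_hash_rnd (picked when the first conntrack is allocated,
 * or afresh when the table is resized) makes bucket placement hard to predict
 * from outside, so a remote sender cannot deliberately pile flows into one
 * chain. */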
149 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
150 unsigned int size, unsigned int rnd)
152 return (jhash_3words((__force u32)tuple->src.ip,
153 ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
154 (tuple->src.u.all | (tuple->dst.u.all << 16)),
155 rnd) % size);
158 static u_int32_t
159 hash_conntrack(const struct ip_conntrack_tuple *tuple)
161 return __hash_conntrack(tuple, ip_conntrack_htable_size,
162 ip_conntrack_hash_rnd);
166 static int ip_ct_get_tuple(const struct iphdr *iph,
167 const struct sk_buff *skb,
168 unsigned int dataoff,
169 struct ip_conntrack_tuple *tuple,
170 const struct ip_conntrack_protocol *protocol)
172 	/* Never happens. */
173 if (iph->frag_off & htons(IP_OFFSET)) {
174 printk("ip_conntrack_core: Frag of proto %u.\n",
175 iph->protocol);
176 return 0;
179 tuple->src.ip = iph->saddr;
180 tuple->dst.ip = iph->daddr;
181 tuple->dst.protonum = iph->protocol;
182 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
184 return protocol->pkt_to_tuple(skb, dataoff, tuple);
188 static int ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
189 const struct ip_conntrack_tuple *orig,
190 const struct ip_conntrack_protocol *protocol)
192 inverse->src.ip = orig->dst.ip;
193 inverse->dst.ip = orig->src.ip;
194 inverse->dst.protonum = orig->dst.protonum;
195 inverse->dst.dir = !orig->dst.dir;
197 return protocol->invert_tuple(inverse, orig);
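/* Example (addresses made up for illustration): the original tuple
 * 192.0.2.1:1025 -> 198.51.100.7:80/TCP inverts to
 * 198.51.100.7:80 -> 192.0.2.1:1025/TCP.  The IPs are swapped here, dst.dir
 * is flipped, and the protocol's invert_tuple() swaps the per-protocol part
 * (the ports). */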
201 /* ip_conntrack_expect helper functions */
202 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
204 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
205 IP_NF_ASSERT(!timer_pending(&exp->timeout));
206 list_del(&exp->list);
207 CONNTRACK_STAT_INC(expect_delete);
208 exp->master->expecting--;
209 ip_conntrack_expect_put(exp);
212 static void expectation_timed_out(unsigned long ul_expect)
214 struct ip_conntrack_expect *exp = (void *)ul_expect;
216 write_lock_bh(&ip_conntrack_lock);
217 ip_ct_unlink_expect(exp);
218 write_unlock_bh(&ip_conntrack_lock);
219 ip_conntrack_expect_put(exp);
222 struct ip_conntrack_expect *
223 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
225 struct ip_conntrack_expect *i;
227 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
228 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
229 return i;
231 return NULL;
234 /* Just find an expectation corresponding to a tuple. */
235 struct ip_conntrack_expect *
236 ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
238 struct ip_conntrack_expect *i;
240 read_lock_bh(&ip_conntrack_lock);
241 i = __ip_conntrack_expect_find(tuple);
242 if (i)
243 atomic_inc(&i->use);
244 read_unlock_bh(&ip_conntrack_lock);
246 return i;
249 /* If an expectation for this connection is found, it is deleted from the
250  * global list and then returned. */
251 static struct ip_conntrack_expect *
252 find_expectation(const struct ip_conntrack_tuple *tuple)
254 struct ip_conntrack_expect *i;
256 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
257 		/* If the master is not in the hash table yet (i.e. the packet
258 		   hasn't left this machine yet), the other end cannot know
259 		   about the expectation.  Hence these are not the droids you
260 		   are looking for (if the master ct never got confirmed, we'd
261 		   hold a reference to it and weird things would happen to
262 		   future packets). */
262 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
263 && is_confirmed(i->master)) {
264 if (i->flags & IP_CT_EXPECT_PERMANENT) {
265 atomic_inc(&i->use);
266 return i;
267 } else if (del_timer(&i->timeout)) {
268 ip_ct_unlink_expect(i);
269 return i;
273 return NULL;
276 /* delete all expectations for this conntrack */
277 void ip_ct_remove_expectations(struct ip_conntrack *ct)
279 struct ip_conntrack_expect *i, *tmp;
281 	/* Optimization: most connections never expect any others. */
282 if (ct->expecting == 0)
283 return;
285 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
286 if (i->master == ct && del_timer(&i->timeout)) {
287 ip_ct_unlink_expect(i);
288 ip_conntrack_expect_put(i);
293 static void
294 clean_from_lists(struct ip_conntrack *ct)
296 DEBUGP("clean_from_lists(%p)\n", ct);
297 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
298 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
299 list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
301 /* Destroy all pending expectations */
302 ip_ct_remove_expectations(ct);
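/* destroy_conntrack() is the ->destroy callback installed by
 * ip_conntrack_alloc(); it runs when the last reference is dropped via
 * nf_conntrack_put()/ip_conntrack_put().  It fires the IPCT_DESTROY event,
 * lets the helper and the L4 protocol release their private state, drops any
 * remaining expectations and finally returns the entry to the slab cache. */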
305 static void
306 destroy_conntrack(struct nf_conntrack *nfct)
308 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
309 struct ip_conntrack_protocol *proto;
310 struct ip_conntrack_helper *helper;
312 DEBUGP("destroy_conntrack(%p)\n", ct);
313 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
314 IP_NF_ASSERT(!timer_pending(&ct->timeout));
316 ip_conntrack_event(IPCT_DESTROY, ct);
317 set_bit(IPS_DYING_BIT, &ct->status);
319 helper = ct->helper;
320 if (helper && helper->destroy)
321 helper->destroy(ct);
323 /* To make sure we don't get any weird locking issues here:
324 * destroy_conntrack() MUST NOT be called with a write lock
325 * to ip_conntrack_lock!!! -HW */
326 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
327 if (proto && proto->destroy)
328 proto->destroy(ct);
330 if (ip_conntrack_destroyed)
331 ip_conntrack_destroyed(ct);
333 write_lock_bh(&ip_conntrack_lock);
334 	/* Expectations will have been removed in clean_from_lists,
335 	 * except that TFTP can create an expectation on the first packet,
336 	 * before the connection is in the list, so we need to clean here,
337 	 * too. */
338 ip_ct_remove_expectations(ct);
340 #if defined(CONFIG_IP_NF_MATCH_LAYER7) || defined(CONFIG_IP_NF_MATCH_LAYER7_MODULE)
341 if(ct->layer7.app_proto)
342 kfree(ct->layer7.app_proto);
343 if(ct->layer7.app_data)
344 kfree(ct->layer7.app_data);
345 #endif
347 /* We overload first tuple to link into unconfirmed list. */
348 if (!is_confirmed(ct)) {
349 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
350 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
353 CONNTRACK_STAT_INC(delete);
354 write_unlock_bh(&ip_conntrack_lock);
356 if (ct->master)
357 ip_conntrack_put(ct->master);
359 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
360 ip_conntrack_free(ct);
363 static void death_by_timeout(unsigned long ul_conntrack)
365 struct ip_conntrack *ct = (void *)ul_conntrack;
367 write_lock_bh(&ip_conntrack_lock);
368 /* Inside lock so preempt is disabled on module removal path.
369 * Otherwise we can get spurious warnings. */
370 CONNTRACK_STAT_INC(delete_list);
371 clean_from_lists(ct);
372 write_unlock_bh(&ip_conntrack_lock);
373 ip_conntrack_put(ct);
376 struct ip_conntrack_tuple_hash *
377 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
378 const struct ip_conntrack *ignored_conntrack)
380 struct ip_conntrack_tuple_hash *h;
381 unsigned int hash = hash_conntrack(tuple);
383 ASSERT_READ_LOCK(&ip_conntrack_lock);
384 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
385 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
386 ip_ct_tuple_equal(tuple, &h->tuple)) {
387 CONNTRACK_STAT_INC(found);
388 return h;
390 CONNTRACK_STAT_INC(searched);
393 return NULL;
396 /* Find a connection corresponding to a tuple. */
397 struct ip_conntrack_tuple_hash *
398 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
399 const struct ip_conntrack *ignored_conntrack)
401 struct ip_conntrack_tuple_hash *h;
403 read_lock_bh(&ip_conntrack_lock);
404 h = __ip_conntrack_find(tuple, ignored_conntrack);
405 if (h)
406 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
407 read_unlock_bh(&ip_conntrack_lock);
409 return h;
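/* A conntrack is hashed twice, once under its ORIGINAL tuple and once under
 * its REPLY tuple, so __ip_conntrack_find() resolves packets from either
 * direction to the same entry. */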
412 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
413 unsigned int hash,
414 unsigned int repl_hash)
416 ct->id = ++ip_conntrack_next_id;
417 list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
418 &ip_conntrack_hash[hash]);
419 list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
420 &ip_conntrack_hash[repl_hash]);
423 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
425 unsigned int hash, repl_hash;
427 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
428 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
430 write_lock_bh(&ip_conntrack_lock);
431 __ip_conntrack_hash_insert(ct, hash, repl_hash);
432 write_unlock_bh(&ip_conntrack_lock);
435 /* Confirm a connection given skb; places it in hash table */
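/* Confirmation happens on the first packet of a connection as it leaves the
 * box: both hash chains are rechecked for an entry that raced us in (NAT may
 * have created one for the reverse direction), the conntrack is unlinked from
 * the unconfirmed list, its timeout timer is started relative to now, an
 * extra reference is taken for the hash table and IPS_CONFIRMED is set. */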
437 int __ip_conntrack_confirm(struct sk_buff **pskb)
439 unsigned int hash, repl_hash;
440 struct ip_conntrack_tuple_hash *h;
441 struct ip_conntrack *ct;
442 enum ip_conntrack_info ctinfo;
444 ct = ip_conntrack_get(*pskb, &ctinfo);
446 /* ipt_REJECT uses ip_conntrack_attach to attach related
447 ICMP/TCP RST packets in other direction. Actual packet
448 which created connection will be IP_CT_NEW or for an
449 expected connection, IP_CT_RELATED. */
450 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
451 return NF_ACCEPT;
453 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
454 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
456 /* We're not in hash table, and we refuse to set up related
457 connections for unconfirmed conns. But packet copies and
458 REJECT will give spurious warnings here. */
459 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
461 	/* No external references means no one else could have
462 	   confirmed us. */
463 IP_NF_ASSERT(!is_confirmed(ct));
464 DEBUGP("Confirming conntrack %p\n", ct);
466 write_lock_bh(&ip_conntrack_lock);
468 /* See if there's one in the list already, including reverse:
469 NAT could have grabbed it without realizing, since we're
470 not in the hash. If there is, we lost race. */
471 list_for_each_entry(h, &ip_conntrack_hash[hash], list)
472 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
473 &h->tuple))
474 goto out;
475 list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
476 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
477 &h->tuple))
478 goto out;
480 /* Remove from unconfirmed list */
481 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
483 __ip_conntrack_hash_insert(ct, hash, repl_hash);
484 /* Timer relative to confirmation time, not original
485 setting time, otherwise we'd get timer wrap in
486 weird delay cases. */
487 ct->timeout.expires += jiffies;
488 add_timer(&ct->timeout);
489 atomic_inc(&ct->ct_general.use);
490 set_bit(IPS_CONFIRMED_BIT, &ct->status);
491 CONNTRACK_STAT_INC(insert);
492 write_unlock_bh(&ip_conntrack_lock);
493 if (ct->helper)
494 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
495 #ifdef CONFIG_IP_NF_NAT_NEEDED
496 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
497 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
498 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
499 #endif
500 ip_conntrack_event_cache(master_ct(ct) ?
501 IPCT_RELATED : IPCT_NEW, *pskb);
503 return NF_ACCEPT;
505 out:
506 CONNTRACK_STAT_INC(insert_failed);
507 write_unlock_bh(&ip_conntrack_lock);
508 return NF_DROP;
511 /* Returns true if a connection corresponds to the tuple (required
512    for NAT). */
514 int ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
515 const struct ip_conntrack *ignored_conntrack)
517 struct ip_conntrack_tuple_hash *h;
519 read_lock_bh(&ip_conntrack_lock);
520 h = __ip_conntrack_find(tuple, ignored_conntrack);
521 read_unlock_bh(&ip_conntrack_lock);
523 return h != NULL;
526 /* There's a small race here where we may free a just-assured
527 connection. Too bad: we're in trouble anyway. */
528 static int early_drop(struct list_head *chain)
530 /* Traverse backwards: gives us oldest, which is roughly LRU */
531 struct ip_conntrack_tuple_hash *h;
532 struct ip_conntrack *ct = NULL, *tmp;
533 int dropped = 0;
535 read_lock_bh(&ip_conntrack_lock);
536 list_for_each_entry_reverse(h, chain, list) {
537 tmp = tuplehash_to_ctrack(h);
538 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
539 ct = tmp;
540 atomic_inc(&ct->ct_general.use);
541 break;
544 read_unlock_bh(&ip_conntrack_lock);
546 if (!ct)
547 return dropped;
549 if (del_timer(&ct->timeout)) {
550 death_by_timeout((unsigned long)ct);
551 dropped = 1;
552 CONNTRACK_STAT_INC(early_drop);
554 ip_conntrack_put(ct);
555 return dropped;
558 static struct ip_conntrack_helper *
559 __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
561 struct ip_conntrack_helper *h;
563 list_for_each_entry(h, &helpers, list) {
564 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
565 return h;
567 return NULL;
570 struct ip_conntrack_helper *
571 ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
573 struct ip_conntrack_helper *helper;
575 /* need ip_conntrack_lock to assure that helper exists until
576 * try_module_get() is called */
577 read_lock_bh(&ip_conntrack_lock);
579 helper = __ip_conntrack_helper_find(tuple);
580 if (helper) {
581 /* need to increase module usage count to assure helper will
582 * not go away while the caller is e.g. busy putting a
583 * conntrack in the hash that uses the helper */
584 if (!try_module_get(helper->me))
585 helper = NULL;
588 read_unlock_bh(&ip_conntrack_lock);
590 return helper;
593 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
595 module_put(helper->me);
598 struct ip_conntrack_protocol *
599 __ip_conntrack_proto_find(u_int8_t protocol)
601 return ip_ct_protos[protocol];
604 /* this is guaranteed to always return a valid protocol helper, since
605 * it falls back to generic_protocol */
606 struct ip_conntrack_protocol *
607 ip_conntrack_proto_find_get(u_int8_t protocol)
609 struct ip_conntrack_protocol *p;
611 preempt_disable();
612 p = __ip_conntrack_proto_find(protocol);
613 if (p) {
614 if (!try_module_get(p->me))
615 p = &ip_conntrack_generic_protocol;
617 preempt_enable();
619 return p;
622 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
624 module_put(p->me);
627 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
628 struct ip_conntrack_tuple *repl)
630 struct ip_conntrack *conntrack;
632 if (!ip_conntrack_hash_rnd_initted) {
633 get_random_bytes(&ip_conntrack_hash_rnd, 4);
634 ip_conntrack_hash_rnd_initted = 1;
637 /* We don't want any race condition at early drop stage */
638 atomic_inc(&ip_conntrack_count);
640 if (ip_conntrack_max
641 && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
642 unsigned int hash = hash_conntrack(orig);
643 /* Try dropping from this hash chain. */
644 if (!early_drop(&ip_conntrack_hash[hash])) {
645 atomic_dec(&ip_conntrack_count);
646 if (net_ratelimit())
647 printk(KERN_WARNING
648 "ip_conntrack: table full, dropping"
649 " packet.\n");
650 return ERR_PTR(-ENOMEM);
654 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
655 if (!conntrack) {
656 DEBUGP("Can't allocate conntrack.\n");
657 atomic_dec(&ip_conntrack_count);
658 return ERR_PTR(-ENOMEM);
661 memset(conntrack, 0, sizeof(*conntrack));
662 atomic_set(&conntrack->ct_general.use, 1);
663 conntrack->ct_general.destroy = destroy_conntrack;
664 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
665 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
666 /* Don't set timer yet: wait for confirmation */
667 init_timer(&conntrack->timeout);
668 conntrack->timeout.data = (unsigned long)conntrack;
669 conntrack->timeout.function = death_by_timeout;
671 return conntrack;
674 void
675 ip_conntrack_free(struct ip_conntrack *conntrack)
677 atomic_dec(&ip_conntrack_count);
678 kmem_cache_free(ip_conntrack_cachep, conntrack);
681 /* Allocate a new conntrack: we return -ENOMEM if classification
682 * failed due to stress. Otherwise it really is unclassifiable */
683 static struct ip_conntrack_tuple_hash *
684 init_conntrack(struct ip_conntrack_tuple *tuple,
685 struct ip_conntrack_protocol *protocol,
686 struct sk_buff *skb)
688 struct ip_conntrack *conntrack;
689 struct ip_conntrack_tuple repl_tuple;
690 struct ip_conntrack_expect *exp;
692 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
693 DEBUGP("Can't invert tuple.\n");
694 return NULL;
697 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
698 if (conntrack == NULL || IS_ERR(conntrack))
699 return (struct ip_conntrack_tuple_hash *)conntrack;
701 if (!protocol->new(conntrack, skb)) {
702 ip_conntrack_free(conntrack);
703 return NULL;
706 write_lock_bh(&ip_conntrack_lock);
707 exp = find_expectation(tuple);
709 if (exp) {
710 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
711 conntrack, exp);
712 /* Welcome, Mr. Bond. We've been expecting you... */
713 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
714 conntrack->master = exp->master;
715 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
716 conntrack->mark = exp->master->mark;
717 #endif
718 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
719 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
720 		/* this is ugly, but there is no other place to put it */
721 conntrack->nat.masq_index = exp->master->nat.masq_index;
722 #endif
723 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
724 conntrack->secmark = exp->master->secmark;
725 #endif
726 nf_conntrack_get(&conntrack->master->ct_general);
727 CONNTRACK_STAT_INC(expect_new);
728 } else {
729 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
731 CONNTRACK_STAT_INC(new);
734 /* Overload tuple linked list to put us in unconfirmed list. */
735 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
737 write_unlock_bh(&ip_conntrack_lock);
739 if (exp) {
740 if (exp->expectfn)
741 exp->expectfn(conntrack, exp);
742 ip_conntrack_expect_put(exp);
745 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
748 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
749 static inline struct ip_conntrack *
750 resolve_normal_ct(struct sk_buff *skb,
751 struct ip_conntrack_protocol *proto,
752 int *set_reply,
753 unsigned int hooknum,
754 enum ip_conntrack_info *ctinfo)
756 struct ip_conntrack_tuple tuple;
757 struct ip_conntrack_tuple_hash *h;
758 struct ip_conntrack *ct;
760 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
762 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
763 &tuple,proto))
764 return NULL;
766 /* look for tuple match */
767 h = ip_conntrack_find_get(&tuple, NULL);
768 if (!h) {
769 h = init_conntrack(&tuple, proto, skb);
770 if (!h)
771 return NULL;
772 if (IS_ERR(h))
773 return (void *)h;
775 ct = tuplehash_to_ctrack(h);
777 /* It exists; we have (non-exclusive) reference. */
778 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
779 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
780 		/* Please set the reply bit if this packet is OK */
781 *set_reply = 1;
782 } else {
783 /* Once we've had two way comms, always ESTABLISHED. */
784 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
785 DEBUGP("ip_conntrack_in: normal packet for %p\n",
786 ct);
787 *ctinfo = IP_CT_ESTABLISHED;
788 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
789 DEBUGP("ip_conntrack_in: related packet for %p\n",
790 ct);
791 *ctinfo = IP_CT_RELATED;
792 } else {
793 DEBUGP("ip_conntrack_in: new packet for %p\n",
794 ct);
795 *ctinfo = IP_CT_NEW;
797 *set_reply = 0;
799 skb->nfct = &ct->ct_general;
800 skb->nfctinfo = *ctinfo;
801 return ct;
804 /* Netfilter hook itself. */
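/* The hook body assumes defragmentation has already happened (fragments are
 * dropped below).  Flow: look up the L4 protocol handler, let its ->error()
 * reject malformed packets, find or create a conntrack with
 * resolve_normal_ct(), run the protocol's per-packet state machine and cache
 * a status event once a reply has been seen. */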
805 unsigned int ip_conntrack_in(unsigned int hooknum,
806 struct sk_buff **pskb,
807 const struct net_device *in,
808 const struct net_device *out,
809 int (*okfn)(struct sk_buff *))
811 struct ip_conntrack *ct;
812 enum ip_conntrack_info ctinfo;
813 struct ip_conntrack_protocol *proto;
814 int set_reply = 0;
815 int ret;
817 /* Previously seen (loopback or untracked)? Ignore. */
818 if ((*pskb)->nfct) {
819 CONNTRACK_STAT_INC(ignore);
820 return NF_ACCEPT;
823 	/* Never happens. */
824 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
825 if (net_ratelimit()) {
826 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
827 (*pskb)->nh.iph->protocol, hooknum);
829 return NF_DROP;
832 /* Doesn't cover locally-generated broadcast, so not worth it. */
833 #if 0
834 /* Ignore broadcast: no `connection'. */
835 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
836 printk("Broadcast packet!\n");
837 return NF_ACCEPT;
838 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
839 == htonl(0x000000FF)) {
840 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
841 NIPQUAD((*pskb)->nh.iph->saddr),
842 NIPQUAD((*pskb)->nh.iph->daddr),
843 (*pskb)->sk, (*pskb)->pkt_type);
845 #endif
847 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
849 	/* It may be a special packet, an error, unclean...
850 	 * the inverse of the return code tells the netfilter
851 	 * core what to do with the packet. */
852 if (proto->error != NULL
853 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
854 CONNTRACK_STAT_INC(error);
855 CONNTRACK_STAT_INC(invalid);
856 return -ret;
859 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
860 /* Not valid part of a connection */
861 CONNTRACK_STAT_INC(invalid);
862 return NF_ACCEPT;
865 if (IS_ERR(ct)) {
866 /* Too stressed to deal. */
867 CONNTRACK_STAT_INC(drop);
868 return NF_DROP;
871 IP_NF_ASSERT((*pskb)->nfct);
873 ret = proto->packet(ct, *pskb, ctinfo);
874 if (ret < 0) {
875 		/* Invalid: the inverse of the return code tells
876 		 * the netfilter core what to do. */
877 nf_conntrack_put((*pskb)->nfct);
878 (*pskb)->nfct = NULL;
879 CONNTRACK_STAT_INC(invalid);
880 return -ret;
883 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
884 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
886 return ret;
889 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
890 const struct ip_conntrack_tuple *orig)
892 return ip_ct_invert_tuple(inverse, orig,
893 __ip_conntrack_proto_find(orig->dst.protonum));
896 /* Would two expected things clash? */
897 static inline int expect_clash(const struct ip_conntrack_expect *a,
898 const struct ip_conntrack_expect *b)
900 /* Part covered by intersection of masks must be unequal,
901 otherwise they clash */
902 struct ip_conntrack_tuple intersect_mask
903 = { { a->mask.src.ip & b->mask.src.ip,
904 { a->mask.src.u.all & b->mask.src.u.all } },
905 { a->mask.dst.ip & b->mask.dst.ip,
906 { a->mask.dst.u.all & b->mask.dst.u.all },
907 a->mask.dst.protonum & b->mask.dst.protonum } };
909 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
912 static inline int expect_matches(const struct ip_conntrack_expect *a,
913 const struct ip_conntrack_expect *b)
915 return a->master == b->master
916 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
917 && ip_ct_tuple_equal(&a->mask, &b->mask);
920 /* Generally a bad idea to call this: could have matched already. */
921 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
923 struct ip_conntrack_expect *i;
925 write_lock_bh(&ip_conntrack_lock);
926 	/* choose the oldest expectation to evict */
927 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
928 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
929 ip_ct_unlink_expect(i);
930 write_unlock_bh(&ip_conntrack_lock);
931 ip_conntrack_expect_put(i);
932 return;
935 write_unlock_bh(&ip_conntrack_lock);
938 /* We don't increase the master conntrack refcount for non-fulfilled
939  * expectations. During conntrack destruction, the expectations are
940  * always killed before the conntrack itself. */
941 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
943 struct ip_conntrack_expect *new;
945 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
946 if (!new) {
947 DEBUGP("expect_related: OOM allocating expect\n");
948 return NULL;
950 new->master = me;
951 atomic_set(&new->use, 1);
952 return new;
955 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
957 if (atomic_dec_and_test(&exp->use))
958 kmem_cache_free(ip_conntrack_expect_cachep, exp);
961 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
963 atomic_inc(&exp->use);
964 exp->master->expecting++;
965 list_add(&exp->list, &ip_conntrack_expect_list);
967 init_timer(&exp->timeout);
968 exp->timeout.data = (unsigned long)exp;
969 exp->timeout.function = expectation_timed_out;
970 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
971 add_timer(&exp->timeout);
973 exp->id = ++ip_conntrack_expect_next_id;
974 atomic_inc(&exp->use);
975 CONNTRACK_STAT_INC(expect_create);
978 /* Race with expectations being used means we could have none to find; OK. */
979 static void evict_oldest_expect(struct ip_conntrack *master)
981 struct ip_conntrack_expect *i;
983 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
984 if (i->master == master) {
985 if (del_timer(&i->timeout)) {
986 ip_ct_unlink_expect(i);
987 ip_conntrack_expect_put(i);
989 break;
994 static inline int refresh_timer(struct ip_conntrack_expect *i)
996 if (!del_timer(&i->timeout))
997 return 0;
999 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1000 add_timer(&i->timeout);
1001 return 1;
1004 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1006 struct ip_conntrack_expect *i;
1007 int ret;
1009 DEBUGP("ip_conntrack_expect_related %p\n", related_to);
1010 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1011 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1013 write_lock_bh(&ip_conntrack_lock);
1014 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1015 if (expect_matches(i, expect)) {
1016 /* Refresh timer: if it's dying, ignore.. */
1017 if (refresh_timer(i)) {
1018 ret = 0;
1019 goto out;
1021 } else if (expect_clash(i, expect)) {
1022 ret = -EBUSY;
1023 goto out;
1027 /* Will be over limit? */
1028 if (expect->master->helper->max_expected &&
1029 expect->master->expecting >= expect->master->helper->max_expected)
1030 evict_oldest_expect(expect->master);
1032 ip_conntrack_expect_insert(expect);
1033 ip_conntrack_expect_event(IPEXP_NEW, expect);
1034 ret = 0;
1035 out:
1036 write_unlock_bh(&ip_conntrack_lock);
1037 return ret;
1040 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1041 implicitly racy: see __ip_conntrack_confirm */
1042 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1043 const struct ip_conntrack_tuple *newreply)
1045 write_lock_bh(&ip_conntrack_lock);
1046 /* Should be unconfirmed, so not in hash table yet */
1047 IP_NF_ASSERT(!is_confirmed(conntrack));
1049 DEBUGP("Altering reply tuple of %p to ", conntrack);
1050 DUMP_TUPLE(newreply);
1052 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1053 if (!conntrack->master && conntrack->expecting == 0)
1054 conntrack->helper = __ip_conntrack_helper_find(newreply);
1055 write_unlock_bh(&ip_conntrack_lock);
1058 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1060 BUG_ON(me->timeout == 0);
1061 write_lock_bh(&ip_conntrack_lock);
1062 list_add(&me->list, &helpers);
1063 write_unlock_bh(&ip_conntrack_lock);
1065 return 0;
1068 struct ip_conntrack_helper *
1069 __ip_conntrack_helper_find_byname(const char *name)
1071 struct ip_conntrack_helper *h;
1073 list_for_each_entry(h, &helpers, list) {
1074 if (!strcmp(h->name, name))
1075 return h;
1078 return NULL;
1081 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1082 const struct ip_conntrack_helper *me)
1084 if (tuplehash_to_ctrack(i)->helper == me) {
1085 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1086 tuplehash_to_ctrack(i)->helper = NULL;
1090 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1092 unsigned int i;
1093 struct ip_conntrack_tuple_hash *h;
1094 struct ip_conntrack_expect *exp, *tmp;
1096 /* Need write lock here, to delete helper. */
1097 write_lock_bh(&ip_conntrack_lock);
1098 list_del(&me->list);
1100 /* Get rid of expectations */
1101 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1102 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1103 ip_ct_unlink_expect(exp);
1104 ip_conntrack_expect_put(exp);
1107 /* Get rid of expecteds, set helpers to NULL. */
1108 list_for_each_entry(h, &unconfirmed, list)
1109 unhelp(h, me);
1110 for (i = 0; i < ip_conntrack_htable_size; i++) {
1111 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1112 unhelp(h, me);
1114 write_unlock_bh(&ip_conntrack_lock);
1116 	/* Someone could still be looking at the helper in a bh. */
1117 synchronize_net();
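/* Timeout handling below: for unconfirmed conntracks the relative value is
 * only stored (the timer is armed at confirmation time); for confirmed ones
 * the timer is re-armed to now + extra_jiffies, unless the entry has a fixed
 * timeout.  With CONFIG_IP_NF_CT_ACCT the per-direction packet/byte counters
 * are bumped as well and IPCT_COUNTER_FILLING is raised once their top bit
 * becomes set, so userspace can read them before they overflow. */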
1120 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1121 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1122 enum ip_conntrack_info ctinfo,
1123 const struct sk_buff *skb,
1124 unsigned long extra_jiffies,
1125 int do_acct)
1127 int event = 0;
1129 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1130 IP_NF_ASSERT(skb);
1132 write_lock_bh(&ip_conntrack_lock);
1134 /* Only update if this is not a fixed timeout */
1135 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1136 write_unlock_bh(&ip_conntrack_lock);
1137 return;
1140 /* If not in hash table, timer will not be active yet */
1141 if (!is_confirmed(ct)) {
1142 ct->timeout.expires = extra_jiffies;
1143 event = IPCT_REFRESH;
1144 } else {
1145 /* Need del_timer for race avoidance (may already be dying). */
1146 if (del_timer(&ct->timeout)) {
1147 ct->timeout.expires = jiffies + extra_jiffies;
1148 add_timer(&ct->timeout);
1149 event = IPCT_REFRESH;
1153 #ifdef CONFIG_IP_NF_CT_ACCT
1154 if (do_acct) {
1155 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1156 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1157 ntohs(skb->nh.iph->tot_len);
1158 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1159 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1160 event |= IPCT_COUNTER_FILLING;
1162 #endif
1164 write_unlock_bh(&ip_conntrack_lock);
1166 /* must be unlocked when calling event cache */
1167 if (event)
1168 ip_conntrack_event_cache(event, skb);
1171 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1172 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1173 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1174  * in ip_conntrack_core, since we don't want the protocols to autoload
1175  * or depend on ctnetlink. */
1176 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1177 const struct ip_conntrack_tuple *tuple)
1179 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1180 &tuple->src.u.tcp.port);
1181 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1182 &tuple->dst.u.tcp.port);
1183 return 0;
1185 nfattr_failure:
1186 return -1;
1189 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1190 struct ip_conntrack_tuple *t)
1192 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1193 return -EINVAL;
1195 t->src.u.tcp.port =
1196 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1197 t->dst.u.tcp.port =
1198 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1200 return 0;
1202 #endif
1204 /* Returns new sk_buff, or NULL */
1205 struct sk_buff *
1206 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1208 skb_orphan(skb);
1210 local_bh_disable();
1211 skb = ip_defrag(skb, user);
1212 local_bh_enable();
1214 if (skb)
1215 ip_send_check(skb->nh.iph);
1216 return skb;
1219 /* Used by ipt_REJECT. */
1220 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1222 struct ip_conntrack *ct;
1223 enum ip_conntrack_info ctinfo;
1225 /* This ICMP is in reverse direction to the packet which caused it */
1226 ct = ip_conntrack_get(skb, &ctinfo);
1228 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1229 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1230 else
1231 ctinfo = IP_CT_RELATED;
1233 /* Attach to new skbuff, and increment count */
1234 nskb->nfct = &ct->ct_general;
1235 nskb->nfctinfo = ctinfo;
1236 nf_conntrack_get(nskb->nfct);
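/* get_next_corpse()/ip_ct_iterate_cleanup() walk every hash bucket and the
 * unconfirmed list under the write lock, returning each conntrack the
 * predicate matches with an extra reference held, so the caller can safely
 * delete it (by cancelling its timer) outside the lock. */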
1239 /* Bring out ya dead! */
1240 static struct ip_conntrack *
1241 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1242 void *data, unsigned int *bucket)
1244 struct ip_conntrack_tuple_hash *h;
1245 struct ip_conntrack *ct;
1247 write_lock_bh(&ip_conntrack_lock);
1248 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1249 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1250 ct = tuplehash_to_ctrack(h);
1251 if (iter(ct, data))
1252 goto found;
1255 list_for_each_entry(h, &unconfirmed, list) {
1256 ct = tuplehash_to_ctrack(h);
1257 if (iter(ct, data))
1258 goto found;
1260 write_unlock_bh(&ip_conntrack_lock);
1261 return NULL;
1263 found:
1264 atomic_inc(&ct->ct_general.use);
1265 write_unlock_bh(&ip_conntrack_lock);
1266 return ct;
1269 void
1270 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1272 struct ip_conntrack *ct;
1273 unsigned int bucket = 0;
1275 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1276 		/* Time to push up daisies... */
1277 if (del_timer(&ct->timeout))
1278 death_by_timeout((unsigned long)ct);
1279 /* ... else the timer will get him soon. */
1281 ip_conntrack_put(ct);
1285 /* Fast function for those who don't want to parse /proc (and I don't
1286 blame them). */
1287 /* Reversing the socket's dst/src point of view gives us the reply
1288 mapping. */
1289 static int
1290 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1292 struct inet_sock *inet = inet_sk(sk);
1293 struct ip_conntrack_tuple_hash *h;
1294 struct ip_conntrack_tuple tuple;
1296 IP_CT_TUPLE_U_BLANK(&tuple);
1297 tuple.src.ip = inet->rcv_saddr;
1298 tuple.src.u.tcp.port = inet->sport;
1299 tuple.dst.ip = inet->daddr;
1300 tuple.dst.u.tcp.port = inet->dport;
1301 tuple.dst.protonum = IPPROTO_TCP;
1303 /* We only do TCP at the moment: is there a better way? */
1304 if (strcmp(sk->sk_prot->name, "TCP")) {
1305 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1306 return -ENOPROTOOPT;
1309 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1310 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1311 *len, sizeof(struct sockaddr_in));
1312 return -EINVAL;
1315 h = ip_conntrack_find_get(&tuple, NULL);
1316 if (h) {
1317 struct sockaddr_in sin;
1318 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1320 sin.sin_family = AF_INET;
1321 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1322 .tuple.dst.u.tcp.port;
1323 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1324 .tuple.dst.ip;
1325 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1327 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1328 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1329 ip_conntrack_put(ct);
1330 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1331 return -EFAULT;
1332 else
1333 return 0;
1335 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1336 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1337 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1338 return -ENOENT;
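/* Userspace reads the pre-NAT destination of a redirected TCP connection with
 * getsockopt(SOL_IP, SO_ORIGINAL_DST).  A minimal sketch (transparent-proxy
 * style, error handling omitted; client_fd is assumed to be the accepted
 * socket of the proxied connection):
 *
 *	struct sockaddr_in orig;
 *	socklen_t len = sizeof(orig);
 *	if (getsockopt(client_fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) == 0)
 *		printf("original dst %s:%u\n", inet_ntoa(orig.sin_addr),
 *		       ntohs(orig.sin_port));
 */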
1341 static struct nf_sockopt_ops so_getorigdst = {
1342 .pf = PF_INET,
1343 .get_optmin = SO_ORIGINAL_DST,
1344 .get_optmax = SO_ORIGINAL_DST+1,
1345 .get = &getorigdst,
1348 static int kill_all(struct ip_conntrack *i, void *data)
1350 return 1;
1353 void ip_conntrack_flush(void)
1355 ip_ct_iterate_cleanup(kill_all, NULL);
1358 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1360 if (vmalloced)
1361 vfree(hash);
1362 else
1363 free_pages((unsigned long)hash,
1364 get_order(sizeof(struct list_head) * size));
1367 /* Mishearing the voices in his head, our hero wonders how he's
1368 supposed to kill the mall. */
1369 void ip_conntrack_cleanup(void)
1371 ip_ct_attach = NULL;
1373 /* This makes sure all current packets have passed through
1374 netfilter framework. Roll on, two-stage module
1375 delete... */
1376 synchronize_net();
1378 ip_ct_event_cache_flush();
1379 i_see_dead_people:
1380 ip_conntrack_flush();
1381 if (atomic_read(&ip_conntrack_count) != 0) {
1382 schedule();
1383 goto i_see_dead_people;
1385 /* wait until all references to ip_conntrack_untracked are dropped */
1386 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1387 schedule();
1389 kmem_cache_destroy(ip_conntrack_cachep);
1390 kmem_cache_destroy(ip_conntrack_expect_cachep);
1391 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1392 ip_conntrack_htable_size);
1393 nf_unregister_sockopt(&so_getorigdst);
1396 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1398 struct list_head *hash;
1399 unsigned int i;
1401 *vmalloced = 0;
1402 hash = (void*)__get_free_pages(GFP_KERNEL,
1403 get_order(sizeof(struct list_head)
1404 * size));
1405 if (!hash) {
1406 *vmalloced = 1;
1407 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1408 hash = vmalloc(sizeof(struct list_head) * size);
1411 if (hash)
1412 for (i = 0; i < size; i++)
1413 INIT_LIST_HEAD(&hash[i]);
1415 return hash;
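/* set_hashsize() lets the table be resized at runtime through the "hashsize"
 * module parameter (mode 0600 below): a new table is allocated, every entry
 * is rehashed with a fresh random seed while the write lock is held, and the
 * old table is freed.  Typically driven by something like
 *	echo 16384 > /sys/module/ip_conntrack/parameters/hashsize
 * (path assumed; it depends on the module being loaded as ip_conntrack). */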
1418 static int set_hashsize(const char *val, struct kernel_param *kp)
1420 int i, bucket, hashsize, vmalloced;
1421 int old_vmalloced, old_size;
1422 int rnd;
1423 struct list_head *hash, *old_hash;
1424 struct ip_conntrack_tuple_hash *h;
1426 /* On boot, we can set this without any fancy locking. */
1427 if (!ip_conntrack_htable_size)
1428 return param_set_int(val, kp);
1430 hashsize = simple_strtol(val, NULL, 0);
1431 if (!hashsize)
1432 return -EINVAL;
1434 hash = alloc_hashtable(hashsize, &vmalloced);
1435 if (!hash)
1436 return -ENOMEM;
1438 	/* We have to rehash for the new table anyway, so we can also
1439 	 * use a new random seed. */
1440 get_random_bytes(&rnd, 4);
1442 write_lock_bh(&ip_conntrack_lock);
1443 for (i = 0; i < ip_conntrack_htable_size; i++) {
1444 while (!list_empty(&ip_conntrack_hash[i])) {
1445 h = list_entry(ip_conntrack_hash[i].next,
1446 struct ip_conntrack_tuple_hash, list);
1447 list_del(&h->list);
1448 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1449 list_add_tail(&h->list, &hash[bucket]);
1452 old_size = ip_conntrack_htable_size;
1453 old_vmalloced = ip_conntrack_vmalloc;
1454 old_hash = ip_conntrack_hash;
1456 ip_conntrack_htable_size = hashsize;
1457 ip_conntrack_vmalloc = vmalloced;
1458 ip_conntrack_hash = hash;
1459 ip_conntrack_hash_rnd = rnd;
1460 write_unlock_bh(&ip_conntrack_lock);
1462 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1463 return 0;
1466 module_param_call(hashsize, set_hashsize, param_get_uint,
1467 &ip_conntrack_htable_size, 0600);
1469 int __init ip_conntrack_init(void)
1471 unsigned int i;
1472 int ret;
1474 	/* Idea from tcp.c: use 1/16384 of memory. On i386 a 32MB
1475 	 * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
1476 if (!ip_conntrack_htable_size) {
1477 ip_conntrack_htable_size
1478 = (((num_physpages << PAGE_SHIFT) / 16384)
1479 / sizeof(struct list_head));
1480 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1481 ip_conntrack_htable_size = 8192;
1482 if (ip_conntrack_htable_size < 16)
1483 ip_conntrack_htable_size = 16;
1485 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1487 printk("ip_conntrack version %s (%u buckets, %d max)"
1488 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1489 ip_conntrack_htable_size, ip_conntrack_max,
1490 sizeof(struct ip_conntrack));
1492 ret = nf_register_sockopt(&so_getorigdst);
1493 if (ret != 0) {
1494 printk(KERN_ERR "Unable to register netfilter socket option\n");
1495 return ret;
1498 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1499 &ip_conntrack_vmalloc);
1500 if (!ip_conntrack_hash) {
1501 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1502 goto err_unreg_sockopt;
1505 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1506 sizeof(struct ip_conntrack), 0,
1507 0, NULL, NULL);
1508 if (!ip_conntrack_cachep) {
1509 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1510 goto err_free_hash;
1513 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1514 sizeof(struct ip_conntrack_expect),
1515 0, 0, NULL, NULL);
1516 if (!ip_conntrack_expect_cachep) {
1517 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1518 goto err_free_conntrack_slab;
1521 /* Don't NEED lock here, but good form anyway. */
1522 write_lock_bh(&ip_conntrack_lock);
1523 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1524 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1525 /* Sew in builtin protocols. */
1526 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1527 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1528 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1529 write_unlock_bh(&ip_conntrack_lock);
1531 /* For use by ipt_REJECT */
1532 ip_ct_attach = ip_conntrack_attach;
1534 /* Set up fake conntrack:
1535 - to never be deleted, not in any hashes */
1536 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1537 	/* - and make it look like a confirmed connection */
1538 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1540 return ret;
1542 err_free_conntrack_slab:
1543 kmem_cache_destroy(ip_conntrack_cachep);
1544 err_free_hash:
1545 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1546 ip_conntrack_htable_size);
1547 err_unreg_sockopt:
1548 nf_unregister_sockopt(&so_getorigdst);
1550 return -ENOMEM;