[NETFILTER]: conntrack annotations
net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expectation
42    registrations, and conntrack timers. */
43 #define ASSERT_READ_LOCK(x)
44 #define ASSERT_WRITE_LOCK(x)
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #define IP_CONNTRACK_VERSION "2.4"
53 #if 0
54 #define DEBUGP printk
55 #else
56 #define DEBUGP(format, args...)
57 #endif
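/* Note: flipping the "#if 0" above to "#if 1" turns DEBUGP() into printk(),
 * which enables the verbose debugging output used throughout this file. */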
59 DEFINE_RWLOCK(ip_conntrack_lock);
61 /* ip_conntrack_standalone needs this */
62 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
64 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
65 LIST_HEAD(ip_conntrack_expect_list);
66 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
67 static LIST_HEAD(helpers);
68 unsigned int ip_conntrack_htable_size __read_mostly = 0;
69 int ip_conntrack_max __read_mostly;
70 struct list_head *ip_conntrack_hash __read_mostly;
71 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
72 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
73 struct ip_conntrack ip_conntrack_untracked;
74 unsigned int ip_ct_log_invalid __read_mostly;
75 static LIST_HEAD(unconfirmed);
76 static int ip_conntrack_vmalloc __read_mostly;
78 static unsigned int ip_conntrack_next_id;
79 static unsigned int ip_conntrack_expect_next_id;
80 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
81 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
82 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
84 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
86 /* deliver cached events and clear cache entry - must be called with locally
87 * disabled softirqs */
88 static inline void
89 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
91 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
92 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
93 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
94 ecache->ct);
95 ecache->events = 0;
96 ip_conntrack_put(ecache->ct);
97 ecache->ct = NULL;
100 /* Deliver all cached events for a particular conntrack. This is called
101 * by code prior to async packet handling or freeing the skb */
102 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
104 struct ip_conntrack_ecache *ecache;
106 local_bh_disable();
107 ecache = &__get_cpu_var(ip_conntrack_ecache);
108 if (ecache->ct == ct)
109 __ip_ct_deliver_cached_events(ecache);
110 local_bh_enable();
113 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
115 struct ip_conntrack_ecache *ecache;
117 /* take care of delivering potentially old events */
118 ecache = &__get_cpu_var(ip_conntrack_ecache);
119 BUG_ON(ecache->ct == ct);
120 if (ecache->ct)
121 __ip_ct_deliver_cached_events(ecache);
122 /* initialize for this conntrack/packet */
123 ecache->ct = ct;
124 nf_conntrack_get(&ct->ct_general);
127 /* flush the event cache - touches other CPUs' data and must not be called while
128 * packets are still passing through the code */
129 static void ip_ct_event_cache_flush(void)
131 struct ip_conntrack_ecache *ecache;
132 int cpu;
134 for_each_possible_cpu(cpu) {
135 ecache = &per_cpu(ip_conntrack_ecache, cpu);
136 if (ecache->ct)
137 ip_conntrack_put(ecache->ct);
140 #else
141 static inline void ip_ct_event_cache_flush(void) {}
142 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
144 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
146 static int ip_conntrack_hash_rnd_initted;
147 static unsigned int ip_conntrack_hash_rnd;
149 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
150 unsigned int size, unsigned int rnd)
152 return (jhash_3words((__force u32)tuple->src.ip,
153 ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
154 (tuple->src.u.all | (tuple->dst.u.all << 16)),
155 rnd) % size);
158 static u_int32_t
159 hash_conntrack(const struct ip_conntrack_tuple *tuple)
161 return __hash_conntrack(tuple, ip_conntrack_htable_size,
162 ip_conntrack_hash_rnd);
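/* Informal restatement of the hash above: jhash_3words() mixes the source
 * address, the destination address xor'd with the protocol number, and both
 * 16-bit port/id fields packed into one 32-bit word, salted with
 * ip_conntrack_hash_rnd (picked lazily on the first allocation, see
 * ip_conntrack_alloc() below) and reduced modulo the table size.  The random
 * salt makes bucket placement unpredictable to a remote attacker trying to
 * force worst-case chain lengths. */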
165 static int
166 ip_ct_get_tuple(const struct iphdr *iph,
167 const struct sk_buff *skb,
168 unsigned int dataoff,
169 struct ip_conntrack_tuple *tuple,
170 const struct ip_conntrack_protocol *protocol)
172 /* Should never happen */
173 if (iph->frag_off & htons(IP_OFFSET)) {
174 printk("ip_conntrack_core: Frag of proto %u.\n",
175 iph->protocol);
176 return 0;
179 tuple->src.ip = iph->saddr;
180 tuple->dst.ip = iph->daddr;
181 tuple->dst.protonum = iph->protocol;
182 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
184 return protocol->pkt_to_tuple(skb, dataoff, tuple);
187 static int
188 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
189 const struct ip_conntrack_tuple *orig,
190 const struct ip_conntrack_protocol *protocol)
192 inverse->src.ip = orig->dst.ip;
193 inverse->dst.ip = orig->src.ip;
194 inverse->dst.protonum = orig->dst.protonum;
195 inverse->dst.dir = !orig->dst.dir;
197 return protocol->invert_tuple(inverse, orig);
201 /* ip_conntrack_expect helper functions */
202 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
204 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
205 IP_NF_ASSERT(!timer_pending(&exp->timeout));
206 list_del(&exp->list);
207 CONNTRACK_STAT_INC(expect_delete);
208 exp->master->expecting--;
209 ip_conntrack_expect_put(exp);
212 static void expectation_timed_out(unsigned long ul_expect)
214 struct ip_conntrack_expect *exp = (void *)ul_expect;
216 write_lock_bh(&ip_conntrack_lock);
217 ip_ct_unlink_expect(exp);
218 write_unlock_bh(&ip_conntrack_lock);
219 ip_conntrack_expect_put(exp);
222 struct ip_conntrack_expect *
223 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
225 struct ip_conntrack_expect *i;
227 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
228 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
229 atomic_inc(&i->use);
230 return i;
233 return NULL;
236 /* Just find an expectation corresponding to a tuple. */
237 struct ip_conntrack_expect *
238 ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
240 struct ip_conntrack_expect *i;
242 read_lock_bh(&ip_conntrack_lock);
243 i = __ip_conntrack_expect_find(tuple);
244 read_unlock_bh(&ip_conntrack_lock);
246 return i;
249 /* If an expectation for this connection is found, it is deleted from the
250  * global list and then returned. */
251 static struct ip_conntrack_expect *
252 find_expectation(const struct ip_conntrack_tuple *tuple)
254 struct ip_conntrack_expect *i;
256 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
257 /* If master is not in hash table yet (ie. packet hasn't left
258 this machine yet), how can other end know about expected?
259 Hence these are not the droids you are looking for (if
260 master ct never got confirmed, we'd hold a reference to it
261 and weird things would happen to future packets). */
262 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
263 && is_confirmed(i->master)) {
264 if (i->flags & IP_CT_EXPECT_PERMANENT) {
265 atomic_inc(&i->use);
266 return i;
267 } else if (del_timer(&i->timeout)) {
268 ip_ct_unlink_expect(i);
269 return i;
273 return NULL;
276 /* delete all expectations for this conntrack */
277 void ip_ct_remove_expectations(struct ip_conntrack *ct)
279 struct ip_conntrack_expect *i, *tmp;
281 /* Optimization: most connections never expect any others. */
282 if (ct->expecting == 0)
283 return;
285 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
286 if (i->master == ct && del_timer(&i->timeout)) {
287 ip_ct_unlink_expect(i);
288 ip_conntrack_expect_put(i);
293 static void
294 clean_from_lists(struct ip_conntrack *ct)
296 DEBUGP("clean_from_lists(%p)\n", ct);
297 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
298 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
299 list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
301 /* Destroy all pending expectations */
302 ip_ct_remove_expectations(ct);
305 static void
306 destroy_conntrack(struct nf_conntrack *nfct)
308 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
309 struct ip_conntrack_protocol *proto;
310 struct ip_conntrack_helper *helper;
312 DEBUGP("destroy_conntrack(%p)\n", ct);
313 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
314 IP_NF_ASSERT(!timer_pending(&ct->timeout));
316 ip_conntrack_event(IPCT_DESTROY, ct);
317 set_bit(IPS_DYING_BIT, &ct->status);
319 helper = ct->helper;
320 if (helper && helper->destroy)
321 helper->destroy(ct);
323 /* To make sure we don't get any weird locking issues here:
324 * destroy_conntrack() MUST NOT be called with a write lock
325 * to ip_conntrack_lock!!! -HW */
326 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
327 if (proto && proto->destroy)
328 proto->destroy(ct);
330 if (ip_conntrack_destroyed)
331 ip_conntrack_destroyed(ct);
333 write_lock_bh(&ip_conntrack_lock);
334 /* Expectations will have been removed in clean_from_lists,
335 * except TFTP can create an expectation on the first packet,
336 * before connection is in the list, so we need to clean here,
337 * too. */
338 ip_ct_remove_expectations(ct);
340 /* We overload the first tuple to link into the unconfirmed list. */
341 if (!is_confirmed(ct)) {
342 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
343 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
346 CONNTRACK_STAT_INC(delete);
347 write_unlock_bh(&ip_conntrack_lock);
349 if (ct->master)
350 ip_conntrack_put(ct->master);
352 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
353 ip_conntrack_free(ct);
356 static void death_by_timeout(unsigned long ul_conntrack)
358 struct ip_conntrack *ct = (void *)ul_conntrack;
360 write_lock_bh(&ip_conntrack_lock);
361 /* Inside lock so preempt is disabled on module removal path.
362 * Otherwise we can get spurious warnings. */
363 CONNTRACK_STAT_INC(delete_list);
364 clean_from_lists(ct);
365 write_unlock_bh(&ip_conntrack_lock);
366 ip_conntrack_put(ct);
369 struct ip_conntrack_tuple_hash *
370 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
371 const struct ip_conntrack *ignored_conntrack)
373 struct ip_conntrack_tuple_hash *h;
374 unsigned int hash = hash_conntrack(tuple);
376 ASSERT_READ_LOCK(&ip_conntrack_lock);
377 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
378 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
379 ip_ct_tuple_equal(tuple, &h->tuple)) {
380 CONNTRACK_STAT_INC(found);
381 return h;
383 CONNTRACK_STAT_INC(searched);
386 return NULL;
389 /* Find a connection corresponding to a tuple. */
390 struct ip_conntrack_tuple_hash *
391 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
392 const struct ip_conntrack *ignored_conntrack)
394 struct ip_conntrack_tuple_hash *h;
396 read_lock_bh(&ip_conntrack_lock);
397 h = __ip_conntrack_find(tuple, ignored_conntrack);
398 if (h)
399 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
400 read_unlock_bh(&ip_conntrack_lock);
402 return h;
405 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
406 unsigned int hash,
407 unsigned int repl_hash)
409 ct->id = ++ip_conntrack_next_id;
410 list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
411 &ip_conntrack_hash[hash]);
412 list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
413 &ip_conntrack_hash[repl_hash]);
416 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
418 unsigned int hash, repl_hash;
420 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
421 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
423 write_lock_bh(&ip_conntrack_lock);
424 __ip_conntrack_hash_insert(ct, hash, repl_hash);
425 write_unlock_bh(&ip_conntrack_lock);
428 /* Confirm a connection given skb; places it in hash table */
429 int
430 __ip_conntrack_confirm(struct sk_buff **pskb)
432 unsigned int hash, repl_hash;
433 struct ip_conntrack_tuple_hash *h;
434 struct ip_conntrack *ct;
435 enum ip_conntrack_info ctinfo;
437 ct = ip_conntrack_get(*pskb, &ctinfo);
439 /* ipt_REJECT uses ip_conntrack_attach to attach related
440 ICMP/TCP RST packets in other direction. Actual packet
441 which created connection will be IP_CT_NEW or for an
442 expected connection, IP_CT_RELATED. */
443 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
444 return NF_ACCEPT;
446 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
447 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
449 /* We're not in hash table, and we refuse to set up related
450 connections for unconfirmed conns. But packet copies and
451 REJECT will give spurious warnings here. */
452 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
454 /* No external references means no one else could have
455 confirmed us. */
456 IP_NF_ASSERT(!is_confirmed(ct));
457 DEBUGP("Confirming conntrack %p\n", ct);
459 write_lock_bh(&ip_conntrack_lock);
461 /* See if there's one in the list already, including reverse:
462 NAT could have grabbed it without realizing, since we're
463 not in the hash. If there is, we lost the race. */
464 list_for_each_entry(h, &ip_conntrack_hash[hash], list)
465 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
466 &h->tuple))
467 goto out;
468 list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
469 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
470 &h->tuple))
471 goto out;
473 /* Remove from unconfirmed list */
474 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
476 __ip_conntrack_hash_insert(ct, hash, repl_hash);
477 /* Timer relative to confirmation time, not original
478 setting time, otherwise we'd get timer wrap in
479 weird delay cases. */
480 ct->timeout.expires += jiffies;
481 add_timer(&ct->timeout);
482 atomic_inc(&ct->ct_general.use);
483 set_bit(IPS_CONFIRMED_BIT, &ct->status);
484 CONNTRACK_STAT_INC(insert);
485 write_unlock_bh(&ip_conntrack_lock);
486 if (ct->helper)
487 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
488 #ifdef CONFIG_IP_NF_NAT_NEEDED
489 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
490 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
491 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
492 #endif
493 ip_conntrack_event_cache(master_ct(ct) ?
494 IPCT_RELATED : IPCT_NEW, *pskb);
496 return NF_ACCEPT;
498 out:
499 CONNTRACK_STAT_INC(insert_failed);
500 write_unlock_bh(&ip_conntrack_lock);
501 return NF_DROP;
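/* Informal summary of confirmation: a conntrack created by init_conntrack()
 * lives only on the "unconfirmed" list until its first packet has made it
 * through the hooks.  __ip_conntrack_confirm() then moves it into the real
 * hash table, arms its timeout relative to now and takes an extra reference
 * on behalf of the table.  If an identical tuple raced us into the hash
 * (e.g. via NAT), the packet is dropped instead. */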
504 /* Returns true if a connection corresponds to the tuple (required
505    for NAT). */
506 int
507 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
508 const struct ip_conntrack *ignored_conntrack)
510 struct ip_conntrack_tuple_hash *h;
512 read_lock_bh(&ip_conntrack_lock);
513 h = __ip_conntrack_find(tuple, ignored_conntrack);
514 read_unlock_bh(&ip_conntrack_lock);
516 return h != NULL;
519 /* There's a small race here where we may free a just-assured
520 connection. Too bad: we're in trouble anyway. */
521 static int early_drop(struct list_head *chain)
523 /* Traverse backwards: gives us oldest, which is roughly LRU */
524 struct ip_conntrack_tuple_hash *h;
525 struct ip_conntrack *ct = NULL, *tmp;
526 int dropped = 0;
528 read_lock_bh(&ip_conntrack_lock);
529 list_for_each_entry_reverse(h, chain, list) {
530 tmp = tuplehash_to_ctrack(h);
531 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
532 ct = tmp;
533 atomic_inc(&ct->ct_general.use);
534 break;
537 read_unlock_bh(&ip_conntrack_lock);
539 if (!ct)
540 return dropped;
542 if (del_timer(&ct->timeout)) {
543 death_by_timeout((unsigned long)ct);
544 dropped = 1;
545 CONNTRACK_STAT_INC(early_drop);
547 ip_conntrack_put(ct);
548 return dropped;
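/* Note: early_drop() is only called from ip_conntrack_alloc() below, when
 * ip_conntrack_count would exceed ip_conntrack_max, and it only sacrifices
 * entries that never reached ASSURED state.  ip_conntrack_max defaults to
 * 8 * ip_conntrack_htable_size (set in ip_conntrack_init()) and is normally
 * tunable at runtime through the standalone sysctl/proc code. */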
551 static struct ip_conntrack_helper *
552 __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
554 struct ip_conntrack_helper *h;
556 list_for_each_entry(h, &helpers, list) {
557 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
558 return h;
560 return NULL;
563 struct ip_conntrack_helper *
564 ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
566 struct ip_conntrack_helper *helper;
568 /* need ip_conntrack_lock to ensure that the helper exists until
569 * try_module_get() is called */
570 read_lock_bh(&ip_conntrack_lock);
572 helper = __ip_conntrack_helper_find(tuple);
573 if (helper) {
574 /* need to increase module usage count to assure helper will
575 * not go away while the caller is e.g. busy putting a
576 * conntrack in the hash that uses the helper */
577 if (!try_module_get(helper->me))
578 helper = NULL;
581 read_unlock_bh(&ip_conntrack_lock);
583 return helper;
586 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
588 module_put(helper->me);
591 struct ip_conntrack_protocol *
592 __ip_conntrack_proto_find(u_int8_t protocol)
594 return ip_ct_protos[protocol];
597 /* this is guaranteed to always return a valid protocol helper, since
598 * it falls back to generic_protocol */
599 struct ip_conntrack_protocol *
600 ip_conntrack_proto_find_get(u_int8_t protocol)
602 struct ip_conntrack_protocol *p;
604 preempt_disable();
605 p = __ip_conntrack_proto_find(protocol);
606 if (p) {
607 if (!try_module_get(p->me))
608 p = &ip_conntrack_generic_protocol;
610 preempt_enable();
612 return p;
615 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
617 module_put(p->me);
620 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
621 struct ip_conntrack_tuple *repl)
623 struct ip_conntrack *conntrack;
625 if (!ip_conntrack_hash_rnd_initted) {
626 get_random_bytes(&ip_conntrack_hash_rnd, 4);
627 ip_conntrack_hash_rnd_initted = 1;
630 /* We don't want any race condition at early drop stage */
631 atomic_inc(&ip_conntrack_count);
633 if (ip_conntrack_max
634 && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
635 unsigned int hash = hash_conntrack(orig);
636 /* Try dropping from this hash chain. */
637 if (!early_drop(&ip_conntrack_hash[hash])) {
638 atomic_dec(&ip_conntrack_count);
639 if (net_ratelimit())
640 printk(KERN_WARNING
641 "ip_conntrack: table full, dropping"
642 " packet.\n");
643 return ERR_PTR(-ENOMEM);
647 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
648 if (!conntrack) {
649 DEBUGP("Can't allocate conntrack.\n");
650 atomic_dec(&ip_conntrack_count);
651 return ERR_PTR(-ENOMEM);
654 memset(conntrack, 0, sizeof(*conntrack));
655 atomic_set(&conntrack->ct_general.use, 1);
656 conntrack->ct_general.destroy = destroy_conntrack;
657 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
658 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
659 /* Don't set timer yet: wait for confirmation */
660 init_timer(&conntrack->timeout);
661 conntrack->timeout.data = (unsigned long)conntrack;
662 conntrack->timeout.function = death_by_timeout;
664 return conntrack;
667 void
668 ip_conntrack_free(struct ip_conntrack *conntrack)
670 atomic_dec(&ip_conntrack_count);
671 kmem_cache_free(ip_conntrack_cachep, conntrack);
674 /* Allocate a new conntrack: we return -ENOMEM if classification
675 * failed due to stress. Otherwise it really is unclassifiable */
676 static struct ip_conntrack_tuple_hash *
677 init_conntrack(struct ip_conntrack_tuple *tuple,
678 struct ip_conntrack_protocol *protocol,
679 struct sk_buff *skb)
681 struct ip_conntrack *conntrack;
682 struct ip_conntrack_tuple repl_tuple;
683 struct ip_conntrack_expect *exp;
685 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
686 DEBUGP("Can't invert tuple.\n");
687 return NULL;
690 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
691 if (conntrack == NULL || IS_ERR(conntrack))
692 return (struct ip_conntrack_tuple_hash *)conntrack;
694 if (!protocol->new(conntrack, skb)) {
695 ip_conntrack_free(conntrack);
696 return NULL;
699 write_lock_bh(&ip_conntrack_lock);
700 exp = find_expectation(tuple);
702 if (exp) {
703 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
704 conntrack, exp);
705 /* Welcome, Mr. Bond. We've been expecting you... */
706 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
707 conntrack->master = exp->master;
708 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
709 conntrack->mark = exp->master->mark;
710 #endif
711 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
712 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
713 /* this is ugly, but there is no other place where to put it */
714 conntrack->nat.masq_index = exp->master->nat.masq_index;
715 #endif
716 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
717 conntrack->secmark = exp->master->secmark;
718 #endif
719 nf_conntrack_get(&conntrack->master->ct_general);
720 CONNTRACK_STAT_INC(expect_new);
721 } else {
722 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
724 CONNTRACK_STAT_INC(new);
727 /* Overload the tuple linked list to put us on the unconfirmed list. */
728 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
730 write_unlock_bh(&ip_conntrack_lock);
732 if (exp) {
733 if (exp->expectfn)
734 exp->expectfn(conntrack, exp);
735 ip_conntrack_expect_put(exp);
738 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
741 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
742 static inline struct ip_conntrack *
743 resolve_normal_ct(struct sk_buff *skb,
744 struct ip_conntrack_protocol *proto,
745 int *set_reply,
746 unsigned int hooknum,
747 enum ip_conntrack_info *ctinfo)
749 struct ip_conntrack_tuple tuple;
750 struct ip_conntrack_tuple_hash *h;
751 struct ip_conntrack *ct;
753 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
755 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
756 &tuple,proto))
757 return NULL;
759 /* look for tuple match */
760 h = ip_conntrack_find_get(&tuple, NULL);
761 if (!h) {
762 h = init_conntrack(&tuple, proto, skb);
763 if (!h)
764 return NULL;
765 if (IS_ERR(h))
766 return (void *)h;
768 ct = tuplehash_to_ctrack(h);
770 /* It exists; we have a (non-exclusive) reference. */
771 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
772 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
773 /* Please set the reply bit if this packet is OK */
774 *set_reply = 1;
775 } else {
776 /* Once we've had two way comms, always ESTABLISHED. */
777 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
778 DEBUGP("ip_conntrack_in: normal packet for %p\n",
779 ct);
780 *ctinfo = IP_CT_ESTABLISHED;
781 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
782 DEBUGP("ip_conntrack_in: related packet for %p\n",
783 ct);
784 *ctinfo = IP_CT_RELATED;
785 } else {
786 DEBUGP("ip_conntrack_in: new packet for %p\n",
787 ct);
788 *ctinfo = IP_CT_NEW;
790 *set_reply = 0;
792 skb->nfct = &ct->ct_general;
793 skb->nfctinfo = *ctinfo;
794 return ct;
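/* Summary of the ctinfo values handed out above:
 *   IP_CT_ESTABLISHED + IP_CT_IS_REPLY - packet matched the reply tuple
 *   IP_CT_ESTABLISHED                  - original direction, reply already seen
 *   IP_CT_RELATED                      - original direction, expected conntrack
 *   IP_CT_NEW                          - original direction, nothing seen yet
 */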
797 /* Netfilter hook itself. */
798 unsigned int ip_conntrack_in(unsigned int hooknum,
799 struct sk_buff **pskb,
800 const struct net_device *in,
801 const struct net_device *out,
802 int (*okfn)(struct sk_buff *))
804 struct ip_conntrack *ct;
805 enum ip_conntrack_info ctinfo;
806 struct ip_conntrack_protocol *proto;
807 int set_reply = 0;
808 int ret;
810 /* Previously seen (loopback or untracked)? Ignore. */
811 if ((*pskb)->nfct) {
812 CONNTRACK_STAT_INC(ignore);
813 return NF_ACCEPT;
816 /* Should never happen */
817 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
818 if (net_ratelimit()) {
819 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
820 (*pskb)->nh.iph->protocol, hooknum);
822 return NF_DROP;
825 /* Doesn't cover locally-generated broadcast, so not worth it. */
826 #if 0
827 /* Ignore broadcast: no `connection'. */
828 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
829 printk("Broadcast packet!\n");
830 return NF_ACCEPT;
831 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
832 == htonl(0x000000FF)) {
833 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
834 NIPQUAD((*pskb)->nh.iph->saddr),
835 NIPQUAD((*pskb)->nh.iph->daddr),
836 (*pskb)->sk, (*pskb)->pkt_type);
838 #endif
840 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
842 /* It may be a special packet: error, unclean...
843  * the inverse of the return code tells the netfilter
844  * core what to do with the packet. */
845 if (proto->error != NULL
846 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
847 CONNTRACK_STAT_INC(error);
848 CONNTRACK_STAT_INC(invalid);
849 return -ret;
852 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
853 /* Not valid part of a connection */
854 CONNTRACK_STAT_INC(invalid);
855 return NF_ACCEPT;
858 if (IS_ERR(ct)) {
859 /* Too stressed to deal. */
860 CONNTRACK_STAT_INC(drop);
861 return NF_DROP;
864 IP_NF_ASSERT((*pskb)->nfct);
866 ret = proto->packet(ct, *pskb, ctinfo);
867 if (ret < 0) {
868 /* Invalid: inverse of the return code tells
869 * the netfilter core what to do */
870 nf_conntrack_put((*pskb)->nfct);
871 (*pskb)->nfct = NULL;
872 CONNTRACK_STAT_INC(invalid);
873 return -ret;
876 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
877 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
879 return ret;
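/* ip_conntrack_in() is not registered as a hook here; the actual hook
 * registrations (tracking on PRE_ROUTING/LOCAL_OUT, confirmation on
 * POST_ROUTING/LOCAL_IN) live outside this file, in
 * ip_conntrack_standalone.c. */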
882 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
883 const struct ip_conntrack_tuple *orig)
885 return ip_ct_invert_tuple(inverse, orig,
886 __ip_conntrack_proto_find(orig->dst.protonum));
889 /* Would two expected things clash? */
890 static inline int expect_clash(const struct ip_conntrack_expect *a,
891 const struct ip_conntrack_expect *b)
893 /* Part covered by intersection of masks must be unequal,
894 otherwise they clash */
895 struct ip_conntrack_tuple intersect_mask
896 = { { a->mask.src.ip & b->mask.src.ip,
897 { a->mask.src.u.all & b->mask.src.u.all } },
898 { a->mask.dst.ip & b->mask.dst.ip,
899 { a->mask.dst.u.all & b->mask.dst.u.all },
900 a->mask.dst.protonum & b->mask.dst.protonum } };
902 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
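/* Hypothetical example of the clash rule above (values invented): two
 * expectations whose masks both cover only dst.ip and the dst port clash
 * iff those covered fields are equal - e.g. both expecting a data connection
 * to 10.0.0.1:20000 - since the intersection of the masks still covers
 * dst.ip and the dst port, and the tuples agree there. */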
905 static inline int expect_matches(const struct ip_conntrack_expect *a,
906 const struct ip_conntrack_expect *b)
908 return a->master == b->master
909 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
910 && ip_ct_tuple_equal(&a->mask, &b->mask);
913 /* Generally a bad idea to call this: could have matched already. */
914 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
916 struct ip_conntrack_expect *i;
918 write_lock_bh(&ip_conntrack_lock);
919 /* choose the oldest expectation to evict */
920 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
921 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
922 ip_ct_unlink_expect(i);
923 write_unlock_bh(&ip_conntrack_lock);
924 ip_conntrack_expect_put(i);
925 return;
928 write_unlock_bh(&ip_conntrack_lock);
931 /* We don't increase the master conntrack refcount for non-fulfilled
932  * expectations. During conntrack destruction, the expectations are
933  * always killed before the conntrack itself. */
934 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
936 struct ip_conntrack_expect *new;
938 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
939 if (!new) {
940 DEBUGP("expect_related: OOM allocating expect\n");
941 return NULL;
943 new->master = me;
944 atomic_set(&new->use, 1);
945 return new;
948 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
950 if (atomic_dec_and_test(&exp->use))
951 kmem_cache_free(ip_conntrack_expect_cachep, exp);
954 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
956 atomic_inc(&exp->use);
957 exp->master->expecting++;
958 list_add(&exp->list, &ip_conntrack_expect_list);
960 init_timer(&exp->timeout);
961 exp->timeout.data = (unsigned long)exp;
962 exp->timeout.function = expectation_timed_out;
963 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
964 add_timer(&exp->timeout);
966 exp->id = ++ip_conntrack_expect_next_id;
967 atomic_inc(&exp->use);
968 CONNTRACK_STAT_INC(expect_create);
971 /* Race with expectations being used means we could have none to find; OK. */
972 static void evict_oldest_expect(struct ip_conntrack *master)
974 struct ip_conntrack_expect *i;
976 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
977 if (i->master == master) {
978 if (del_timer(&i->timeout)) {
979 ip_ct_unlink_expect(i);
980 ip_conntrack_expect_put(i);
982 break;
987 static inline int refresh_timer(struct ip_conntrack_expect *i)
989 if (!del_timer(&i->timeout))
990 return 0;
992 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
993 add_timer(&i->timeout);
994 return 1;
997 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
999 struct ip_conntrack_expect *i;
1000 int ret;
1002 DEBUGP("ip_conntrack_expect_related %p\n", expect);
1003 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1004 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1006 write_lock_bh(&ip_conntrack_lock);
1007 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1008 if (expect_matches(i, expect)) {
1009 /* Refresh timer: if it's dying, ignore.. */
1010 if (refresh_timer(i)) {
1011 ret = 0;
1012 goto out;
1014 } else if (expect_clash(i, expect)) {
1015 ret = -EBUSY;
1016 goto out;
1020 /* Will be over limit? */
1021 if (expect->master->helper->max_expected &&
1022 expect->master->expecting >= expect->master->helper->max_expected)
1023 evict_oldest_expect(expect->master);
1025 ip_conntrack_expect_insert(expect);
1026 ip_conntrack_expect_event(IPEXP_NEW, expect);
1027 ret = 0;
1028 out:
1029 write_unlock_bh(&ip_conntrack_lock);
1030 return ret;
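/* Sketch of how a helper typically uses the expectation API above; this is
 * illustrative only (variable names invented, error handling trimmed), not
 * code taken from this file:
 *
 *	struct ip_conntrack_expect *exp;
 *
 *	exp = ip_conntrack_expect_alloc(ct);
 *	if (exp == NULL)
 *		return NF_DROP;
 *	exp->tuple = ...;	/- reply-direction tuple we expect to see -/
 *	exp->mask = ...;	/- which fields of it must match exactly -/
 *	exp->flags = 0;
 *	exp->expectfn = NULL;
 *	if (ip_conntrack_expect_related(exp) != 0)
 *		ret = NF_DROP;
 *	ip_conntrack_expect_put(exp);
 */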
1033 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1034 implicitly racy: see __ip_conntrack_confirm */
1035 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1036 const struct ip_conntrack_tuple *newreply)
1038 write_lock_bh(&ip_conntrack_lock);
1039 /* Should be unconfirmed, so not in hash table yet */
1040 IP_NF_ASSERT(!is_confirmed(conntrack));
1042 DEBUGP("Altering reply tuple of %p to ", conntrack);
1043 DUMP_TUPLE(newreply);
1045 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1046 if (!conntrack->master && conntrack->expecting == 0)
1047 conntrack->helper = __ip_conntrack_helper_find(newreply);
1048 write_unlock_bh(&ip_conntrack_lock);
1051 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1053 BUG_ON(me->timeout == 0);
1054 write_lock_bh(&ip_conntrack_lock);
1055 list_add(&me->list, &helpers);
1056 write_unlock_bh(&ip_conntrack_lock);
1058 return 0;
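/* For reference, a protocol helper module would typically fill in a struct
 * ip_conntrack_helper and register it roughly like this; the field values
 * below are made-up examples, not taken from this file:
 *
 *	static struct ip_conntrack_helper example_helper = {
 *		.name		= "example",
 *		.me		= THIS_MODULE,
 *		.max_expected	= 1,
 *		.timeout	= 5 * 60,
 *		.tuple		= { .src = { .u = { .tcp = { .port =
 *					__constant_htons(12345) } } },
 *				    .dst = { .protonum = IPPROTO_TCP } },
 *		.mask		= { .src = { .u = { .all = 0xFFFF } },
 *				    .dst = { .protonum = 0xFF } },
 *		.help		= example_help,
 *	};
 *
 *	ip_conntrack_helper_register(&example_helper);
 *	...
 *	ip_conntrack_helper_unregister(&example_helper);
 */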
1061 struct ip_conntrack_helper *
1062 __ip_conntrack_helper_find_byname(const char *name)
1064 struct ip_conntrack_helper *h;
1066 list_for_each_entry(h, &helpers, list) {
1067 if (!strcmp(h->name, name))
1068 return h;
1071 return NULL;
1074 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1075 const struct ip_conntrack_helper *me)
1077 if (tuplehash_to_ctrack(i)->helper == me) {
1078 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1079 tuplehash_to_ctrack(i)->helper = NULL;
1083 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1085 unsigned int i;
1086 struct ip_conntrack_tuple_hash *h;
1087 struct ip_conntrack_expect *exp, *tmp;
1089 /* Need write lock here, to delete helper. */
1090 write_lock_bh(&ip_conntrack_lock);
1091 list_del(&me->list);
1093 /* Get rid of expectations */
1094 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1095 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1096 ip_ct_unlink_expect(exp);
1097 ip_conntrack_expect_put(exp);
1100 /* Get rid of expecteds, set helpers to NULL. */
1101 list_for_each_entry(h, &unconfirmed, list)
1102 unhelp(h, me);
1103 for (i = 0; i < ip_conntrack_htable_size; i++) {
1104 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1105 unhelp(h, me);
1107 write_unlock_bh(&ip_conntrack_lock);
1109 /* Someone could still be looking at the helper in a bh. */
1110 synchronize_net();
1113 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1114 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1115 enum ip_conntrack_info ctinfo,
1116 const struct sk_buff *skb,
1117 unsigned long extra_jiffies,
1118 int do_acct)
1120 int event = 0;
1122 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1123 IP_NF_ASSERT(skb);
1125 write_lock_bh(&ip_conntrack_lock);
1127 /* Only update if this is not a fixed timeout */
1128 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1129 write_unlock_bh(&ip_conntrack_lock);
1130 return;
1133 /* If not in hash table, timer will not be active yet */
1134 if (!is_confirmed(ct)) {
1135 ct->timeout.expires = extra_jiffies;
1136 event = IPCT_REFRESH;
1137 } else {
1138 /* Need del_timer for race avoidance (may already be dying). */
1139 if (del_timer(&ct->timeout)) {
1140 ct->timeout.expires = jiffies + extra_jiffies;
1141 add_timer(&ct->timeout);
1142 event = IPCT_REFRESH;
1146 #ifdef CONFIG_IP_NF_CT_ACCT
1147 if (do_acct) {
1148 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1149 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1150 ntohs(skb->nh.iph->tot_len);
1151 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1152 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1153 event |= IPCT_COUNTER_FILLING;
1155 #endif
1157 write_unlock_bh(&ip_conntrack_lock);
1159 /* must be unlocked when calling event cache */
1160 if (event)
1161 ip_conntrack_event_cache(event, skb);
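/* The 0x80000000 test above fires an IPCT_COUNTER_FILLING event once the
 * top bit of either per-direction counter is set, presumably so listeners
 * (ctnetlink) get a chance to read the counters before they wrap. */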
1164 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1165 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1166 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1167 * in ip_conntrack_core, since we don't want the protocols to autoload
1168 * or depend on ctnetlink */
1169 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1170 const struct ip_conntrack_tuple *tuple)
1172 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1173 &tuple->src.u.tcp.port);
1174 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1175 &tuple->dst.u.tcp.port);
1176 return 0;
1178 nfattr_failure:
1179 return -1;
1182 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1183 struct ip_conntrack_tuple *t)
1185 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1186 return -EINVAL;
1188 t->src.u.tcp.port =
1189 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1190 t->dst.u.tcp.port =
1191 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1193 return 0;
1195 #endif
1197 /* Returns new sk_buff, or NULL */
1198 struct sk_buff *
1199 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1201 skb_orphan(skb);
1203 local_bh_disable();
1204 skb = ip_defrag(skb, user);
1205 local_bh_enable();
1207 if (skb)
1208 ip_send_check(skb->nh.iph);
1209 return skb;
1212 /* Used by ipt_REJECT. */
1213 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1215 struct ip_conntrack *ct;
1216 enum ip_conntrack_info ctinfo;
1218 /* This ICMP is in reverse direction to the packet which caused it */
1219 ct = ip_conntrack_get(skb, &ctinfo);
1221 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1222 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1223 else
1224 ctinfo = IP_CT_RELATED;
1226 /* Attach to new skbuff, and increment count */
1227 nskb->nfct = &ct->ct_general;
1228 nskb->nfctinfo = ctinfo;
1229 nf_conntrack_get(nskb->nfct);
1232 /* Bring out ya dead! */
1233 static struct ip_conntrack *
1234 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1235 void *data, unsigned int *bucket)
1237 struct ip_conntrack_tuple_hash *h;
1238 struct ip_conntrack *ct;
1240 write_lock_bh(&ip_conntrack_lock);
1241 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1242 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1243 ct = tuplehash_to_ctrack(h);
1244 if (iter(ct, data))
1245 goto found;
1248 list_for_each_entry(h, &unconfirmed, list) {
1249 ct = tuplehash_to_ctrack(h);
1250 if (iter(ct, data))
1251 goto found;
1253 write_unlock_bh(&ip_conntrack_lock);
1254 return NULL;
1256 found:
1257 atomic_inc(&ct->ct_general.use);
1258 write_unlock_bh(&ip_conntrack_lock);
1259 return ct;
1262 void
1263 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1265 struct ip_conntrack *ct;
1266 unsigned int bucket = 0;
1268 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1269 /* Time to push up daisies... */
1270 if (del_timer(&ct->timeout))
1271 death_by_timeout((unsigned long)ct);
1272 /* ... else the timer will get him soon. */
1274 ip_conntrack_put(ct);
1278 /* Fast function for those who don't want to parse /proc (and I don't
1279 blame them). */
1280 /* Reversing the socket's dst/src point of view gives us the reply
1281 mapping. */
1282 static int
1283 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1285 struct inet_sock *inet = inet_sk(sk);
1286 struct ip_conntrack_tuple_hash *h;
1287 struct ip_conntrack_tuple tuple;
1289 IP_CT_TUPLE_U_BLANK(&tuple);
1290 tuple.src.ip = inet->rcv_saddr;
1291 tuple.src.u.tcp.port = inet->sport;
1292 tuple.dst.ip = inet->daddr;
1293 tuple.dst.u.tcp.port = inet->dport;
1294 tuple.dst.protonum = IPPROTO_TCP;
1296 /* We only do TCP at the moment: is there a better way? */
1297 if (strcmp(sk->sk_prot->name, "TCP")) {
1298 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1299 return -ENOPROTOOPT;
1302 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1303 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1304 *len, sizeof(struct sockaddr_in));
1305 return -EINVAL;
1308 h = ip_conntrack_find_get(&tuple, NULL);
1309 if (h) {
1310 struct sockaddr_in sin;
1311 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1313 sin.sin_family = AF_INET;
1314 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1315 .tuple.dst.u.tcp.port;
1316 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1317 .tuple.dst.ip;
1318 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1320 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1321 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1322 ip_conntrack_put(ct);
1323 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1324 return -EFAULT;
1325 else
1326 return 0;
1328 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1329 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1330 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1331 return -ENOENT;
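/* Userspace side, for reference (a sketch, error handling omitted): a
 * transparent proxy that accepted a REDIRECTed TCP connection can recover
 * the pre-NAT destination with:
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *		... dst.sin_addr / dst.sin_port hold the original target ...
 */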
1334 static struct nf_sockopt_ops so_getorigdst = {
1335 .pf = PF_INET,
1336 .get_optmin = SO_ORIGINAL_DST,
1337 .get_optmax = SO_ORIGINAL_DST+1,
1338 .get = &getorigdst,
1341 static int kill_all(struct ip_conntrack *i, void *data)
1343 return 1;
1346 void ip_conntrack_flush(void)
1348 ip_ct_iterate_cleanup(kill_all, NULL);
1351 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1353 if (vmalloced)
1354 vfree(hash);
1355 else
1356 free_pages((unsigned long)hash,
1357 get_order(sizeof(struct list_head) * size));
1360 /* Mishearing the voices in his head, our hero wonders how he's
1361 supposed to kill the mall. */
1362 void ip_conntrack_cleanup(void)
1364 ip_ct_attach = NULL;
1366 /* This makes sure all current packets have passed through
1367 netfilter framework. Roll on, two-stage module
1368 delete... */
1369 synchronize_net();
1371 ip_ct_event_cache_flush();
1372 i_see_dead_people:
1373 ip_conntrack_flush();
1374 if (atomic_read(&ip_conntrack_count) != 0) {
1375 schedule();
1376 goto i_see_dead_people;
1378 /* wait until all references to ip_conntrack_untracked are dropped */
1379 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1380 schedule();
1382 kmem_cache_destroy(ip_conntrack_cachep);
1383 kmem_cache_destroy(ip_conntrack_expect_cachep);
1384 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1385 ip_conntrack_htable_size);
1386 nf_unregister_sockopt(&so_getorigdst);
1389 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1391 struct list_head *hash;
1392 unsigned int i;
1394 *vmalloced = 0;
1395 hash = (void*)__get_free_pages(GFP_KERNEL,
1396 get_order(sizeof(struct list_head)
1397 * size));
1398 if (!hash) {
1399 *vmalloced = 1;
1400 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1401 hash = vmalloc(sizeof(struct list_head) * size);
1404 if (hash)
1405 for (i = 0; i < size; i++)
1406 INIT_LIST_HEAD(&hash[i]);
1408 return hash;
1411 static int set_hashsize(const char *val, struct kernel_param *kp)
1413 int i, bucket, hashsize, vmalloced;
1414 int old_vmalloced, old_size;
1415 int rnd;
1416 struct list_head *hash, *old_hash;
1417 struct ip_conntrack_tuple_hash *h;
1419 /* On boot, we can set this without any fancy locking. */
1420 if (!ip_conntrack_htable_size)
1421 return param_set_int(val, kp);
1423 hashsize = simple_strtol(val, NULL, 0);
1424 if (!hashsize)
1425 return -EINVAL;
1427 hash = alloc_hashtable(hashsize, &vmalloced);
1428 if (!hash)
1429 return -ENOMEM;
1431 /* We have to rehash for the new table anyway, so we also can
1432 * use a new random seed */
1433 get_random_bytes(&rnd, 4);
1435 write_lock_bh(&ip_conntrack_lock);
1436 for (i = 0; i < ip_conntrack_htable_size; i++) {
1437 while (!list_empty(&ip_conntrack_hash[i])) {
1438 h = list_entry(ip_conntrack_hash[i].next,
1439 struct ip_conntrack_tuple_hash, list);
1440 list_del(&h->list);
1441 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1442 list_add_tail(&h->list, &hash[bucket]);
1445 old_size = ip_conntrack_htable_size;
1446 old_vmalloced = ip_conntrack_vmalloc;
1447 old_hash = ip_conntrack_hash;
1449 ip_conntrack_htable_size = hashsize;
1450 ip_conntrack_vmalloc = vmalloced;
1451 ip_conntrack_hash = hash;
1452 ip_conntrack_hash_rnd = rnd;
1453 write_unlock_bh(&ip_conntrack_lock);
1455 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1456 return 0;
1459 module_param_call(hashsize, set_hashsize, param_get_uint,
1460 &ip_conntrack_htable_size, 0600);
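/* Because of the module_param_call() above, the hash table can be resized at
 * runtime; for a modular build this is typically done by writing the new
 * size to /sys/module/ip_conntrack/parameters/hashsize.  set_hashsize()
 * then rehashes every existing entry into the new table under the write
 * lock and frees the old one. */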
1462 int __init ip_conntrack_init(void)
1464 unsigned int i;
1465 int ret;
1467 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1468 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1469 if (!ip_conntrack_htable_size) {
1470 ip_conntrack_htable_size
1471 = (((num_physpages << PAGE_SHIFT) / 16384)
1472 / sizeof(struct list_head));
1473 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1474 ip_conntrack_htable_size = 8192;
1475 if (ip_conntrack_htable_size < 16)
1476 ip_conntrack_htable_size = 16;
1478 ip_conntrack_max = 8 * ip_conntrack_htable_size;
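/* Worked example of the sizing heuristic above, assuming 4KB pages and a
 * 32-bit box (sizeof(struct list_head) == 8): a 512MB machine gives
 * 512MB / 16384 / 8 = 4096 buckets and ip_conntrack_max = 8 * 4096 = 32768
 * tracked connections; the 32MB example in the comment works out to 256
 * buckets the same way. */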
1480 printk("ip_conntrack version %s (%u buckets, %d max)"
1481 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1482 ip_conntrack_htable_size, ip_conntrack_max,
1483 sizeof(struct ip_conntrack));
1485 ret = nf_register_sockopt(&so_getorigdst);
1486 if (ret != 0) {
1487 printk(KERN_ERR "Unable to register netfilter socket option\n");
1488 return ret;
1491 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1492 &ip_conntrack_vmalloc);
1493 if (!ip_conntrack_hash) {
1494 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1495 goto err_unreg_sockopt;
1498 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1499 sizeof(struct ip_conntrack), 0,
1500 0, NULL, NULL);
1501 if (!ip_conntrack_cachep) {
1502 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1503 goto err_free_hash;
1506 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1507 sizeof(struct ip_conntrack_expect),
1508 0, 0, NULL, NULL);
1509 if (!ip_conntrack_expect_cachep) {
1510 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1511 goto err_free_conntrack_slab;
1514 /* Don't NEED lock here, but good form anyway. */
1515 write_lock_bh(&ip_conntrack_lock);
1516 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1517 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1518 /* Sew in builtin protocols. */
1519 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1520 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1521 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1522 write_unlock_bh(&ip_conntrack_lock);
1524 /* For use by ipt_REJECT */
1525 ip_ct_attach = ip_conntrack_attach;
1527 /* Set up fake conntrack:
1528 - to never be deleted, not in any hashes */
1529 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1530 /* - and make it look like a confirmed connection */
1531 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1533 return ret;
1535 err_free_conntrack_slab:
1536 kmem_cache_destroy(ip_conntrack_cachep);
1537 err_free_hash:
1538 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1539 ip_conntrack_htable_size);
1540 err_unreg_sockopt:
1541 nf_unregister_sockopt(&so_getorigdst);
1543 return -ENOMEM;