[NETFILTER]: fix conntrack refcount leak in unlink_expect()
net/ipv4/netfilter/ip_conntrack_core.c

/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *	- new API and handling of conntrack/nat helpers
 *	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 * */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>

/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION	"2.3"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
static kmem_cache_t *ip_conntrack_expect_cachep;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;

static unsigned int ip_conntrack_next_id = 1;
static unsigned int ip_conntrack_expect_next_id = 1;
#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
struct notifier_block *ip_conntrack_chain;
struct notifier_block *ip_conntrack_expect_chain;

DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);

/* deliver cached events and clear cache entry - must be called with locally
 * disabled softirqs */
static inline void
__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
	if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
		notifier_call_chain(&ip_conntrack_chain, ecache->events,
				    ecache->ct);
	ecache->events = 0;
	ip_conntrack_put(ecache->ct);
	ecache->ct = NULL;
}

/* Deliver all cached events for a particular conntrack. This is called
 * by code prior to async packet handling or freeing the skb */
void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
{
	struct ip_conntrack_ecache *ecache;

	local_bh_disable();
	ecache = &__get_cpu_var(ip_conntrack_ecache);
	if (ecache->ct == ct)
		__ip_ct_deliver_cached_events(ecache);
	local_bh_enable();
}

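/* Begin caching events for this conntrack on the current CPU.  Any events
 * still cached for a different conntrack are delivered first; a reference
 * to the new conntrack is held until its events are delivered. */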
void __ip_ct_event_cache_init(struct ip_conntrack *ct)
{
	struct ip_conntrack_ecache *ecache;

	/* take care of delivering potentially old events */
	ecache = &__get_cpu_var(ip_conntrack_ecache);
	BUG_ON(ecache->ct == ct);
	if (ecache->ct)
		__ip_ct_deliver_cached_events(ecache);
	/* initialize for this conntrack/packet */
	ecache->ct = ct;
	nf_conntrack_get(&ct->ct_general);
}

/* flush the event cache - touches other CPU's data and must not be called
 * while packets are still passing through the code */
static void ip_ct_event_cache_flush(void)
{
	struct ip_conntrack_ecache *ecache;
	int cpu;

	for_each_cpu(cpu) {
		ecache = &per_cpu(ip_conntrack_ecache, cpu);
		if (ecache->ct)
			ip_conntrack_put(ecache->ct);
	}
}
#else
static inline void ip_ct_event_cache_flush(void) {}
#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */

DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
	dump_tuple(tuple);
#endif
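	/* Mix source address, destination address xor protocol number and
	 * both layer 4 ports, salted with a boot-time random value, then
	 * reduce the result modulo the table size. */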
	return (jhash_3words(tuple->src.ip,
			     (tuple->dst.ip ^ tuple->dst.protonum),
			     (tuple->src.u.all | (tuple->dst.u.all << 16)),
			     ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}

int
ip_ct_get_tuple(const struct iphdr *iph,
		const struct sk_buff *skb,
		unsigned int dataoff,
		struct ip_conntrack_tuple *tuple,
		const struct ip_conntrack_protocol *protocol)
{
	/* Never happen */
	if (iph->frag_off & htons(IP_OFFSET)) {
		printk("ip_conntrack_core: Frag of proto %u.\n",
		       iph->protocol);
		return 0;
	}

	tuple->src.ip = iph->saddr;
	tuple->dst.ip = iph->daddr;
	tuple->dst.protonum = iph->protocol;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig,
		   const struct ip_conntrack_protocol *protocol)
{
	inverse->src.ip = orig->dst.ip;
	inverse->dst.ip = orig->src.ip;
	inverse->dst.protonum = orig->dst.protonum;
	inverse->dst.dir = !orig->dst.dir;

	return protocol->invert_tuple(inverse, orig);
}

/* ip_conntrack_expect helper functions */
static void unlink_expect(struct ip_conntrack_expect *exp)
{
	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
	IP_NF_ASSERT(!timer_pending(&exp->timeout));
	list_del(&exp->list);
	CONNTRACK_STAT_INC(expect_delete);
	exp->master->expecting--;
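	/* Drop the reference that ip_conntrack_expect_insert() took for this
	 * expectation's place on ip_conntrack_expect_list; references held
	 * for the timer or by lookups are dropped by their respective owners. */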
	ip_conntrack_expect_put(exp);
}

void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
{
	unlink_expect(exp);
	ip_conntrack_expect_put(exp);
}

static void expectation_timed_out(unsigned long ul_expect)
{
	struct ip_conntrack_expect *exp = (void *)ul_expect;

	write_lock_bh(&ip_conntrack_lock);
	unlink_expect(exp);
	write_unlock_bh(&ip_conntrack_lock);
	ip_conntrack_expect_put(exp);
}

struct ip_conntrack_expect *
__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
			atomic_inc(&i->use);
			return i;
		}
	}
	return NULL;
}

/* Just find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	read_lock_bh(&ip_conntrack_lock);
	i = __ip_conntrack_expect_find(tuple);
	read_unlock_bh(&ip_conntrack_lock);

	return i;
}

/* If an expectation for this connection is found, it gets deleted from
 * the global list and then returned. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		/* If master is not in hash table yet (ie. packet hasn't left
		   this machine yet), how can other end know about expected?
		   Hence these are not the droids you are looking for (if
		   master ct never got confirmed, we'd hold a reference to it
		   and weird things would happen to future packets). */
		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
		    && is_confirmed(i->master)
		    && del_timer(&i->timeout)) {
			unlink_expect(i);
			return i;
		}
	}
	return NULL;
}

/* delete all expectations for this conntrack */
void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
	struct ip_conntrack_expect *i, *tmp;

	/* Optimization: most connections never expect any others. */
	if (ct->expecting == 0)
		return;

	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
		if (i->master == ct && del_timer(&i->timeout)) {
			unlink_expect(i);
			ip_conntrack_expect_put(i);
		}
	}
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
	unsigned int ho, hr;

	DEBUGP("clean_from_lists(%p)\n", ct);
	ASSERT_WRITE_LOCK(&ip_conntrack_lock);

	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

	/* Destroy all pending expectations */
	ip_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
	struct ip_conntrack_protocol *proto;

	DEBUGP("destroy_conntrack(%p)\n", ct);
	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
	IP_NF_ASSERT(!timer_pending(&ct->timeout));

	ip_conntrack_event(IPCT_DESTROY, ct);
	set_bit(IPS_DYING_BIT, &ct->status);

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to ip_conntrack_lock!!! -HW */
	proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
	if (proto && proto->destroy)
		proto->destroy(ct);

	if (ip_conntrack_destroyed)
		ip_conntrack_destroyed(ct);

	write_lock_bh(&ip_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	ip_ct_remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!is_confirmed(ct)) {
		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	}

	CONNTRACK_STAT_INC(delete);
	write_unlock_bh(&ip_conntrack_lock);

	if (ct->master)
		ip_conntrack_put(ct->master);

	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
	ip_conntrack_free(ct);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct ip_conntrack *ct = (void *)ul_conntrack;

	write_lock_bh(&ip_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	CONNTRACK_STAT_INC(delete_list);
	clean_from_lists(ct);
	write_unlock_bh(&ip_conntrack_lock);
	ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
		    const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	ASSERT_READ_LOCK(&ip_conntrack_lock);
	return tuplehash_to_ctrack(i) != ignored_conntrack
		&& ip_ct_tuple_equal(tuple, &i->tuple);
}

struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int hash = hash_conntrack(tuple);

	ASSERT_READ_LOCK(&ip_conntrack_lock);
	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
			CONNTRACK_STAT_INC(found);
			return h;
		}
		CONNTRACK_STAT_INC(searched);
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
		      const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	read_lock_bh(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	if (h)
		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
	read_unlock_bh(&ip_conntrack_lock);

	return h;
}

static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
					unsigned int hash,
					unsigned int repl_hash)
{
	ct->id = ++ip_conntrack_next_id;
	list_prepend(&ip_conntrack_hash[hash],
		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	list_prepend(&ip_conntrack_hash[repl_hash],
		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
}

void ip_conntrack_hash_insert(struct ip_conntrack *ct)
{
	unsigned int hash, repl_hash;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	write_lock_bh(&ip_conntrack_lock);
	__ip_conntrack_hash_insert(ct, hash, repl_hash);
	write_unlock_bh(&ip_conntrack_lock);
}

/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
	unsigned int hash, repl_hash;
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	ct = ip_conntrack_get(*pskb, &ctinfo);

	/* ipt_REJECT uses ip_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	IP_NF_ASSERT(!is_confirmed(ct));
	DEBUGP("Confirming conntrack %p\n", ct);

	write_lock_bh(&ip_conntrack_lock);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	if (!LIST_FIND(&ip_conntrack_hash[hash],
		       conntrack_tuple_cmp,
		       struct ip_conntrack_tuple_hash *,
		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
			  conntrack_tuple_cmp,
			  struct ip_conntrack_tuple_hash *,
			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
		/* Remove from unconfirmed list */
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

		__ip_conntrack_hash_insert(ct, hash, repl_hash);
		/* Timer relative to confirmation time, not original
		   setting time, otherwise we'd get timer wrap in
		   weird delay cases. */
		ct->timeout.expires += jiffies;
		add_timer(&ct->timeout);
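		/* The hash table now holds its own reference to the
		 * conntrack. */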
		atomic_inc(&ct->ct_general.use);
		set_bit(IPS_CONFIRMED_BIT, &ct->status);
		CONNTRACK_STAT_INC(insert);
		write_unlock_bh(&ip_conntrack_lock);
		if (ct->helper)
			ip_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_IP_NF_NAT_NEEDED
		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
			ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
		ip_conntrack_event_cache(master_ct(ct) ?
					 IPCT_RELATED : IPCT_NEW, *pskb);

		return NF_ACCEPT;
	}

	CONNTRACK_STAT_INC(insert_failed);
	write_unlock_bh(&ip_conntrack_lock);

	return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
			 const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	read_lock_bh(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	read_unlock_bh(&ip_conntrack_lock);

	return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
	return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
}

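/* Called when the table is full: evict the oldest not-yet-assured entry
 * on the given hash chain.  Returns 1 if an entry was dropped. */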
static int early_drop(struct list_head *chain)
{
	/* Traverse backwards: gives us oldest, which is roughly LRU */
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct = NULL;
	int dropped = 0;

	read_lock_bh(&ip_conntrack_lock);
	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
	if (h) {
		ct = tuplehash_to_ctrack(h);
		atomic_inc(&ct->ct_general.use);
	}
	read_unlock_bh(&ip_conntrack_lock);

	if (!ct)
		return dropped;

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		CONNTRACK_STAT_INC(early_drop);
	}
	ip_conntrack_put(ct);
	return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
			     const struct ip_conntrack_tuple *rtuple)
{
	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

static struct ip_conntrack_helper *
__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
{
	return LIST_FIND(&helpers, helper_cmp,
			 struct ip_conntrack_helper *,
			 tuple);
}

struct ip_conntrack_helper *
ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_helper *helper;

	/* need ip_conntrack_lock to assure that helper exists until
	 * try_module_get() is called */
	read_lock_bh(&ip_conntrack_lock);

	helper = __ip_conntrack_helper_find(tuple);
	if (helper) {
		/* need to increase module usage count to assure helper will
		 * not go away while the caller is e.g. busy putting a
		 * conntrack in the hash that uses the helper */
		if (!try_module_get(helper->me))
			helper = NULL;
	}

	read_unlock_bh(&ip_conntrack_lock);

	return helper;
}

void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
{
	module_put(helper->me);
}

struct ip_conntrack_protocol *
__ip_conntrack_proto_find(u_int8_t protocol)
{
	return ip_ct_protos[protocol];
}

/* this is guaranteed to always return a valid protocol helper, since
 * it falls back to generic_protocol */
struct ip_conntrack_protocol *
ip_conntrack_proto_find_get(u_int8_t protocol)
{
	struct ip_conntrack_protocol *p;

	preempt_disable();
	p = __ip_conntrack_proto_find(protocol);
	if (p) {
		if (!try_module_get(p->me))
			p = &ip_conntrack_generic_protocol;
	}
	preempt_enable();

	return p;
}

void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
{
	module_put(p->me);
}

struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
					struct ip_conntrack_tuple *repl)
{
	struct ip_conntrack *conntrack;

	if (!ip_conntrack_hash_rnd_initted) {
		get_random_bytes(&ip_conntrack_hash_rnd, 4);
		ip_conntrack_hash_rnd_initted = 1;
	}

	if (ip_conntrack_max
	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
		unsigned int hash = hash_conntrack(orig);
		/* Try dropping from this hash chain. */
		if (!early_drop(&ip_conntrack_hash[hash])) {
			if (net_ratelimit())
				printk(KERN_WARNING
				       "ip_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
	if (!conntrack) {
		DEBUGP("Can't allocate conntrack.\n");
		return NULL;
	}

	memset(conntrack, 0, sizeof(*conntrack));
	atomic_set(&conntrack->ct_general.use, 1);
	conntrack->ct_general.destroy = destroy_conntrack;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* Don't set timer yet: wait for confirmation */
	init_timer(&conntrack->timeout);
	conntrack->timeout.data = (unsigned long)conntrack;
	conntrack->timeout.function = death_by_timeout;

	atomic_inc(&ip_conntrack_count);

	return conntrack;
}

void
ip_conntrack_free(struct ip_conntrack *conntrack)
{
	atomic_dec(&ip_conntrack_count);
	kmem_cache_free(ip_conntrack_cachep, conntrack);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
 * failed due to stress.  Otherwise it really is unclassifiable */
static struct ip_conntrack_tuple_hash *
init_conntrack(struct ip_conntrack_tuple *tuple,
	       struct ip_conntrack_protocol *protocol,
	       struct sk_buff *skb)
{
	struct ip_conntrack *conntrack;
	struct ip_conntrack_tuple repl_tuple;
	struct ip_conntrack_expect *exp;

	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
		DEBUGP("Can't invert tuple.\n");
		return NULL;
	}

	if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple)))
		return NULL;

	if (!protocol->new(conntrack, skb)) {
		ip_conntrack_free(conntrack);
		return NULL;
	}

	write_lock_bh(&ip_conntrack_lock);
	exp = find_expectation(tuple);

	if (exp) {
		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
			conntrack, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
		conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
		conntrack->mark = exp->master->mark;
#endif
#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
		/* this is ugly, but there is no other place to put it */
		conntrack->nat.masq_index = exp->master->nat.masq_index;
#endif
		nf_conntrack_get(&conntrack->master->ct_general);
		CONNTRACK_STAT_INC(expect_new);
	} else {
		conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);

		CONNTRACK_STAT_INC(new);
	}

	/* Overload tuple linked list to put us in unconfirmed list. */
	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

	write_unlock_bh(&ip_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(conntrack, exp);
		ip_conntrack_expect_put(exp);
	}

	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
		  struct ip_conntrack_protocol *proto,
		  int *set_reply,
		  unsigned int hooknum,
		  enum ip_conntrack_info *ctinfo)
{
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct;

	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

	if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
			     &tuple, proto))
		return NULL;

	/* look for tuple match */
	h = ip_conntrack_find_get(&tuple, NULL);
	if (!h) {
		h = init_conntrack(&tuple, proto, skb);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			DEBUGP("ip_conntrack_in: normal packet for %p\n",
			       ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			DEBUGP("ip_conntrack_in: related packet for %p\n",
			       ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			DEBUGP("ip_conntrack_in: new packet for %p\n",
			       ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
			     struct sk_buff **pskb,
			     const struct net_device *in,
			     const struct net_device *out,
			     int (*okfn)(struct sk_buff *))
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;
	struct ip_conntrack_protocol *proto;
	int set_reply = 0;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if ((*pskb)->nfct) {
		CONNTRACK_STAT_INC(ignore);
		return NF_ACCEPT;
	}

	/* Never happen */
	if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
		if (net_ratelimit()) {
			printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
			       (*pskb)->nh.iph->protocol, hooknum);
		}
		return NF_DROP;
	}

	/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
	/* Ignore broadcast: no `connection'. */
	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
		printk("Broadcast packet!\n");
		return NF_ACCEPT;
	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
		   == htonl(0x000000FF)) {
		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
		       NIPQUAD((*pskb)->nh.iph->saddr),
		       NIPQUAD((*pskb)->nh.iph->daddr),
		       (*pskb)->sk, (*pskb)->pkt_type);
	}
#endif

	proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);

	/* It may be a special packet, error, unclean...
	 * the inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (proto->error != NULL
	    && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
		CONNTRACK_STAT_INC(error);
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
		/* Not valid part of a connection */
		CONNTRACK_STAT_INC(invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		CONNTRACK_STAT_INC(drop);
		return NF_DROP;
	}

	IP_NF_ASSERT((*pskb)->nfct);

	ret = proto->packet(ct, *pskb, ctinfo);
	if (ret < 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		nf_conntrack_put((*pskb)->nfct);
		(*pskb)->nfct = NULL;
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		ip_conntrack_event_cache(IPCT_STATUS, *pskb);

	return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig)
{
	return ip_ct_invert_tuple(inverse, orig,
				  __ip_conntrack_proto_find(orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *a,
			       const struct ip_conntrack_expect *b)
{
	/* Part covered by intersection of masks must be unequal,
	   otherwise they clash */
	struct ip_conntrack_tuple intersect_mask
		= { { a->mask.src.ip & b->mask.src.ip,
		      { a->mask.src.u.all & b->mask.src.u.all } },
		    { a->mask.dst.ip & b->mask.dst.ip,
		      { a->mask.dst.u.all & b->mask.dst.u.all },
		      a->mask.dst.protonum & b->mask.dst.protonum } };

	return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct ip_conntrack_expect *a,
				 const struct ip_conntrack_expect *b)
{
	return a->master == b->master
		&& ip_ct_tuple_equal(&a->tuple, &b->tuple)
		&& ip_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
	struct ip_conntrack_expect *i;

	write_lock_bh(&ip_conntrack_lock);
	/* choose the oldest expectation to evict */
	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
			unlink_expect(i);
			write_unlock_bh(&ip_conntrack_lock);
			ip_conntrack_expect_put(i);
			return;
		}
	}
	write_unlock_bh(&ip_conntrack_lock);
}

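/* The returned expectation holds a reference on the master conntrack and
 * starts with a use count of one; release it with ip_conntrack_expect_put(). */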
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
	struct ip_conntrack_expect *new;

	new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
	if (!new) {
		DEBUGP("expect_related: OOM allocating expect\n");
		return NULL;
	}
	new->master = me;
	atomic_inc(&new->master->ct_general.use);
	atomic_set(&new->use, 1);
	return new;
}

void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
	if (atomic_dec_and_test(&exp->use)) {
		ip_conntrack_put(exp->master);
		kmem_cache_free(ip_conntrack_expect_cachep, exp);
	}
}

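/* Two extra references are taken here: one for the expectation's entry on
 * ip_conntrack_expect_list (dropped again in unlink_expect()) and one for
 * the pending timeout timer (dropped by whoever deletes or fires it). */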
static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
	atomic_inc(&exp->use);
	exp->master->expecting++;
	list_add(&exp->list, &ip_conntrack_expect_list);

	init_timer(&exp->timeout);
	exp->timeout.data = (unsigned long)exp;
	exp->timeout.function = expectation_timed_out;
	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
	add_timer(&exp->timeout);

	exp->id = ++ip_conntrack_expect_next_id;
	atomic_inc(&exp->use);
	CONNTRACK_STAT_INC(expect_create);
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
		if (i->master == master) {
			if (del_timer(&i->timeout)) {
				unlink_expect(i);
				ip_conntrack_expect_put(i);
			}
			break;
		}
	}
}

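/* Restart an existing expectation's timeout.  Returns 0 if the timer had
 * already expired, i.e. the expectation is on its way out.  Called under
 * ip_conntrack_lock from ip_conntrack_expect_related(). */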
static inline int refresh_timer(struct ip_conntrack_expect *i)
{
	if (!del_timer(&i->timeout))
		return 0;

	i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
	add_timer(&i->timeout);
	return 1;
}

int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
	struct ip_conntrack_expect *i;
	int ret;

	DEBUGP("ip_conntrack_expect_related %p\n", related_to);
	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
	DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

	write_lock_bh(&ip_conntrack_lock);
	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		if (expect_matches(i, expect)) {
			/* Refresh timer: if it's dying, ignore.. */
			if (refresh_timer(i)) {
				ret = 0;
				goto out;
			}
		} else if (expect_clash(i, expect)) {
			ret = -EBUSY;
			goto out;
		}
	}

	/* Will be over limit? */
	if (expect->master->helper->max_expected &&
	    expect->master->expecting >= expect->master->helper->max_expected)
		evict_oldest_expect(expect->master);

	ip_conntrack_expect_insert(expect);
	ip_conntrack_expect_event(IPEXP_NEW, expect);
	ret = 0;
out:
	write_unlock_bh(&ip_conntrack_lock);
	return ret;
}

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
			      const struct ip_conntrack_tuple *newreply)
{
	write_lock_bh(&ip_conntrack_lock);
	/* Should be unconfirmed, so not in hash table yet */
	IP_NF_ASSERT(!is_confirmed(conntrack));

	DEBUGP("Altering reply tuple of %p to ", conntrack);
	DUMP_TUPLE(newreply);

	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (!conntrack->master && conntrack->expecting == 0)
		conntrack->helper = __ip_conntrack_helper_find(newreply);
	write_unlock_bh(&ip_conntrack_lock);
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
	BUG_ON(me->timeout == 0);
	write_lock_bh(&ip_conntrack_lock);
	list_prepend(&helpers, me);
	write_unlock_bh(&ip_conntrack_lock);

	return 0;
}

struct ip_conntrack_helper *
__ip_conntrack_helper_find_byname(const char *name)
{
	struct ip_conntrack_helper *h;

	list_for_each_entry(h, &helpers, list) {
		if (!strcmp(h->name, name))
			return h;
	}

	return NULL;
}

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
			 const struct ip_conntrack_helper *me)
{
	if (tuplehash_to_ctrack(i)->helper == me) {
		ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
		tuplehash_to_ctrack(i)->helper = NULL;
	}
	return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
	unsigned int i;
	struct ip_conntrack_expect *exp, *tmp;

	/* Need write lock here, to delete helper. */
	write_lock_bh(&ip_conntrack_lock);
	LIST_DELETE(&helpers, me);

	/* Get rid of expectations */
	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
		if (exp->master->helper == me && del_timer(&exp->timeout)) {
			unlink_expect(exp);
			ip_conntrack_expect_put(exp);
		}
	}
	/* Get rid of expecteds, set helpers to NULL. */
	LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
	for (i = 0; i < ip_conntrack_htable_size; i++)
		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
			    struct ip_conntrack_tuple_hash *, me);
	write_unlock_bh(&ip_conntrack_lock);

	/* Someone could be still looking at the helper in a bh. */
	synchronize_net();
}

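/* Per-direction packet and byte accounting; compiled away unless
 * CONFIG_IP_NF_CT_ACCT is enabled. */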
static inline void ct_add_counters(struct ip_conntrack *ct,
				   enum ip_conntrack_info ctinfo,
				   const struct sk_buff *skb)
{
#ifdef CONFIG_IP_NF_CT_ACCT
	if (skb) {
		ct->counters[CTINFO2DIR(ctinfo)].packets++;
		ct->counters[CTINFO2DIR(ctinfo)].bytes +=
					ntohs(skb->nh.iph->tot_len);
	}
#endif
}

/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
void ip_ct_refresh_acct(struct ip_conntrack *ct,
			enum ip_conntrack_info ctinfo,
			const struct sk_buff *skb,
			unsigned long extra_jiffies)
{
	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

	/* If not in hash table, timer will not be active yet */
	if (!is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
		ct_add_counters(ct, ctinfo, skb);
	} else {
		write_lock_bh(&ip_conntrack_lock);
		/* Need del_timer for race avoidance (may already be dying). */
		if (del_timer(&ct->timeout)) {
			ct->timeout.expires = jiffies + extra_jiffies;
			add_timer(&ct->timeout);
			ip_conntrack_event_cache(IPCT_REFRESH, skb);
		}
		ct_add_counters(ct, ctinfo, skb);
		write_unlock_bh(&ip_conntrack_lock);
	}
}

#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
			       const struct ip_conntrack_tuple *tuple)
{
	NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
		&tuple->src.u.tcp.port);
	NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
		&tuple->dst.u.tcp.port);
	return 0;

nfattr_failure:
	return -1;
}

int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
			       struct ip_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
		return -EINVAL;

	t->src.u.tcp.port =
		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
	t->dst.u.tcp.port =
		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

	return 0;
}
#endif

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
	skb_orphan(skb);

	local_bh_disable();
	skb = ip_defrag(skb, user);
	local_bh_enable();

	if (skb)
		ip_send_check(skb->nh.iph);
	return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = ip_conntrack_get(skb, &ctinfo);

	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

static inline int
do_iter(const struct ip_conntrack_tuple_hash *i,
	int (*iter)(struct ip_conntrack *i, void *data),
	void *data)
{
	return iter(tuplehash_to_ctrack(i), data);
}

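/* Scan the hash buckets, and then the unconfirmed list, for the first
 * entry the iterator accepts; it is returned with an extra reference. */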
/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
		void *data, unsigned int *bucket)
{
	struct ip_conntrack_tuple_hash *h = NULL;

	write_lock_bh(&ip_conntrack_lock);
	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
				struct ip_conntrack_tuple_hash *, iter, data);
		if (h)
			break;
	}
	if (!h)
		h = LIST_FIND_W(&unconfirmed, do_iter,
				struct ip_conntrack_tuple_hash *, iter, data);
	if (h)
		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
	write_unlock_bh(&ip_conntrack_lock);

	return h;
}

void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int bucket = 0;

	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
		/* Time to push up daisies... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		ip_conntrack_put(ct);
	}
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
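/* Rough userspace sketch (names here are illustrative, not part of this
 * file): a proxy that accepted a REDIRECTed connection can recover the
 * original destination with
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *
 *	if (getsockopt(client_fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *		... connect onward to dst.sin_addr / dst.sin_port ...
 */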
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack_tuple tuple;

	IP_CT_TUPLE_U_BLANK(&tuple);
	tuple.src.ip = inet->rcv_saddr;
	tuple.src.u.tcp.port = inet->sport;
	tuple.dst.ip = inet->daddr;
	tuple.dst.u.tcp.port = inet->dport;
	tuple.dst.protonum = IPPROTO_TCP;

	/* We only do TCP at the moment: is there a better way? */
	if (strcmp(sk->sk_prot->name, "TCP")) {
		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
		return -ENOPROTOOPT;
	}

	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
		       *len, sizeof(struct sockaddr_in));
		return -EINVAL;
	}

	h = ip_conntrack_find_get(&tuple, NULL);
	if (h) {
		struct sockaddr_in sin;
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);

		sin.sin_family = AF_INET;
		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.u.tcp.port;
		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.ip;

		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
		ip_conntrack_put(ct);
		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
			return -EFAULT;
		else
			return 0;
	}
	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
	return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
	.pf		= PF_INET,
	.get_optmin	= SO_ORIGINAL_DST,
	.get_optmax	= SO_ORIGINAL_DST+1,
	.get		= &getorigdst,
};

static int kill_all(struct ip_conntrack *i, void *data)
{
	return 1;
}

static void free_conntrack_hash(void)
{
	if (ip_conntrack_vmalloc)
		vfree(ip_conntrack_hash);
	else
		free_pages((unsigned long)ip_conntrack_hash,
			   get_order(sizeof(struct list_head)
				     * ip_conntrack_htable_size));
}

void ip_conntrack_flush()
{
	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

	ip_ct_event_cache_flush();
 i_see_dead_people:
	ip_ct_iterate_cleanup(kill_all, NULL);
	if (atomic_read(&ip_conntrack_count) != 0) {
		schedule();
		goto i_see_dead_people;
	}
	/* wait until all references to ip_conntrack_untracked are dropped */
	while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
		schedule();
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
	ip_ct_attach = NULL;
	ip_conntrack_flush();
	kmem_cache_destroy(ip_conntrack_cachep);
	kmem_cache_destroy(ip_conntrack_expect_cachep);
	free_conntrack_hash();
	nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
module_param(hashsize, int, 0400);

int __init ip_conntrack_init(void)
{
	unsigned int i;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
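	/* Example: with 4 kB pages and 8-byte list_heads, a 512 MB machine
	 * gets (512 MB / 16384) / 8 = 4096 buckets, and ip_conntrack_max
	 * then defaults to 8 * 4096 = 32768 tracked connections. */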
	if (hashsize) {
		ip_conntrack_htable_size = hashsize;
	} else {
		ip_conntrack_htable_size
			= (((num_physpages << PAGE_SHIFT) / 16384)
			   / sizeof(struct list_head));
		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
			ip_conntrack_htable_size = 8192;
		if (ip_conntrack_htable_size < 16)
			ip_conntrack_htable_size = 16;
	}
	ip_conntrack_max = 8 * ip_conntrack_htable_size;

	printk("ip_conntrack version %s (%u buckets, %d max)"
	       " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
	       ip_conntrack_htable_size, ip_conntrack_max,
	       sizeof(struct ip_conntrack));

	ret = nf_register_sockopt(&so_getorigdst);
	if (ret != 0) {
		printk(KERN_ERR "Unable to register netfilter socket option\n");
		return ret;
	}

	/* AK: the hash table is twice as big as needed because it uses
	   list_head; it would be much nicer if the caches used a
	   single-pointer list head here. */
	ip_conntrack_vmalloc = 0;
	ip_conntrack_hash
		= (void *)__get_free_pages(GFP_KERNEL,
					   get_order(sizeof(struct list_head)
						     * ip_conntrack_htable_size));
	if (!ip_conntrack_hash) {
		ip_conntrack_vmalloc = 1;
		printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
		ip_conntrack_hash = vmalloc(sizeof(struct list_head)
					    * ip_conntrack_htable_size);
	}
	if (!ip_conntrack_hash) {
		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
		goto err_unreg_sockopt;
	}

	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
						sizeof(struct ip_conntrack), 0,
						0, NULL, NULL);
	if (!ip_conntrack_cachep) {
		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
		goto err_free_hash;
	}

	ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
					sizeof(struct ip_conntrack_expect),
					0, 0, NULL, NULL);
	if (!ip_conntrack_expect_cachep) {
		printk(KERN_ERR "Unable to create ip_expect slab cache\n");
		goto err_free_conntrack_slab;
	}

	/* Don't NEED lock here, but good form anyway. */
	write_lock_bh(&ip_conntrack_lock);
	for (i = 0; i < MAX_IP_CT_PROTO; i++)
		ip_ct_protos[i] = &ip_conntrack_generic_protocol;
	/* Sew in builtin protocols. */
	ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
	ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
	write_unlock_bh(&ip_conntrack_lock);

	for (i = 0; i < ip_conntrack_htable_size; i++)
		INIT_LIST_HEAD(&ip_conntrack_hash[i]);

	/* For use by ipt_REJECT */
	ip_ct_attach = ip_conntrack_attach;

	/* Set up fake conntrack:
	    - to never be deleted, not in any hashes */
	atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
	/*  - and make it look like a confirmed connection */
	set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

	return ret;

err_free_conntrack_slab:
	kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
	free_conntrack_hash();
err_unreg_sockopt:
	nf_unregister_sockopt(&so_getorigdst);

	return -ENOMEM;
}