[IP]: Introduce ip_hdrlen()
[linux-2.6/openmoko-kernel/knife-kernel.git] net/ipv4/netfilter/ip_conntrack_core.c
blob 8c013d9f6907c380f0438164eb8dcbd7d7ac5dd3
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/
43 #include <linux/netfilter_ipv4/ip_conntrack.h>
44 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
45 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
46 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
48 #define IP_CONNTRACK_VERSION "2.4"
50 #if 0
51 #define DEBUGP printk
52 #else
53 #define DEBUGP(format, args...)
54 #endif
56 DEFINE_RWLOCK(ip_conntrack_lock);
58 /* ip_conntrack_standalone needs this */
59 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
61 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
62 LIST_HEAD(ip_conntrack_expect_list);
63 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
64 static LIST_HEAD(helpers);
65 unsigned int ip_conntrack_htable_size __read_mostly = 0;
66 int ip_conntrack_max __read_mostly;
67 struct list_head *ip_conntrack_hash __read_mostly;
68 static struct kmem_cache *ip_conntrack_cachep __read_mostly;
69 static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
70 struct ip_conntrack ip_conntrack_untracked;
71 unsigned int ip_ct_log_invalid __read_mostly;
72 static LIST_HEAD(unconfirmed);
73 static int ip_conntrack_vmalloc __read_mostly;
75 static unsigned int ip_conntrack_next_id;
76 static unsigned int ip_conntrack_expect_next_id;
77 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
78 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
79 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
81 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
83 /* deliver cached events and clear cache entry - must be called with locally
84 * disabled softirqs */
85 static inline void
86 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
88 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
89 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
90 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
91 ecache->ct);
92 ecache->events = 0;
93 ip_conntrack_put(ecache->ct);
94 ecache->ct = NULL;
97 /* Deliver all cached events for a particular conntrack. This is called
98 * by code prior to async packet handling or freeing the skb */
99 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
101 struct ip_conntrack_ecache *ecache;
103 local_bh_disable();
104 ecache = &__get_cpu_var(ip_conntrack_ecache);
105 if (ecache->ct == ct)
106 __ip_ct_deliver_cached_events(ecache);
107 local_bh_enable();
110 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
112 struct ip_conntrack_ecache *ecache;
114 /* take care of delivering potentially old events */
115 ecache = &__get_cpu_var(ip_conntrack_ecache);
116 BUG_ON(ecache->ct == ct);
117 if (ecache->ct)
118 __ip_ct_deliver_cached_events(ecache);
119 /* initialize for this conntrack/packet */
120 ecache->ct = ct;
121 nf_conntrack_get(&ct->ct_general);
124 /* flush the event cache - touches other CPU's data and must not be called while
125 * packets are still passing through the code */
126 static void ip_ct_event_cache_flush(void)
128 struct ip_conntrack_ecache *ecache;
129 int cpu;
131 for_each_possible_cpu(cpu) {
132 ecache = &per_cpu(ip_conntrack_ecache, cpu);
133 if (ecache->ct)
134 ip_conntrack_put(ecache->ct);
137 #else
138 static inline void ip_ct_event_cache_flush(void) {}
139 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
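#if 0
/*
 * Illustrative sketch (not part of the original file): how a module could
 * listen for the conntrack events delivered through ip_conntrack_chain
 * above.  The callback and notifier names are hypothetical; the events
 * argument is the bitmask accumulated by ip_conntrack_event_cache(), and
 * ptr is the struct ip_conntrack the events refer to.
 */
static int example_ct_event(struct notifier_block *nb,
			    unsigned long events, void *ptr)
{
	struct ip_conntrack *ct = ptr;

	if (events & IPCT_NEW)
		DEBUGP("example: new conntrack %p\n", ct);
	return NOTIFY_DONE;
}

static struct notifier_block example_ct_notifier = {
	.notifier_call = example_ct_event,
};

/* Registration would then be:
 *	atomic_notifier_chain_register(&ip_conntrack_chain,
 *				       &example_ct_notifier);
 */
#endif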
141 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
143 static int ip_conntrack_hash_rnd_initted;
144 static unsigned int ip_conntrack_hash_rnd;
146 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
147 unsigned int size, unsigned int rnd)
149 return (jhash_3words((__force u32)tuple->src.ip,
150 ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
151 (tuple->src.u.all | (tuple->dst.u.all << 16)),
152 rnd) % size);
155 static u_int32_t
156 hash_conntrack(const struct ip_conntrack_tuple *tuple)
158 return __hash_conntrack(tuple, ip_conntrack_htable_size,
159 ip_conntrack_hash_rnd);
162 int
163 ip_ct_get_tuple(const struct iphdr *iph,
164 const struct sk_buff *skb,
165 unsigned int dataoff,
166 struct ip_conntrack_tuple *tuple,
167 const struct ip_conntrack_protocol *protocol)
169 /* Never happens */
170 if (iph->frag_off & htons(IP_OFFSET)) {
171 printk("ip_conntrack_core: Frag of proto %u.\n",
172 iph->protocol);
173 return 0;
176 tuple->src.ip = iph->saddr;
177 tuple->dst.ip = iph->daddr;
178 tuple->dst.protonum = iph->protocol;
179 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
181 return protocol->pkt_to_tuple(skb, dataoff, tuple);
184 int
185 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
186 const struct ip_conntrack_tuple *orig,
187 const struct ip_conntrack_protocol *protocol)
189 inverse->src.ip = orig->dst.ip;
190 inverse->dst.ip = orig->src.ip;
191 inverse->dst.protonum = orig->dst.protonum;
192 inverse->dst.dir = !orig->dst.dir;
194 return protocol->invert_tuple(inverse, orig);
198 /* ip_conntrack_expect helper functions */
199 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
201 IP_NF_ASSERT(!timer_pending(&exp->timeout));
202 list_del(&exp->list);
203 CONNTRACK_STAT_INC(expect_delete);
204 exp->master->expecting--;
205 ip_conntrack_expect_put(exp);
208 static void expectation_timed_out(unsigned long ul_expect)
210 struct ip_conntrack_expect *exp = (void *)ul_expect;
212 write_lock_bh(&ip_conntrack_lock);
213 ip_ct_unlink_expect(exp);
214 write_unlock_bh(&ip_conntrack_lock);
215 ip_conntrack_expect_put(exp);
218 struct ip_conntrack_expect *
219 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
221 struct ip_conntrack_expect *i;
223 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
224 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
225 return i;
227 return NULL;
230 /* Just find an expectation corresponding to a tuple. */
231 struct ip_conntrack_expect *
232 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
234 struct ip_conntrack_expect *i;
236 read_lock_bh(&ip_conntrack_lock);
237 i = __ip_conntrack_expect_find(tuple);
238 if (i)
239 atomic_inc(&i->use);
240 read_unlock_bh(&ip_conntrack_lock);
242 return i;
245 /* If an expectation for this connection is found, it gets deleted from
246 * the global list, then returned. */
247 static struct ip_conntrack_expect *
248 find_expectation(const struct ip_conntrack_tuple *tuple)
250 struct ip_conntrack_expect *i;
252 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
253 /* If master is not in hash table yet (ie. packet hasn't left
254 this machine yet), how can other end know about expected?
255 Hence these are not the droids you are looking for (if
256 master ct never got confirmed, we'd hold a reference to it
257 and weird things would happen to future packets). */
258 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
259 && is_confirmed(i->master)) {
260 if (i->flags & IP_CT_EXPECT_PERMANENT) {
261 atomic_inc(&i->use);
262 return i;
263 } else if (del_timer(&i->timeout)) {
264 ip_ct_unlink_expect(i);
265 return i;
269 return NULL;
272 /* delete all expectations for this conntrack */
273 void ip_ct_remove_expectations(struct ip_conntrack *ct)
275 struct ip_conntrack_expect *i, *tmp;
277 /* Optimization: most connections never expect any others. */
278 if (ct->expecting == 0)
279 return;
281 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
282 if (i->master == ct && del_timer(&i->timeout)) {
283 ip_ct_unlink_expect(i);
284 ip_conntrack_expect_put(i);
289 static void
290 clean_from_lists(struct ip_conntrack *ct)
292 DEBUGP("clean_from_lists(%p)\n", ct);
293 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
294 list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
296 /* Destroy all pending expectations */
297 ip_ct_remove_expectations(ct);
300 static void
301 destroy_conntrack(struct nf_conntrack *nfct)
303 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
304 struct ip_conntrack_protocol *proto;
305 struct ip_conntrack_helper *helper;
306 typeof(ip_conntrack_destroyed) destroyed;
308 DEBUGP("destroy_conntrack(%p)\n", ct);
309 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
310 IP_NF_ASSERT(!timer_pending(&ct->timeout));
312 ip_conntrack_event(IPCT_DESTROY, ct);
313 set_bit(IPS_DYING_BIT, &ct->status);
315 helper = ct->helper;
316 if (helper && helper->destroy)
317 helper->destroy(ct);
319 /* To make sure we don't get any weird locking issues here:
320 * destroy_conntrack() MUST NOT be called with a write lock
321 * to ip_conntrack_lock!!! -HW */
322 rcu_read_lock();
323 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
324 if (proto && proto->destroy)
325 proto->destroy(ct);
327 destroyed = rcu_dereference(ip_conntrack_destroyed);
328 if (destroyed)
329 destroyed(ct);
331 rcu_read_unlock();
333 write_lock_bh(&ip_conntrack_lock);
334 /* Expectations will have been removed in clean_from_lists,
335 * except TFTP can create an expectation on the first packet,
336 * before connection is in the list, so we need to clean here,
337 * too. */
338 ip_ct_remove_expectations(ct);
340 /* We overload first tuple to link into unconfirmed list. */
341 if (!is_confirmed(ct)) {
342 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
343 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
346 CONNTRACK_STAT_INC(delete);
347 write_unlock_bh(&ip_conntrack_lock);
349 if (ct->master)
350 ip_conntrack_put(ct->master);
352 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
353 ip_conntrack_free(ct);
356 static void death_by_timeout(unsigned long ul_conntrack)
358 struct ip_conntrack *ct = (void *)ul_conntrack;
360 write_lock_bh(&ip_conntrack_lock);
361 /* Inside lock so preempt is disabled on module removal path.
362 * Otherwise we can get spurious warnings. */
363 CONNTRACK_STAT_INC(delete_list);
364 clean_from_lists(ct);
365 write_unlock_bh(&ip_conntrack_lock);
366 ip_conntrack_put(ct);
369 struct ip_conntrack_tuple_hash *
370 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
371 const struct ip_conntrack *ignored_conntrack)
373 struct ip_conntrack_tuple_hash *h;
374 unsigned int hash = hash_conntrack(tuple);
376 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
377 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
378 ip_ct_tuple_equal(tuple, &h->tuple)) {
379 CONNTRACK_STAT_INC(found);
380 return h;
382 CONNTRACK_STAT_INC(searched);
385 return NULL;
388 /* Find a connection corresponding to a tuple. */
389 struct ip_conntrack_tuple_hash *
390 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
391 const struct ip_conntrack *ignored_conntrack)
393 struct ip_conntrack_tuple_hash *h;
395 read_lock_bh(&ip_conntrack_lock);
396 h = __ip_conntrack_find(tuple, ignored_conntrack);
397 if (h)
398 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
399 read_unlock_bh(&ip_conntrack_lock);
401 return h;
404 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
405 unsigned int hash,
406 unsigned int repl_hash)
408 ct->id = ++ip_conntrack_next_id;
409 list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
410 &ip_conntrack_hash[hash]);
411 list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
412 &ip_conntrack_hash[repl_hash]);
415 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
417 unsigned int hash, repl_hash;
419 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
420 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
422 write_lock_bh(&ip_conntrack_lock);
423 __ip_conntrack_hash_insert(ct, hash, repl_hash);
424 write_unlock_bh(&ip_conntrack_lock);
427 /* Confirm a connection given skb; places it in hash table */
428 int
429 __ip_conntrack_confirm(struct sk_buff **pskb)
431 unsigned int hash, repl_hash;
432 struct ip_conntrack_tuple_hash *h;
433 struct ip_conntrack *ct;
434 enum ip_conntrack_info ctinfo;
436 ct = ip_conntrack_get(*pskb, &ctinfo);
438 /* ipt_REJECT uses ip_conntrack_attach to attach related
439 ICMP/TCP RST packets in other direction. Actual packet
440 which created connection will be IP_CT_NEW or for an
441 expected connection, IP_CT_RELATED. */
442 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
443 return NF_ACCEPT;
445 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
446 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
448 /* We're not in hash table, and we refuse to set up related
449 connections for unconfirmed conns. But packet copies and
450 REJECT will give spurious warnings here. */
451 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
453 /* No external references means no one else could have
454 confirmed us. */
455 IP_NF_ASSERT(!is_confirmed(ct));
456 DEBUGP("Confirming conntrack %p\n", ct);
458 write_lock_bh(&ip_conntrack_lock);
460 /* See if there's one in the list already, including reverse:
461 NAT could have grabbed it without realizing, since we're
462 not in the hash. If there is, we lost race. */
463 list_for_each_entry(h, &ip_conntrack_hash[hash], list)
464 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
465 &h->tuple))
466 goto out;
467 list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
468 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
469 &h->tuple))
470 goto out;
472 /* Remove from unconfirmed list */
473 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
475 __ip_conntrack_hash_insert(ct, hash, repl_hash);
476 /* Timer relative to confirmation time, not original
477 setting time, otherwise we'd get timer wrap in
478 weird delay cases. */
479 ct->timeout.expires += jiffies;
480 add_timer(&ct->timeout);
481 atomic_inc(&ct->ct_general.use);
482 set_bit(IPS_CONFIRMED_BIT, &ct->status);
483 CONNTRACK_STAT_INC(insert);
484 write_unlock_bh(&ip_conntrack_lock);
485 if (ct->helper)
486 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
487 #ifdef CONFIG_IP_NF_NAT_NEEDED
488 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
489 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
490 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
491 #endif
492 ip_conntrack_event_cache(master_ct(ct) ?
493 IPCT_RELATED : IPCT_NEW, *pskb);
495 return NF_ACCEPT;
497 out:
498 CONNTRACK_STAT_INC(insert_failed);
499 write_unlock_bh(&ip_conntrack_lock);
500 return NF_DROP;
503 /* Returns true if a connection corresponds to the tuple (required
504 for NAT). */
505 int
506 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
507 const struct ip_conntrack *ignored_conntrack)
509 struct ip_conntrack_tuple_hash *h;
511 read_lock_bh(&ip_conntrack_lock);
512 h = __ip_conntrack_find(tuple, ignored_conntrack);
513 read_unlock_bh(&ip_conntrack_lock);
515 return h != NULL;
518 /* There's a small race here where we may free a just-assured
519 connection. Too bad: we're in trouble anyway. */
520 static int early_drop(struct list_head *chain)
522 /* Traverse backwards: gives us oldest, which is roughly LRU */
523 struct ip_conntrack_tuple_hash *h;
524 struct ip_conntrack *ct = NULL, *tmp;
525 int dropped = 0;
527 read_lock_bh(&ip_conntrack_lock);
528 list_for_each_entry_reverse(h, chain, list) {
529 tmp = tuplehash_to_ctrack(h);
530 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
531 ct = tmp;
532 atomic_inc(&ct->ct_general.use);
533 break;
536 read_unlock_bh(&ip_conntrack_lock);
538 if (!ct)
539 return dropped;
541 if (del_timer(&ct->timeout)) {
542 death_by_timeout((unsigned long)ct);
543 dropped = 1;
544 CONNTRACK_STAT_INC_ATOMIC(early_drop);
546 ip_conntrack_put(ct);
547 return dropped;
550 static struct ip_conntrack_helper *
551 __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
553 struct ip_conntrack_helper *h;
555 list_for_each_entry(h, &helpers, list) {
556 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
557 return h;
559 return NULL;
562 struct ip_conntrack_helper *
563 ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
565 struct ip_conntrack_helper *helper;
567 /* need ip_conntrack_lock to assure that helper exists until
568 * try_module_get() is called */
569 read_lock_bh(&ip_conntrack_lock);
571 helper = __ip_conntrack_helper_find(tuple);
572 if (helper) {
573 /* need to increase module usage count to assure helper will
574 * not go away while the caller is e.g. busy putting a
575 * conntrack in the hash that uses the helper */
576 if (!try_module_get(helper->me))
577 helper = NULL;
580 read_unlock_bh(&ip_conntrack_lock);
582 return helper;
585 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
587 module_put(helper->me);
590 struct ip_conntrack_protocol *
591 __ip_conntrack_proto_find(u_int8_t protocol)
593 return ip_ct_protos[protocol];
596 /* this is guaranteed to always return a valid protocol helper, since
597 * it falls back to generic_protocol */
598 struct ip_conntrack_protocol *
599 ip_conntrack_proto_find_get(u_int8_t protocol)
601 struct ip_conntrack_protocol *p;
603 rcu_read_lock();
604 p = __ip_conntrack_proto_find(protocol);
605 if (p) {
606 if (!try_module_get(p->me))
607 p = &ip_conntrack_generic_protocol;
609 rcu_read_unlock();
611 return p;
614 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
616 module_put(p->me);
619 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
620 struct ip_conntrack_tuple *repl)
622 struct ip_conntrack *conntrack;
624 if (!ip_conntrack_hash_rnd_initted) {
625 get_random_bytes(&ip_conntrack_hash_rnd, 4);
626 ip_conntrack_hash_rnd_initted = 1;
629 /* We don't want any race condition at early drop stage */
630 atomic_inc(&ip_conntrack_count);
632 if (ip_conntrack_max
633 && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
634 unsigned int hash = hash_conntrack(orig);
635 /* Try dropping from this hash chain. */
636 if (!early_drop(&ip_conntrack_hash[hash])) {
637 atomic_dec(&ip_conntrack_count);
638 if (net_ratelimit())
639 printk(KERN_WARNING
640 "ip_conntrack: table full, dropping"
641 " packet.\n");
642 return ERR_PTR(-ENOMEM);
646 conntrack = kmem_cache_zalloc(ip_conntrack_cachep, GFP_ATOMIC);
647 if (!conntrack) {
648 DEBUGP("Can't allocate conntrack.\n");
649 atomic_dec(&ip_conntrack_count);
650 return ERR_PTR(-ENOMEM);
653 atomic_set(&conntrack->ct_general.use, 1);
654 conntrack->ct_general.destroy = destroy_conntrack;
655 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
656 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
657 /* Don't set timer yet: wait for confirmation */
658 init_timer(&conntrack->timeout);
659 conntrack->timeout.data = (unsigned long)conntrack;
660 conntrack->timeout.function = death_by_timeout;
662 return conntrack;
665 void
666 ip_conntrack_free(struct ip_conntrack *conntrack)
668 atomic_dec(&ip_conntrack_count);
669 kmem_cache_free(ip_conntrack_cachep, conntrack);
672 /* Allocate a new conntrack: we return -ENOMEM if classification
673 * failed due to stress. Otherwise it really is unclassifiable */
674 static struct ip_conntrack_tuple_hash *
675 init_conntrack(struct ip_conntrack_tuple *tuple,
676 struct ip_conntrack_protocol *protocol,
677 struct sk_buff *skb)
679 struct ip_conntrack *conntrack;
680 struct ip_conntrack_tuple repl_tuple;
681 struct ip_conntrack_expect *exp;
683 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
684 DEBUGP("Can't invert tuple.\n");
685 return NULL;
688 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
689 if (conntrack == NULL || IS_ERR(conntrack))
690 return (struct ip_conntrack_tuple_hash *)conntrack;
692 if (!protocol->new(conntrack, skb)) {
693 ip_conntrack_free(conntrack);
694 return NULL;
697 write_lock_bh(&ip_conntrack_lock);
698 exp = find_expectation(tuple);
700 if (exp) {
701 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
702 conntrack, exp);
703 /* Welcome, Mr. Bond. We've been expecting you... */
704 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
705 conntrack->master = exp->master;
706 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
707 conntrack->mark = exp->master->mark;
708 #endif
709 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
710 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
711 /* this is ugly, but there is no other place where to put it */
712 conntrack->nat.masq_index = exp->master->nat.masq_index;
713 #endif
714 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
715 conntrack->secmark = exp->master->secmark;
716 #endif
717 nf_conntrack_get(&conntrack->master->ct_general);
718 CONNTRACK_STAT_INC(expect_new);
719 } else {
720 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
722 CONNTRACK_STAT_INC(new);
725 /* Overload tuple linked list to put us in unconfirmed list. */
726 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
728 write_unlock_bh(&ip_conntrack_lock);
730 if (exp) {
731 if (exp->expectfn)
732 exp->expectfn(conntrack, exp);
733 ip_conntrack_expect_put(exp);
736 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
739 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
740 static inline struct ip_conntrack *
741 resolve_normal_ct(struct sk_buff *skb,
742 struct ip_conntrack_protocol *proto,
743 int *set_reply,
744 unsigned int hooknum,
745 enum ip_conntrack_info *ctinfo)
747 struct ip_conntrack_tuple tuple;
748 struct ip_conntrack_tuple_hash *h;
749 struct ip_conntrack *ct;
751 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
753 if (!ip_ct_get_tuple(skb->nh.iph, skb, ip_hdrlen(skb), &tuple,proto))
754 return NULL;
756 /* look for tuple match */
757 h = ip_conntrack_find_get(&tuple, NULL);
758 if (!h) {
759 h = init_conntrack(&tuple, proto, skb);
760 if (!h)
761 return NULL;
762 if (IS_ERR(h))
763 return (void *)h;
765 ct = tuplehash_to_ctrack(h);
767 /* It exists; we have (non-exclusive) reference. */
768 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
769 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
770 /* Please set the reply bit if this packet is OK */
771 *set_reply = 1;
772 } else {
773 /* Once we've had two way comms, always ESTABLISHED. */
774 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
775 DEBUGP("ip_conntrack_in: normal packet for %p\n",
776 ct);
777 *ctinfo = IP_CT_ESTABLISHED;
778 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
779 DEBUGP("ip_conntrack_in: related packet for %p\n",
780 ct);
781 *ctinfo = IP_CT_RELATED;
782 } else {
783 DEBUGP("ip_conntrack_in: new packet for %p\n",
784 ct);
785 *ctinfo = IP_CT_NEW;
787 *set_reply = 0;
789 skb->nfct = &ct->ct_general;
790 skb->nfctinfo = *ctinfo;
791 return ct;
794 /* Netfilter hook itself. */
795 unsigned int ip_conntrack_in(unsigned int hooknum,
796 struct sk_buff **pskb,
797 const struct net_device *in,
798 const struct net_device *out,
799 int (*okfn)(struct sk_buff *))
801 struct ip_conntrack *ct;
802 enum ip_conntrack_info ctinfo;
803 struct ip_conntrack_protocol *proto;
804 int set_reply = 0;
805 int ret;
807 /* Previously seen (loopback or untracked)? Ignore. */
808 if ((*pskb)->nfct) {
809 CONNTRACK_STAT_INC_ATOMIC(ignore);
810 return NF_ACCEPT;
813 /* Never happens */
814 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
815 if (net_ratelimit()) {
816 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
817 (*pskb)->nh.iph->protocol, hooknum);
819 return NF_DROP;
822 /* Doesn't cover locally-generated broadcast, so not worth it. */
823 #if 0
824 /* Ignore broadcast: no `connection'. */
825 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
826 printk("Broadcast packet!\n");
827 return NF_ACCEPT;
828 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
829 == htonl(0x000000FF)) {
830 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
831 NIPQUAD((*pskb)->nh.iph->saddr),
832 NIPQUAD((*pskb)->nh.iph->daddr),
833 (*pskb)->sk, (*pskb)->pkt_type);
835 #endif
837 /* rcu_read_lock()ed by nf_hook_slow */
838 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
840 /* It may be a special packet, error, unclean...
841 * the inverse of the return code tells the netfilter
842 * core what to do with the packet. */
843 if (proto->error != NULL
844 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
845 CONNTRACK_STAT_INC_ATOMIC(error);
846 CONNTRACK_STAT_INC_ATOMIC(invalid);
847 return -ret;
850 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
851 /* Not valid part of a connection */
852 CONNTRACK_STAT_INC_ATOMIC(invalid);
853 return NF_ACCEPT;
856 if (IS_ERR(ct)) {
857 /* Too stressed to deal. */
858 CONNTRACK_STAT_INC_ATOMIC(drop);
859 return NF_DROP;
862 IP_NF_ASSERT((*pskb)->nfct);
864 ret = proto->packet(ct, *pskb, ctinfo);
865 if (ret < 0) {
866 /* Invalid: the inverse of the return code tells
867 * the netfilter core what to do */
868 nf_conntrack_put((*pskb)->nfct);
869 (*pskb)->nfct = NULL;
870 CONNTRACK_STAT_INC_ATOMIC(invalid);
871 return -ret;
874 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
875 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
877 return ret;
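#if 0
/*
 * Illustrative sketch (not part of the original file): how ip_conntrack_in()
 * is typically wired into the PRE_ROUTING hook.  In the real tree this
 * registration lives in ip_conntrack_standalone.c; the ops name here is
 * made up.
 */
static struct nf_hook_ops example_conntrack_in_ops = {
	.hook		= ip_conntrack_in,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_PRE_ROUTING,
	.priority	= NF_IP_PRI_CONNTRACK,
};

static int __init example_hook_init(void)
{
	return nf_register_hook(&example_conntrack_in_ops);
}
#endif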
880 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
881 const struct ip_conntrack_tuple *orig)
883 struct ip_conntrack_protocol *proto;
884 int ret;
886 rcu_read_lock();
887 proto = __ip_conntrack_proto_find(orig->dst.protonum);
888 ret = ip_ct_invert_tuple(inverse, orig, proto);
889 rcu_read_unlock();
891 return ret;
894 /* Would two expected things clash? */
895 static inline int expect_clash(const struct ip_conntrack_expect *a,
896 const struct ip_conntrack_expect *b)
898 /* Part covered by intersection of masks must be unequal,
899 otherwise they clash */
900 struct ip_conntrack_tuple intersect_mask
901 = { { a->mask.src.ip & b->mask.src.ip,
902 { a->mask.src.u.all & b->mask.src.u.all } },
903 { a->mask.dst.ip & b->mask.dst.ip,
904 { a->mask.dst.u.all & b->mask.dst.u.all },
905 a->mask.dst.protonum & b->mask.dst.protonum } };
907 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
910 static inline int expect_matches(const struct ip_conntrack_expect *a,
911 const struct ip_conntrack_expect *b)
913 return a->master == b->master
914 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
915 && ip_ct_tuple_equal(&a->mask, &b->mask);
918 /* Generally a bad idea to call this: could have matched already. */
919 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
921 struct ip_conntrack_expect *i;
923 write_lock_bh(&ip_conntrack_lock);
924 /* choose the oldest expectation to evict */
925 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
926 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
927 ip_ct_unlink_expect(i);
928 write_unlock_bh(&ip_conntrack_lock);
929 ip_conntrack_expect_put(i);
930 return;
933 write_unlock_bh(&ip_conntrack_lock);
936 /* We don't increase the master conntrack refcount for non-fulfilled
937 * conntracks. During the conntrack destruction, the expectations are
938 * always killed before the conntrack itself */
939 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
941 struct ip_conntrack_expect *new;
943 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
944 if (!new) {
945 DEBUGP("expect_related: OOM allocating expect\n");
946 return NULL;
948 new->master = me;
949 atomic_set(&new->use, 1);
950 return new;
953 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
955 if (atomic_dec_and_test(&exp->use))
956 kmem_cache_free(ip_conntrack_expect_cachep, exp);
959 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
961 atomic_inc(&exp->use);
962 exp->master->expecting++;
963 list_add(&exp->list, &ip_conntrack_expect_list);
965 init_timer(&exp->timeout);
966 exp->timeout.data = (unsigned long)exp;
967 exp->timeout.function = expectation_timed_out;
968 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
969 add_timer(&exp->timeout);
971 exp->id = ++ip_conntrack_expect_next_id;
972 atomic_inc(&exp->use);
973 CONNTRACK_STAT_INC(expect_create);
976 /* Race with expectations being used means we could have none to find; OK. */
977 static void evict_oldest_expect(struct ip_conntrack *master)
979 struct ip_conntrack_expect *i;
981 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
982 if (i->master == master) {
983 if (del_timer(&i->timeout)) {
984 ip_ct_unlink_expect(i);
985 ip_conntrack_expect_put(i);
987 break;
992 static inline int refresh_timer(struct ip_conntrack_expect *i)
994 if (!del_timer(&i->timeout))
995 return 0;
997 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
998 add_timer(&i->timeout);
999 return 1;
1002 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1004 struct ip_conntrack_expect *i;
1005 int ret;
1007 DEBUGP("ip_conntrack_expect_related %p\n", expect);
1008 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1009 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1011 write_lock_bh(&ip_conntrack_lock);
1012 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1013 if (expect_matches(i, expect)) {
1014 /* Refresh timer: if it's dying, ignore.. */
1015 if (refresh_timer(i)) {
1016 ret = 0;
1017 goto out;
1019 } else if (expect_clash(i, expect)) {
1020 ret = -EBUSY;
1021 goto out;
1025 /* Will be over limit? */
1026 if (expect->master->helper->max_expected &&
1027 expect->master->expecting >= expect->master->helper->max_expected)
1028 evict_oldest_expect(expect->master);
1030 ip_conntrack_expect_insert(expect);
1031 ip_conntrack_expect_event(IPEXP_NEW, expect);
1032 ret = 0;
1033 out:
1034 write_unlock_bh(&ip_conntrack_lock);
1035 return ret;
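#if 0
/*
 * Illustrative sketch (not part of the original file): the usual way a
 * helper sets up an expectation for a related data connection, using
 * ip_conntrack_expect_alloc() and ip_conntrack_expect_related() from
 * above.  The function name and the single "data port" parameter are
 * hypothetical; the FTP and IRC helpers follow this pattern.
 */
static int example_expect_data_conn(struct ip_conntrack *ct, __be16 port)
{
	struct ip_conntrack_expect *exp;
	int ret;

	exp = ip_conntrack_expect_alloc(ct);
	if (exp == NULL)
		return -ENOMEM;

	/* Expect TCP to the original client, from any source port. */
	exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
	exp->tuple.src.u.tcp.port = 0;
	exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
	exp->tuple.dst.u.tcp.port = port;
	exp->tuple.dst.protonum = IPPROTO_TCP;

	exp->mask.src.ip = htonl(0xFFFFFFFF);
	exp->mask.src.u.tcp.port = 0;
	exp->mask.dst.ip = htonl(0xFFFFFFFF);
	exp->mask.dst.u.tcp.port = htons(0xFFFF);
	exp->mask.dst.protonum = 0xFF;

	exp->expectfn = NULL;
	exp->flags = 0;

	ret = ip_conntrack_expect_related(exp);
	ip_conntrack_expect_put(exp);
	return ret;
}
#endif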
1038 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1039 implicitly racy: see __ip_conntrack_confirm */
1040 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1041 const struct ip_conntrack_tuple *newreply)
1043 write_lock_bh(&ip_conntrack_lock);
1044 /* Should be unconfirmed, so not in hash table yet */
1045 IP_NF_ASSERT(!is_confirmed(conntrack));
1047 DEBUGP("Altering reply tuple of %p to ", conntrack);
1048 DUMP_TUPLE(newreply);
1050 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1051 if (!conntrack->master && conntrack->expecting == 0)
1052 conntrack->helper = __ip_conntrack_helper_find(newreply);
1053 write_unlock_bh(&ip_conntrack_lock);
1056 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1058 BUG_ON(me->timeout == 0);
1059 write_lock_bh(&ip_conntrack_lock);
1060 list_add(&me->list, &helpers);
1061 write_unlock_bh(&ip_conntrack_lock);
1063 return 0;
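#if 0
/*
 * Illustrative sketch (not part of the original file): the minimal shape
 * of a protocol helper module built around ip_conntrack_helper_register()
 * above.  The helper name, port number and help callback are hypothetical;
 * real helpers (FTP, IRC, ...) follow this pattern.
 */
static int example_help(struct sk_buff **pskb,
			struct ip_conntrack *ct,
			enum ip_conntrack_info ctinfo)
{
	/* Parse the payload here and, if needed, register expectations
	 * with ip_conntrack_expect_related(). */
	return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper = {
	.name		= "example",
	.me		= THIS_MODULE,
	.max_expected	= 1,
	.timeout	= 5 * 60,	/* seconds */
	.help		= example_help,
};

static int __init example_helper_init(void)
{
	/* Match TCP connections to a hypothetical control port 12345
	 * (compared against the reply tuple's source port). */
	example_helper.tuple.src.u.tcp.port = htons(12345);
	example_helper.tuple.dst.protonum = IPPROTO_TCP;
	example_helper.mask.src.u.tcp.port = htons(0xFFFF);
	example_helper.mask.dst.protonum = 0xFF;

	return ip_conntrack_helper_register(&example_helper);
}

static void __exit example_helper_exit(void)
{
	ip_conntrack_helper_unregister(&example_helper);
}
#endif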
1066 struct ip_conntrack_helper *
1067 __ip_conntrack_helper_find_byname(const char *name)
1069 struct ip_conntrack_helper *h;
1071 list_for_each_entry(h, &helpers, list) {
1072 if (!strcmp(h->name, name))
1073 return h;
1076 return NULL;
1079 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1080 const struct ip_conntrack_helper *me)
1082 if (tuplehash_to_ctrack(i)->helper == me) {
1083 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1084 tuplehash_to_ctrack(i)->helper = NULL;
1088 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1090 unsigned int i;
1091 struct ip_conntrack_tuple_hash *h;
1092 struct ip_conntrack_expect *exp, *tmp;
1094 /* Need write lock here, to delete helper. */
1095 write_lock_bh(&ip_conntrack_lock);
1096 list_del(&me->list);
1098 /* Get rid of expectations */
1099 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1100 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1101 ip_ct_unlink_expect(exp);
1102 ip_conntrack_expect_put(exp);
1105 /* Get rid of expecteds, set helpers to NULL. */
1106 list_for_each_entry(h, &unconfirmed, list)
1107 unhelp(h, me);
1108 for (i = 0; i < ip_conntrack_htable_size; i++) {
1109 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1110 unhelp(h, me);
1112 write_unlock_bh(&ip_conntrack_lock);
1114 /* Someone could be still looking at the helper in a bh. */
1115 synchronize_net();
1118 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1119 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1120 enum ip_conntrack_info ctinfo,
1121 const struct sk_buff *skb,
1122 unsigned long extra_jiffies,
1123 int do_acct)
1125 int event = 0;
1127 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1128 IP_NF_ASSERT(skb);
1130 write_lock_bh(&ip_conntrack_lock);
1132 /* Only update if this is not a fixed timeout */
1133 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1134 write_unlock_bh(&ip_conntrack_lock);
1135 return;
1138 /* If not in hash table, timer will not be active yet */
1139 if (!is_confirmed(ct)) {
1140 ct->timeout.expires = extra_jiffies;
1141 event = IPCT_REFRESH;
1142 } else {
1143 /* Need del_timer for race avoidance (may already be dying). */
1144 if (del_timer(&ct->timeout)) {
1145 ct->timeout.expires = jiffies + extra_jiffies;
1146 add_timer(&ct->timeout);
1147 event = IPCT_REFRESH;
1151 #ifdef CONFIG_IP_NF_CT_ACCT
1152 if (do_acct) {
1153 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1154 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1155 ntohs(skb->nh.iph->tot_len);
1156 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1157 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1158 event |= IPCT_COUNTER_FILLING;
1160 #endif
1162 write_unlock_bh(&ip_conntrack_lock);
1164 /* must be unlocked when calling event cache */
1165 if (event)
1166 ip_conntrack_event_cache(event, skb);
1169 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1170 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1171 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1172 * in ip_conntrack_core, since we don't want the protocols to autoload
1173 * or depend on ctnetlink */
1174 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1175 const struct ip_conntrack_tuple *tuple)
1177 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1178 &tuple->src.u.tcp.port);
1179 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1180 &tuple->dst.u.tcp.port);
1181 return 0;
1183 nfattr_failure:
1184 return -1;
1187 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1188 struct ip_conntrack_tuple *t)
1190 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1191 return -EINVAL;
1193 t->src.u.tcp.port =
1194 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1195 t->dst.u.tcp.port =
1196 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1198 return 0;
1200 #endif
1202 /* Returns new sk_buff, or NULL */
1203 struct sk_buff *
1204 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1206 skb_orphan(skb);
1208 local_bh_disable();
1209 skb = ip_defrag(skb, user);
1210 local_bh_enable();
1212 if (skb)
1213 ip_send_check(skb->nh.iph);
1214 return skb;
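#if 0
/*
 * Illustrative sketch (not part of the original file): how a defrag hook
 * in the spirit of ip_conntrack_defrag() in ip_conntrack_standalone.c
 * would call ip_ct_gather_frags() before connection tracking sees the
 * packet.  The function name is made up and the hook wiring is omitted.
 */
static unsigned int example_defrag(unsigned int hooknum,
				   struct sk_buff **pskb,
				   const struct net_device *in,
				   const struct net_device *out,
				   int (*okfn)(struct sk_buff *))
{
	/* Gather fragments. */
	if ((*pskb)->nh.iph->frag_off & htons(IP_MF | IP_OFFSET)) {
		*pskb = ip_ct_gather_frags(*pskb,
					   hooknum == NF_IP_PRE_ROUTING ?
					   IP_DEFRAG_CONNTRACK_IN :
					   IP_DEFRAG_CONNTRACK_OUT);
		if (!*pskb)
			return NF_STOLEN;
	}
	return NF_ACCEPT;
}
#endif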
1217 /* Used by ipt_REJECT. */
1218 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1220 struct ip_conntrack *ct;
1221 enum ip_conntrack_info ctinfo;
1223 /* This ICMP is in reverse direction to the packet which caused it */
1224 ct = ip_conntrack_get(skb, &ctinfo);
1226 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1227 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1228 else
1229 ctinfo = IP_CT_RELATED;
1231 /* Attach to new skbuff, and increment count */
1232 nskb->nfct = &ct->ct_general;
1233 nskb->nfctinfo = ctinfo;
1234 nf_conntrack_get(nskb->nfct);
1237 /* Bring out ya dead! */
1238 static struct ip_conntrack *
1239 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1240 void *data, unsigned int *bucket)
1242 struct ip_conntrack_tuple_hash *h;
1243 struct ip_conntrack *ct;
1245 write_lock_bh(&ip_conntrack_lock);
1246 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1247 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1248 ct = tuplehash_to_ctrack(h);
1249 if (iter(ct, data))
1250 goto found;
1253 list_for_each_entry(h, &unconfirmed, list) {
1254 ct = tuplehash_to_ctrack(h);
1255 if (iter(ct, data))
1256 set_bit(IPS_DYING_BIT, &ct->status);
1258 write_unlock_bh(&ip_conntrack_lock);
1259 return NULL;
1261 found:
1262 atomic_inc(&ct->ct_general.use);
1263 write_unlock_bh(&ip_conntrack_lock);
1264 return ct;
1267 void
1268 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1270 struct ip_conntrack *ct;
1271 unsigned int bucket = 0;
1273 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1274 /* Time to push up daisies... */
1275 if (del_timer(&ct->timeout))
1276 death_by_timeout((unsigned long)ct);
1277 /* ... else the timer will get him soon. */
1279 ip_conntrack_put(ct);
1283 /* Fast function for those who don't want to parse /proc (and I don't
1284 blame them). */
1285 /* Reversing the socket's dst/src point of view gives us the reply
1286 mapping. */
1287 static int
1288 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1290 struct inet_sock *inet = inet_sk(sk);
1291 struct ip_conntrack_tuple_hash *h;
1292 struct ip_conntrack_tuple tuple;
1294 IP_CT_TUPLE_U_BLANK(&tuple);
1295 tuple.src.ip = inet->rcv_saddr;
1296 tuple.src.u.tcp.port = inet->sport;
1297 tuple.dst.ip = inet->daddr;
1298 tuple.dst.u.tcp.port = inet->dport;
1299 tuple.dst.protonum = IPPROTO_TCP;
1301 /* We only do TCP at the moment: is there a better way? */
1302 if (strcmp(sk->sk_prot->name, "TCP")) {
1303 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1304 return -ENOPROTOOPT;
1307 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1308 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1309 *len, sizeof(struct sockaddr_in));
1310 return -EINVAL;
1313 h = ip_conntrack_find_get(&tuple, NULL);
1314 if (h) {
1315 struct sockaddr_in sin;
1316 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1318 sin.sin_family = AF_INET;
1319 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1320 .tuple.dst.u.tcp.port;
1321 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1322 .tuple.dst.ip;
1323 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1325 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1326 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1327 ip_conntrack_put(ct);
1328 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1329 return -EFAULT;
1330 else
1331 return 0;
1333 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1334 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1335 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1336 return -ENOENT;
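/*
 * Illustrative userspace sketch (not part of the original file): a
 * transparent proxy that accepted a REDIRECTed TCP connection can ask for
 * the original destination through the SO_ORIGINAL_DST getsockopt served
 * by getorigdst() above.  The function name is hypothetical.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <linux/netfilter_ipv4.h>	// SO_ORIGINAL_DST
 *
 *	static int get_original_dst(int fd, struct sockaddr_in *dst)
 *	{
 *		socklen_t len = sizeof(*dst);
 *
 *		return getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, dst, &len);
 *	}
 */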
1339 static struct nf_sockopt_ops so_getorigdst = {
1340 .pf = PF_INET,
1341 .get_optmin = SO_ORIGINAL_DST,
1342 .get_optmax = SO_ORIGINAL_DST+1,
1343 .get = &getorigdst,
1346 static int kill_all(struct ip_conntrack *i, void *data)
1348 return 1;
1351 void ip_conntrack_flush(void)
1353 ip_ct_iterate_cleanup(kill_all, NULL);
1356 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1358 if (vmalloced)
1359 vfree(hash);
1360 else
1361 free_pages((unsigned long)hash,
1362 get_order(sizeof(struct list_head) * size));
1365 /* Mishearing the voices in his head, our hero wonders how he's
1366 supposed to kill the mall. */
1367 void ip_conntrack_cleanup(void)
1369 rcu_assign_pointer(ip_ct_attach, NULL);
1371 /* This makes sure all current packets have passed through
1372 netfilter framework. Roll on, two-stage module
1373 delete... */
1374 synchronize_net();
1376 ip_ct_event_cache_flush();
1377 i_see_dead_people:
1378 ip_conntrack_flush();
1379 if (atomic_read(&ip_conntrack_count) != 0) {
1380 schedule();
1381 goto i_see_dead_people;
1383 /* wait until all references to ip_conntrack_untracked are dropped */
1384 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1385 schedule();
1387 kmem_cache_destroy(ip_conntrack_cachep);
1388 kmem_cache_destroy(ip_conntrack_expect_cachep);
1389 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1390 ip_conntrack_htable_size);
1391 nf_unregister_sockopt(&so_getorigdst);
1394 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1396 struct list_head *hash;
1397 unsigned int i;
1399 *vmalloced = 0;
1400 hash = (void*)__get_free_pages(GFP_KERNEL,
1401 get_order(sizeof(struct list_head)
1402 * size));
1403 if (!hash) {
1404 *vmalloced = 1;
1405 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1406 hash = vmalloc(sizeof(struct list_head) * size);
1409 if (hash)
1410 for (i = 0; i < size; i++)
1411 INIT_LIST_HEAD(&hash[i]);
1413 return hash;
1416 static int set_hashsize(const char *val, struct kernel_param *kp)
1418 int i, bucket, hashsize, vmalloced;
1419 int old_vmalloced, old_size;
1420 int rnd;
1421 struct list_head *hash, *old_hash;
1422 struct ip_conntrack_tuple_hash *h;
1424 /* On boot, we can set this without any fancy locking. */
1425 if (!ip_conntrack_htable_size)
1426 return param_set_int(val, kp);
1428 hashsize = simple_strtol(val, NULL, 0);
1429 if (!hashsize)
1430 return -EINVAL;
1432 hash = alloc_hashtable(hashsize, &vmalloced);
1433 if (!hash)
1434 return -ENOMEM;
1436 /* We have to rehash for the new table anyway, so we also can
1437 * use a new random seed */
1438 get_random_bytes(&rnd, 4);
1440 write_lock_bh(&ip_conntrack_lock);
1441 for (i = 0; i < ip_conntrack_htable_size; i++) {
1442 while (!list_empty(&ip_conntrack_hash[i])) {
1443 h = list_entry(ip_conntrack_hash[i].next,
1444 struct ip_conntrack_tuple_hash, list);
1445 list_del(&h->list);
1446 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1447 list_add_tail(&h->list, &hash[bucket]);
1450 old_size = ip_conntrack_htable_size;
1451 old_vmalloced = ip_conntrack_vmalloc;
1452 old_hash = ip_conntrack_hash;
1454 ip_conntrack_htable_size = hashsize;
1455 ip_conntrack_vmalloc = vmalloced;
1456 ip_conntrack_hash = hash;
1457 ip_conntrack_hash_rnd = rnd;
1458 write_unlock_bh(&ip_conntrack_lock);
1460 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1461 return 0;
1464 module_param_call(hashsize, set_hashsize, param_get_uint,
1465 &ip_conntrack_htable_size, 0600);
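/*
 * Usage note (not in the original file): because set_hashsize() is hooked
 * up through module_param_call() above with mode 0600, the hash table can
 * be resized at runtime by root, e.g. by writing a new bucket count to
 * /sys/module/ip_conntrack/parameters/hashsize when this code is built as
 * the ip_conntrack module; at boot the parameter behaves like a plain
 * unsigned integer.
 */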
1467 int __init ip_conntrack_init(void)
1469 unsigned int i;
1470 int ret;
1472 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1473 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1474 if (!ip_conntrack_htable_size) {
1475 ip_conntrack_htable_size
1476 = (((num_physpages << PAGE_SHIFT) / 16384)
1477 / sizeof(struct list_head));
1478 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1479 ip_conntrack_htable_size = 8192;
1480 if (ip_conntrack_htable_size < 16)
1481 ip_conntrack_htable_size = 16;
1483 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1485 printk("ip_conntrack version %s (%u buckets, %d max)"
1486 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1487 ip_conntrack_htable_size, ip_conntrack_max,
1488 sizeof(struct ip_conntrack));
1490 ret = nf_register_sockopt(&so_getorigdst);
1491 if (ret != 0) {
1492 printk(KERN_ERR "Unable to register netfilter socket option\n");
1493 return ret;
1496 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1497 &ip_conntrack_vmalloc);
1498 if (!ip_conntrack_hash) {
1499 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1500 goto err_unreg_sockopt;
1503 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1504 sizeof(struct ip_conntrack), 0,
1505 0, NULL, NULL);
1506 if (!ip_conntrack_cachep) {
1507 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1508 goto err_free_hash;
1511 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1512 sizeof(struct ip_conntrack_expect),
1513 0, 0, NULL, NULL);
1514 if (!ip_conntrack_expect_cachep) {
1515 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1516 goto err_free_conntrack_slab;
1519 /* Don't NEED lock here, but good form anyway. */
1520 write_lock_bh(&ip_conntrack_lock);
1521 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1522 rcu_assign_pointer(ip_ct_protos[i], &ip_conntrack_generic_protocol);
1523 /* Sew in builtin protocols. */
1524 rcu_assign_pointer(ip_ct_protos[IPPROTO_TCP], &ip_conntrack_protocol_tcp);
1525 rcu_assign_pointer(ip_ct_protos[IPPROTO_UDP], &ip_conntrack_protocol_udp);
1526 rcu_assign_pointer(ip_ct_protos[IPPROTO_ICMP], &ip_conntrack_protocol_icmp);
1527 write_unlock_bh(&ip_conntrack_lock);
1529 /* For use by ipt_REJECT */
1530 rcu_assign_pointer(ip_ct_attach, ip_conntrack_attach);
1532 /* Set up fake conntrack:
1533 - to never be deleted, not in any hashes */
1534 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1535 /* - and make it look like a confirmed connection */
1536 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1538 return ret;
1540 err_free_conntrack_slab:
1541 kmem_cache_destroy(ip_conntrack_cachep);
1542 err_free_hash:
1543 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1544 ip_conntrack_htable_size);
1545 err_unreg_sockopt:
1546 nf_unregister_sockopt(&so_getorigdst);
1548 return -ENOMEM;