1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
20 #include <linux/types.h>
21 #include <linux/icmp.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/
43 #define ASSERT_READ_LOCK(x)
44 #define ASSERT_WRITE_LOCK(x)
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #define IP_CONNTRACK_VERSION "2.4"
56 #define DEBUGP(format, args...)
/*
 * Global conntrack state: central lock, table bookkeeping, slab caches
 * and the (optional) event-notifier machinery.
 *
 * NOTE(review): this file has been mangled by extraction -- logical
 * lines are split across physical lines and the embedded original
 * numbering has gaps (dropped lines such as braces).  Comments describe
 * only what is visible here.
 */
59 DEFINE_RWLOCK(ip_conntrack_lock
);
61 /* ip_conntrack_standalone needs this */
62 atomic_t ip_conntrack_count
= ATOMIC_INIT(0);
/* Destruction hook, installed by the NAT layer when it is loaded. */
64 void (*ip_conntrack_destroyed
)(struct ip_conntrack
*conntrack
) = NULL
;
/* Global list of pending expectations. */
65 LIST_HEAD(ip_conntrack_expect_list
);
/* Per-L4-protocol tracker table, indexed by IP protocol number. */
66 struct ip_conntrack_protocol
*ip_ct_protos
[MAX_IP_CT_PROTO
] __read_mostly
;
/* Registered application helpers (FTP, IRC, ...). */
67 static LIST_HEAD(helpers
);
68 unsigned int ip_conntrack_htable_size __read_mostly
= 0;
69 int ip_conntrack_max __read_mostly
;
70 struct list_head
*ip_conntrack_hash __read_mostly
;
/* Slab caches for conntrack entries and expectations. */
71 static kmem_cache_t
*ip_conntrack_cachep __read_mostly
;
72 static kmem_cache_t
*ip_conntrack_expect_cachep __read_mostly
;
/* Dummy conntrack attached to packets that must bypass tracking. */
73 struct ip_conntrack ip_conntrack_untracked
;
74 unsigned int ip_ct_log_invalid __read_mostly
;
/* Not-yet-confirmed conntracks, linked through their ORIGINAL tuple. */
75 static LIST_HEAD(unconfirmed
);
76 static int ip_conntrack_vmalloc __read_mostly
;
/* Monotonically increasing ids, handed out under ip_conntrack_lock. */
78 static unsigned int ip_conntrack_next_id
;
79 static unsigned int ip_conntrack_expect_next_id
;
80 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
/* Notifier chains plus a per-CPU event cache for conntrack events. */
81 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain
);
82 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain
);
84 DEFINE_PER_CPU(struct ip_conntrack_ecache
, ip_conntrack_ecache
);
86 /* deliver cached events and clear cache entry - must be called with locally
87 * disabled softirqs */
/*
 * Fire the notifier chain for events cached against ecache->ct once the
 * conntrack is confirmed and not dying, then drop the cache's reference.
 * NOTE(review): the storage class, braces and the tail that clears the
 * cache slot are not visible in this extraction.
 */
89 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache
*ecache
)
91 DEBUGP("ecache: delivering events for %p\n", ecache
->ct
);
92 if (is_confirmed(ecache
->ct
) && !is_dying(ecache
->ct
) && ecache
->events
)
93 atomic_notifier_call_chain(&ip_conntrack_chain
, ecache
->events
,
/* Drop the reference the per-CPU cache held on the conntrack. */
96 ip_conntrack_put(ecache
->ct
);
100 /* Deliver all cached events for a particular conntrack. This is called
101 * by code prior to async packet handling or freeing the skb */
102 void ip_ct_deliver_cached_events(const struct ip_conntrack
*ct
)
104 struct ip_conntrack_ecache
*ecache
;
/* Only this CPU's cache slot can refer to ct here. */
107 ecache
= &__get_cpu_var(ip_conntrack_ecache
);
108 if (ecache
->ct
== ct
)
109 __ip_ct_deliver_cached_events(ecache
);
/*
 * Prime this CPU's event cache for a new conntrack: deliver whatever
 * was cached before, then take a reference on ct for the cache.
 */
113 void __ip_ct_event_cache_init(struct ip_conntrack
*ct
)
115 struct ip_conntrack_ecache
*ecache
;
117 /* take care of delivering potentially old events */
118 ecache
= &__get_cpu_var(ip_conntrack_ecache
);
119 BUG_ON(ecache
->ct
== ct
);
121 __ip_ct_deliver_cached_events(ecache
);
122 /* initialize for this conntrack/packet */
124 nf_conntrack_get(&ct
->ct_general
);
127 /* flush the event cache - touches other CPU's data and must not be called while
128 * packets are still passing through the code */
129 static void ip_ct_event_cache_flush(void)
131 struct ip_conntrack_ecache
*ecache
;
/* Walk every possible CPU and drop its cached reference, if any. */
134 for_each_possible_cpu(cpu
) {
135 ecache
= &per_cpu(ip_conntrack_ecache
, cpu
);
137 ip_conntrack_put(ecache
->ct
);
/* Events compiled out: flushing is a no-op. */
141 static inline void ip_ct_event_cache_flush(void) {}
142 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
/* Per-CPU statistics, exported to userspace by the standalone module. */
144 DEFINE_PER_CPU(struct ip_conntrack_stat
, ip_conntrack_stat
);
/* Lazily-initialized random seed that keeps bucket placement
 * unpredictable to remote attackers. */
146 static int ip_conntrack_hash_rnd_initted
;
147 static unsigned int ip_conntrack_hash_rnd
;
/*
 * Jenkins-hash a tuple (src ip; dst ip xor protocol; both ports packed
 * into one word) into a bucket index.  NOTE(review): the final
 * "% size" / return tail is not visible in this extraction.
 */
149 static u_int32_t
__hash_conntrack(const struct ip_conntrack_tuple
*tuple
,
150 unsigned int size
, unsigned int rnd
)
152 return (jhash_3words((__force u32
)tuple
->src
.ip
,
153 ((__force u32
)tuple
->dst
.ip
^ tuple
->dst
.protonum
),
154 (tuple
->src
.u
.all
| (tuple
->dst
.u
.all
<< 16)),
/* Convenience wrapper using the global table size and random seed. */
159 hash_conntrack(const struct ip_conntrack_tuple
*tuple
)
161 return __hash_conntrack(tuple
, ip_conntrack_htable_size
,
162 ip_conntrack_hash_rnd
)
;
/*
 * Fill *tuple from the IP header plus the L4 protocol's view of the
 * payload at dataoff.  Non-head fragments are rejected (their L4
 * header is elsewhere).
 */
166 ip_ct_get_tuple(const struct iphdr
*iph
,
167 const struct sk_buff
*skb
,
168 unsigned int dataoff
,
169 struct ip_conntrack_tuple
*tuple
,
170 const struct ip_conntrack_protocol
*protocol
)
/* Should never happen: the caller defragments first. */
173 if (iph
->frag_off
& htons(IP_OFFSET
)) {
174 printk("ip_conntrack_core: Frag of proto %u.\n",
179 tuple
->src
.ip
= iph
->saddr
;
180 tuple
->dst
.ip
= iph
->daddr
;
181 tuple
->dst
.protonum
= iph
->protocol
;
182 tuple
->dst
.dir
= IP_CT_DIR_ORIGINAL
;
/* Let the L4 protocol extract ports/ids into the tuple. */
184 return protocol
->pkt_to_tuple(skb
, dataoff
, tuple
);
/*
 * Build the reply-direction tuple: swap src/dst addresses, keep the
 * protocol number, flip the direction, and let the L4 protocol invert
 * its ports/ids.
 */
188 ip_ct_invert_tuple(struct ip_conntrack_tuple
*inverse
,
189 const struct ip_conntrack_tuple
*orig
,
190 const struct ip_conntrack_protocol
*protocol
)
192 inverse
->src
.ip
= orig
->dst
.ip
;
193 inverse
->dst
.ip
= orig
->src
.ip
;
194 inverse
->dst
.protonum
= orig
->dst
.protonum
;
195 inverse
->dst
.dir
= !orig
->dst
.dir
;
197 return protocol
->invert_tuple(inverse
, orig
);
201 /* ip_conntrack_expect helper functions */
/*
 * Unlink an expectation from the global list and release the list's
 * reference.  Caller holds ip_conntrack_lock for writing and has
 * already stopped the timer.
 */
202 void ip_ct_unlink_expect(struct ip_conntrack_expect
*exp
)
204 ASSERT_WRITE_LOCK(&ip_conntrack_lock
);
205 IP_NF_ASSERT(!timer_pending(&exp
->timeout
));
206 list_del(&exp
->list
);
207 CONNTRACK_STAT_INC(expect_delete
);
208 exp
->master
->expecting
--;
209 ip_conntrack_expect_put(exp
);
/* Timer callback: expectation expired unanswered -- unlink it, then
 * drop the timer's own reference. */
212 static void expectation_timed_out(unsigned long ul_expect
)
214 struct ip_conntrack_expect
*exp
= (void *)ul_expect
;
216 write_lock_bh(&ip_conntrack_lock
);
217 ip_ct_unlink_expect(exp
);
218 write_unlock_bh(&ip_conntrack_lock
);
219 ip_conntrack_expect_put(exp
);
/*
 * Masked lookup in the expectation list; caller holds
 * ip_conntrack_lock.  NOTE(review): the loop's match/return tail is
 * not visible in this extraction.
 */
222 struct ip_conntrack_expect
*
223 __ip_conntrack_expect_find(const struct ip_conntrack_tuple
*tuple
)
225 struct ip_conntrack_expect
*i
;
227 list_for_each_entry(i
, &ip_conntrack_expect_list
, list
) {
228 if (ip_ct_tuple_mask_cmp(tuple
, &i
->tuple
, &i
->mask
))
234 /* Just find a expectation corresponding to a tuple. */
/* Locked wrapper around __ip_conntrack_expect_find(). */
235 struct ip_conntrack_expect
*
236 ip_conntrack_expect_find(const struct ip_conntrack_tuple
*tuple
)
238 struct ip_conntrack_expect
*i
;
240 read_lock_bh(&ip_conntrack_lock
);
241 i
= __ip_conntrack_expect_find(tuple
);
244 read_unlock_bh(&ip_conntrack_lock
);
249 /* If an expectation for this connection is found, it gets delete from
250 * global list then returned. */
251 static struct ip_conntrack_expect
*
252 find_expectation(const struct ip_conntrack_tuple
*tuple
)
254 struct ip_conntrack_expect
*i
;
256 list_for_each_entry(i
, &ip_conntrack_expect_list
, list
) {
257 /* If master is not in hash table yet (ie. packet hasn't left
258 this machine yet), how can other end know about expected?
259 Hence these are not the droids you are looking for (if
260 master ct never got confirmed, we'd hold a reference to it
261 and weird things would happen to future packets). */
262 if (ip_ct_tuple_mask_cmp(tuple
, &i
->tuple
, &i
->mask
)
263 && is_confirmed(i
->master
)) {
/* PERMANENT expectations stay in place to be matched again. */
264 if (i
->flags
& IP_CT_EXPECT_PERMANENT
) {
/* del_timer() success means we beat the timeout and own it. */
267 } else if (del_timer(&i
->timeout
)) {
268 ip_ct_unlink_expect(i
);
276 /* delete all expectations for this conntrack */
277 void ip_ct_remove_expectations(struct ip_conntrack
*ct
)
279 struct ip_conntrack_expect
*i
, *tmp
;
281 /* Optimization: most connection never expect any others. */
282 if (ct
->expecting
== 0)
285 list_for_each_entry_safe(i
, tmp
, &ip_conntrack_expect_list
, list
) {
286 if (i
->master
== ct
&& del_timer(&i
->timeout
)) {
287 ip_ct_unlink_expect(i
);
288 ip_conntrack_expect_put(i
);
/*
 * Detach a conntrack from both hash chains and kill any expectations
 * it registered.  Caller holds ip_conntrack_lock for writing.
 */
294 clean_from_lists(struct ip_conntrack
*ct
)
296 DEBUGP("clean_from_lists(%p)\n", ct
);
297 ASSERT_WRITE_LOCK(&ip_conntrack_lock
);
298 list_del(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].list
);
299 list_del(&ct
->tuplehash
[IP_CT_DIR_REPLY
].list
);
301 /* Destroy all pending expectations */
302 ip_ct_remove_expectations(ct
);
/*
 * Final destructor, invoked via ct_general.destroy once the refcount
 * hits zero: fires IPCT_DESTROY, runs helper/protocol/NAT destructors,
 * cleans leftover state under the lock, then frees the entry.
 */
306 destroy_conntrack(struct nf_conntrack
*nfct
)
308 struct ip_conntrack
*ct
= (struct ip_conntrack
*)nfct
;
309 struct ip_conntrack_protocol
*proto
;
310 struct ip_conntrack_helper
*helper
;
312 DEBUGP("destroy_conntrack(%p)\n", ct
);
313 IP_NF_ASSERT(atomic_read(&nfct
->use
) == 0);
314 IP_NF_ASSERT(!timer_pending(&ct
->timeout
));
316 ip_conntrack_event(IPCT_DESTROY
, ct
);
317 set_bit(IPS_DYING_BIT
, &ct
->status
);
/* NOTE(review): 'helper' is read below, but its assignment is not
 * visible in this extraction -- presumably helper = ct->helper. */
320 if (helper
&& helper
->destroy
)
323 /* To make sure we don't get any weird locking issues here:
324 * destroy_conntrack() MUST NOT be called with a write lock
325 * to ip_conntrack_lock!!! -HW */
326 proto
= __ip_conntrack_proto_find(ct
->tuplehash
[IP_CT_DIR_REPLY
].tuple
.dst
.protonum
);
327 if (proto
&& proto
->destroy
)
330 if (ip_conntrack_destroyed
)
331 ip_conntrack_destroyed(ct
);
333 write_lock_bh(&ip_conntrack_lock
);
334 /* Expectations will have been removed in clean_from_lists,
335 * except TFTP can create an expectation on the first packet,
336 * before connection is in the list, so we need to clean here,
338 ip_ct_remove_expectations(ct
);
340 #if defined(CONFIG_IP_NF_MATCH_LAYER7) || defined(CONFIG_IP_NF_MATCH_LAYER7_MODULE)
/* Release layer7-classifier data attached to this conntrack. */
341 if(ct
->layer7
.app_proto
)
342 kfree(ct
->layer7
.app_proto
);
343 if(ct
->layer7
.app_data
)
344 kfree(ct
->layer7
.app_data
);
347 /* We overload first tuple to link into unconfirmed list. */
348 if (!is_confirmed(ct
)) {
349 BUG_ON(list_empty(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].list
));
350 list_del(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].list
);
353 CONNTRACK_STAT_INC(delete);
354 write_unlock_bh(&ip_conntrack_lock
);
/* Drop our reference on the master of an expected connection. */
357 ip_conntrack_put(ct
->master
);
359 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct
);
360 ip_conntrack_free(ct
);
/* Timer callback: connection idled out; unhash it and drop the hash
 * table's reference. */
363 static void death_by_timeout(unsigned long ul_conntrack
)
365 struct ip_conntrack
*ct
= (void *)ul_conntrack
;
367 write_lock_bh(&ip_conntrack_lock
);
368 /* Inside lock so preempt is disabled on module removal path.
369 * Otherwise we can get spurious warnings. */
370 CONNTRACK_STAT_INC(delete_list
);
371 clean_from_lists(ct
);
372 write_unlock_bh(&ip_conntrack_lock
);
373 ip_conntrack_put(ct
);
/*
 * Core hash lookup: scan one bucket for an exact tuple match, skipping
 * ignored_conntrack (used when probing for clashes against oneself).
 * Caller holds ip_conntrack_lock at least for reading.
 */
376 struct ip_conntrack_tuple_hash
*
377 __ip_conntrack_find(const struct ip_conntrack_tuple
*tuple
,
378 const struct ip_conntrack
*ignored_conntrack
)
380 struct ip_conntrack_tuple_hash
*h
;
381 unsigned int hash
= hash_conntrack(tuple
);
383 ASSERT_READ_LOCK(&ip_conntrack_lock
);
384 list_for_each_entry(h
, &ip_conntrack_hash
[hash
], list
) {
385 if (tuplehash_to_ctrack(h
) != ignored_conntrack
&&
386 ip_ct_tuple_equal(tuple
, &h
->tuple
)) {
387 CONNTRACK_STAT_INC(found
);
390 CONNTRACK_STAT_INC(searched
);
396 /* Find a connection corresponding to a tuple. */
/* Locked variant: on success also takes a reference on the found
 * conntrack for the caller. */
397 struct ip_conntrack_tuple_hash
*
398 ip_conntrack_find_get(const struct ip_conntrack_tuple
*tuple
,
399 const struct ip_conntrack
*ignored_conntrack
)
401 struct ip_conntrack_tuple_hash
*h
;
403 read_lock_bh(&ip_conntrack_lock
);
404 h
= __ip_conntrack_find(tuple
, ignored_conntrack
);
406 atomic_inc(&tuplehash_to_ctrack(h
)->ct_general
.use
);
407 read_unlock_bh(&ip_conntrack_lock
);
/* Assign an id and link both directions into their buckets.  Caller
 * holds ip_conntrack_lock for writing. */
412 static void __ip_conntrack_hash_insert(struct ip_conntrack
*ct
,
414 unsigned int repl_hash
)
416 ct
->id
= ++ip_conntrack_next_id
;
417 list_add(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].list
,
418 &ip_conntrack_hash
[hash
]);
419 list_add(&ct
->tuplehash
[IP_CT_DIR_REPLY
].list
,
420 &ip_conntrack_hash
[repl_hash
]);
/* Public variant: compute both bucket indices and insert under lock. */
423 void ip_conntrack_hash_insert(struct ip_conntrack
*ct
)
425 unsigned int hash
, repl_hash
;
427 hash
= hash_conntrack(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
);
428 repl_hash
= hash_conntrack(&ct
->tuplehash
[IP_CT_DIR_REPLY
].tuple
);
430 write_lock_bh(&ip_conntrack_lock
);
431 __ip_conntrack_hash_insert(ct
, hash
, repl_hash
);
432 write_unlock_bh(&ip_conntrack_lock
);
435 /* Confirm a connection given skb; places it in hash table */
/*
 * Move the conntrack behind *pskb from the unconfirmed list into the
 * main hash table, start its timeout timer and set IPS_CONFIRMED --
 * unless a racing entry already occupies either tuple, in which case
 * the insert fails.  Only the ORIGINAL direction can confirm.
 */
437 __ip_conntrack_confirm(struct sk_buff
**pskb
)
439 unsigned int hash
, repl_hash
;
440 struct ip_conntrack_tuple_hash
*h
;
441 struct ip_conntrack
*ct
;
442 enum ip_conntrack_info ctinfo
;
444 ct
= ip_conntrack_get(*pskb
, &ctinfo
);
446 /* ipt_REJECT uses ip_conntrack_attach to attach related
447 ICMP/TCP RST packets in other direction. Actual packet
448 which created connection will be IP_CT_NEW or for an
449 expected connection, IP_CT_RELATED. */
450 if (CTINFO2DIR(ctinfo
) != IP_CT_DIR_ORIGINAL
)
453 hash
= hash_conntrack(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
);
454 repl_hash
= hash_conntrack(&ct
->tuplehash
[IP_CT_DIR_REPLY
].tuple
);
456 /* We're not in hash table, and we refuse to set up related
457 connections for unconfirmed conns. But packet copies and
458 REJECT will give spurious warnings here. */
459 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
461 /* No external references means noone else could have
463 IP_NF_ASSERT(!is_confirmed(ct
));
464 DEBUGP("Confirming conntrack %p\n", ct
);
466 write_lock_bh(&ip_conntrack_lock
);
468 /* See if there's one in the list already, including reverse:
469 NAT could have grabbed it without realizing, since we're
470 not in the hash. If there is, we lost race. */
471 list_for_each_entry(h
, &ip_conntrack_hash
[hash
], list
)
472 if (ip_ct_tuple_equal(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
,
475 list_for_each_entry(h
, &ip_conntrack_hash
[repl_hash
], list
)
476 if (ip_ct_tuple_equal(&ct
->tuplehash
[IP_CT_DIR_REPLY
].tuple
,
480 /* Remove from unconfirmed list */
481 list_del(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].list
);
483 __ip_conntrack_hash_insert(ct
, hash
, repl_hash
);
484 /* Timer relative to confirmation time, not original
485 setting time, otherwise we'd get timer wrap in
486 weird delay cases. */
487 ct
->timeout
.expires
+= jiffies
;
488 add_timer(&ct
->timeout
);
/* The running timer now holds its own reference. */
489 atomic_inc(&ct
->ct_general
.use
);
490 set_bit(IPS_CONFIRMED_BIT
, &ct
->status
);
491 CONNTRACK_STAT_INC(insert
);
492 write_unlock_bh(&ip_conntrack_lock
);
/* Queue events describing what we now know about this connection. */
494 ip_conntrack_event_cache(IPCT_HELPER
, *pskb
);
495 #ifdef CONFIG_IP_NF_NAT_NEEDED
496 if (test_bit(IPS_SRC_NAT_DONE_BIT
, &ct
->status
) ||
497 test_bit(IPS_DST_NAT_DONE_BIT
, &ct
->status
))
498 ip_conntrack_event_cache(IPCT_NATINFO
, *pskb
);
500 ip_conntrack_event_cache(master_ct(ct
) ?
501 IPCT_RELATED
: IPCT_NEW
, *pskb
);
/* Lost the race: a clashing entry got inserted first. */
506 CONNTRACK_STAT_INC(insert_failed
);
507 write_unlock_bh(&ip_conntrack_lock
);
511 /* Returns true if a connection correspondings to the tuple (required
/* Lookup-only probe used by NAT to test whether a tuple is in use. */
514 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple
*tuple
,
515 const struct ip_conntrack
*ignored_conntrack
)
517 struct ip_conntrack_tuple_hash
*h
;
519 read_lock_bh(&ip_conntrack_lock
);
520 h
= __ip_conntrack_find(tuple
, ignored_conntrack
);
521 read_unlock_bh(&ip_conntrack_lock
);
526 /* There's a small race here where we may free a just-assured
527 connection. Too bad: we're in trouble anyway. */
/*
 * Table full: pick the oldest non-ASSURED entry on this bucket chain
 * and kill it early so a new connection can be tracked.
 */
528 static int early_drop(struct list_head
*chain
)
530 /* Traverse backwards: gives us oldest, which is roughly LRU */
531 struct ip_conntrack_tuple_hash
*h
;
532 struct ip_conntrack
*ct
= NULL
, *tmp
;
535 read_lock_bh(&ip_conntrack_lock
);
536 list_for_each_entry_reverse(h
, chain
, list
) {
537 tmp
= tuplehash_to_ctrack(h
);
538 if (!test_bit(IPS_ASSURED_BIT
, &tmp
->status
)) {
/* NOTE(review): ct is used below but the 'ct = tmp' assignment is
 * not visible in this extraction. */
540 atomic_inc(&ct
->ct_general
.use
);
544 read_unlock_bh(&ip_conntrack_lock
);
/* Stopping the timer means we own the teardown path. */
549 if (del_timer(&ct
->timeout
)) {
550 death_by_timeout((unsigned long)ct
);
552 CONNTRACK_STAT_INC(early_drop
);
554 ip_conntrack_put(ct
);
/* Masked search of the registered-helpers list; caller holds the lock. */
558 static struct ip_conntrack_helper
*
559 __ip_conntrack_helper_find( const struct ip_conntrack_tuple
*tuple
)
561 struct ip_conntrack_helper
*h
;
563 list_for_each_entry(h
, &helpers
, list
) {
564 if (ip_ct_tuple_mask_cmp(tuple
, &h
->tuple
, &h
->mask
))
/* Find a helper and pin its module so it cannot unload while in use. */
570 struct ip_conntrack_helper
*
571 ip_conntrack_helper_find_get( const struct ip_conntrack_tuple
*tuple
)
573 struct ip_conntrack_helper
*helper
;
575 /* need ip_conntrack_lock to assure that helper exists until
576 * try_module_get() is called */
577 read_lock_bh(&ip_conntrack_lock
);
579 helper
= __ip_conntrack_helper_find(tuple
);
581 /* need to increase module usage count to assure helper will
582 * not go away while the caller is e.g. busy putting a
583 * conntrack in the hash that uses the helper */
584 if (!try_module_get(helper
->me
))
588 read_unlock_bh(&ip_conntrack_lock
);
/* Release the module reference taken by ip_conntrack_helper_find_get(). */
593 void ip_conntrack_helper_put(struct ip_conntrack_helper
*helper
)
595 module_put(helper
->me
);
/* Raw protocol-table lookup; the slot may legitimately be NULL. */
598 struct ip_conntrack_protocol
*
599 __ip_conntrack_proto_find(u_int8_t protocol
)
601 return ip_ct_protos
[protocol
];
604 /* this is guaranteed to always return a valid protocol helper, since
605 * it falls back to generic_protocol */
606 struct ip_conntrack_protocol
*
607 ip_conntrack_proto_find_get(u_int8_t protocol
)
609 struct ip_conntrack_protocol
*p
;
612 p
= __ip_conntrack_proto_find(protocol
);
/* Fall back to the generic tracker if the module is going away. */
614 if (!try_module_get(p
->me
))
615 p
= &ip_conntrack_generic_protocol
;
/* Drop the reference taken by ip_conntrack_proto_find_get().
 * NOTE(review): the body is not visible in this extraction. */
622 void ip_conntrack_proto_put(struct ip_conntrack_protocol
*p
)
/*
 * Allocate and minimally initialize a conntrack for orig/repl tuples.
 * May early-drop an old entry when over ip_conntrack_max; returns
 * ERR_PTR(-ENOMEM) on pressure or allocation failure.  The timeout
 * timer is prepared but not armed until confirmation.
 */
627 struct ip_conntrack
*ip_conntrack_alloc(struct ip_conntrack_tuple
*orig
,
628 struct ip_conntrack_tuple
*repl
)
630 struct ip_conntrack
*conntrack
;
/* Lazily seed the hash on first allocation. */
632 if (!ip_conntrack_hash_rnd_initted
) {
633 get_random_bytes(&ip_conntrack_hash_rnd
, 4);
634 ip_conntrack_hash_rnd_initted
= 1;
637 /* We don't want any race condition at early drop stage */
638 atomic_inc(&ip_conntrack_count
);
641 && atomic_read(&ip_conntrack_count
) > ip_conntrack_max
) {
642 unsigned int hash
= hash_conntrack(orig
);
643 /* Try dropping from this hash chain. */
644 if (!early_drop(&ip_conntrack_hash
[hash
])) {
645 atomic_dec(&ip_conntrack_count
);
648 "ip_conntrack: table full, dropping"
650 return ERR_PTR(-ENOMEM
);
654 conntrack
= kmem_cache_alloc(ip_conntrack_cachep
, GFP_ATOMIC
);
656 DEBUGP("Can't allocate conntrack.\n");
657 atomic_dec(&ip_conntrack_count
);
658 return ERR_PTR(-ENOMEM
);
/* Start from a zeroed entry holding one reference for the caller. */
661 memset(conntrack
, 0, sizeof(*conntrack
));
662 atomic_set(&conntrack
->ct_general
.use
, 1);
663 conntrack
->ct_general
.destroy
= destroy_conntrack
;
664 conntrack
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
= *orig
;
665 conntrack
->tuplehash
[IP_CT_DIR_REPLY
].tuple
= *repl
;
666 /* Don't set timer yet: wait for confirmation */
667 init_timer(&conntrack
->timeout
);
668 conntrack
->timeout
.data
= (unsigned long)conntrack
;
669 conntrack
->timeout
.function
= death_by_timeout
;
/* Return the entry to the slab and drop it from the global count. */
675 ip_conntrack_free(struct ip_conntrack
*conntrack
)
677 atomic_dec(&ip_conntrack_count
);
678 kmem_cache_free(ip_conntrack_cachep
, conntrack
);
681 /* Allocate a new conntrack: we return -ENOMEM if classification
682 * failed due to stress. Otherwise it really is unclassifiable */
/*
 * Build a fresh conntrack for an unknown tuple.  If a pending
 * expectation matches, mark the new entry EXPECTED and inherit
 * mark/NAT/secmark state from the master; otherwise pick an
 * application helper keyed on the reply tuple.  The entry is parked on
 * the unconfirmed list until __ip_conntrack_confirm().
 */
683 static struct ip_conntrack_tuple_hash
*
684 init_conntrack(struct ip_conntrack_tuple
*tuple
,
685 struct ip_conntrack_protocol
*protocol
,
688 struct ip_conntrack
*conntrack
;
689 struct ip_conntrack_tuple repl_tuple
;
690 struct ip_conntrack_expect
*exp
;
692 if (!ip_ct_invert_tuple(&repl_tuple
, tuple
, protocol
)) {
693 DEBUGP("Can't invert tuple.\n");
697 conntrack
= ip_conntrack_alloc(tuple
, &repl_tuple
);
698 if (conntrack
== NULL
|| IS_ERR(conntrack
))
699 return (struct ip_conntrack_tuple_hash
*)conntrack
;
/* Let the L4 protocol veto / initialize per-connection state. */
701 if (!protocol
->new(conntrack
, skb
)) {
702 ip_conntrack_free(conntrack
);
706 write_lock_bh(&ip_conntrack_lock
);
707 exp
= find_expectation(tuple
);
710 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
712 /* Welcome, Mr. Bond. We've been expecting you... */
713 __set_bit(IPS_EXPECTED_BIT
, &conntrack
->status
);
714 conntrack
->master
= exp
->master
;
715 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
/* Inherit per-connection mark from the master. */
716 conntrack
->mark
= exp
->master
->mark
;
718 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
719 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
720 /* this is ugly, but there is no other place where to put it */
721 conntrack
->nat
.masq_index
= exp
->master
->nat
.masq_index
;
723 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
724 conntrack
->secmark
= exp
->master
->secmark
;
/* Hold a reference on the master for the lifetime of the child. */
726 nf_conntrack_get(&conntrack
->master
->ct_general
);
727 CONNTRACK_STAT_INC(expect_new
);
/* No expectation: look up a helper keyed on the reply tuple. */
729 conntrack
->helper
= __ip_conntrack_helper_find(&repl_tuple
);
731 CONNTRACK_STAT_INC(new);
734 /* Overload tuple linked list to put us in unconfirmed list. */
735 list_add(&conntrack
->tuplehash
[IP_CT_DIR_ORIGINAL
].list
, &unconfirmed
);
737 write_unlock_bh(&ip_conntrack_lock
);
/* Run the expectation callback outside the lock, then release it. */
741 exp
->expectfn(conntrack
, exp
);
742 ip_conntrack_expect_put(exp
);
745 return &conntrack
->tuplehash
[IP_CT_DIR_ORIGINAL
];
748 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
/*
 * Map a packet to its conntrack: extract the tuple, look it up (or
 * create it via init_conntrack), classify the packet as
 * NEW / ESTABLISHED / RELATED (+IS_REPLY), and attach ct to the skb.
 */
749 static inline struct ip_conntrack
*
750 resolve_normal_ct(struct sk_buff
*skb
,
751 struct ip_conntrack_protocol
*proto
,
753 unsigned int hooknum
,
754 enum ip_conntrack_info
*ctinfo
)
756 struct ip_conntrack_tuple tuple
;
757 struct ip_conntrack_tuple_hash
*h
;
758 struct ip_conntrack
*ct
;
/* Fragments must have been reassembled before we get here. */
760 IP_NF_ASSERT((skb
->nh
.iph
->frag_off
& htons(IP_OFFSET
)) == 0);
762 if (!ip_ct_get_tuple(skb
->nh
.iph
, skb
, skb
->nh
.iph
->ihl
*4,
766 /* look for tuple match */
767 h
= ip_conntrack_find_get(&tuple
, NULL
);
769 h
= init_conntrack(&tuple
, proto
, skb
);
775 ct
= tuplehash_to_ctrack(h
);
777 /* It exists; we have (non-exclusive) reference. */
778 if (DIRECTION(h
) == IP_CT_DIR_REPLY
) {
779 *ctinfo
= IP_CT_ESTABLISHED
+ IP_CT_IS_REPLY
;
780 /* Please set reply bit if this packet OK */
783 /* Once we've had two way comms, always ESTABLISHED. */
784 if (test_bit(IPS_SEEN_REPLY_BIT
, &ct
->status
)) {
785 DEBUGP("ip_conntrack_in: normal packet for %p\n",
787 *ctinfo
= IP_CT_ESTABLISHED
;
788 } else if (test_bit(IPS_EXPECTED_BIT
, &ct
->status
)) {
789 DEBUGP("ip_conntrack_in: related packet for %p\n",
791 *ctinfo
= IP_CT_RELATED
;
793 DEBUGP("ip_conntrack_in: new packet for %p\n",
/* Hand our reference over to the skb. */
799 skb
->nfct
= &ct
->ct_general
;
800 skb
->nfctinfo
= *ctinfo
;
804 /* Netfilter hook itself. */
/*
 * Hook entry point run for every IPv4 packet on the conntrack hooks:
 * skips already-tracked/broadcast packets, resolves or creates the
 * conntrack, runs the L4 protocol state machine, and flags the first
 * reply-direction packet.
 */
805 unsigned int ip_conntrack_in(unsigned int hooknum
,
806 struct sk_buff
**pskb
,
807 const struct net_device
*in
,
808 const struct net_device
*out
,
809 int (*okfn
)(struct sk_buff
*))
811 struct ip_conntrack
*ct
;
812 enum ip_conntrack_info ctinfo
;
813 struct ip_conntrack_protocol
*proto
;
817 /* Previously seen (loopback or untracked)? Ignore. */
819 CONNTRACK_STAT_INC(ignore
);
/* Non-head fragments must have been reassembled before this hook. */
824 if ((*pskb
)->nh
.iph
->frag_off
& htons(IP_OFFSET
)) {
825 if (net_ratelimit()) {
826 printk(KERN_ERR
"ip_conntrack_in: Frag of proto %u (hook=%u)\n",
827 (*pskb
)->nh
.iph
->protocol
, hooknum
);
832 /* Doesn't cover locally-generated broadcast, so not worth it. */
834 /* Ignore broadcast: no `connection'. */
835 if ((*pskb
)->pkt_type
== PACKET_BROADCAST
) {
836 printk("Broadcast packet!\n");
838 } else if (((*pskb
)->nh
.iph
->daddr
& htonl(0x000000FF))
839 == htonl(0x000000FF)) {
840 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
841 NIPQUAD((*pskb
)->nh
.iph
->saddr
),
842 NIPQUAD((*pskb
)->nh
.iph
->daddr
),
843 (*pskb
)->sk
, (*pskb
)->pkt_type
);
847 proto
= __ip_conntrack_proto_find((*pskb
)->nh
.iph
->protocol
);
849 /* It may be an special packet, error, unclean...
850 * inverse of the return code tells to the netfilter
851 * core what to do with the packet. */
852 if (proto
->error
!= NULL
853 && (ret
= proto
->error(*pskb
, &ctinfo
, hooknum
)) <= 0) {
854 CONNTRACK_STAT_INC(error
);
855 CONNTRACK_STAT_INC(invalid
);
859 if (!(ct
= resolve_normal_ct(*pskb
, proto
,&set_reply
,hooknum
,&ctinfo
))) {
860 /* Not valid part of a connection */
861 CONNTRACK_STAT_INC(invalid
);
866 /* Too stressed to deal. */
867 CONNTRACK_STAT_INC(drop
);
871 IP_NF_ASSERT((*pskb
)->nfct
);
/* Run the L4 state machine; a negative result invalidates the packet. */
873 ret
= proto
->packet(ct
, *pskb
, ctinfo
);
875 /* Invalid: inverse of the return code tells
876 * the netfilter core what to do*/
877 nf_conntrack_put((*pskb
)->nfct
);
878 (*pskb
)->nfct
= NULL
;
879 CONNTRACK_STAT_INC(invalid
);
/* First packet in the reply direction: record it and emit an event. */
883 if (set_reply
&& !test_and_set_bit(IPS_SEEN_REPLY_BIT
, &ct
->status
))
884 ip_conntrack_event_cache(IPCT_STATUS
, *pskb
);
/* Invert a tuple using the protocol tracker keyed on its protonum. */
889 int invert_tuplepr(struct ip_conntrack_tuple
*inverse
,
890 const struct ip_conntrack_tuple
*orig
)
892 return ip_ct_invert_tuple(inverse
, orig
,
893 __ip_conntrack_proto_find(orig
->dst
.protonum
));
896 /* Would two expected things clash? */
897 static inline int expect_clash(const struct ip_conntrack_expect
*a
,
898 const struct ip_conntrack_expect
*b
)
900 /* Part covered by intersection of masks must be unequal,
901 otherwise they clash */
/* Build the AND of both masks field by field, then compare the two
 * tuples under that intersection. */
902 struct ip_conntrack_tuple intersect_mask
903 = { { a
->mask
.src
.ip
& b
->mask
.src
.ip
,
904 { a
->mask
.src
.u
.all
& b
->mask
.src
.u
.all
} },
905 { a
->mask
.dst
.ip
& b
->mask
.dst
.ip
,
906 { a
->mask
.dst
.u
.all
& b
->mask
.dst
.u
.all
},
907 a
->mask
.dst
.protonum
& b
->mask
.dst
.protonum
} };
909 return ip_ct_tuple_mask_cmp(&a
->tuple
, &b
->tuple
, &intersect_mask
);
/* Two expectations are duplicates iff same master, tuple and mask. */
912 static inline int expect_matches(const struct ip_conntrack_expect
*a
,
913 const struct ip_conntrack_expect
*b
)
915 return a
->master
== b
->master
916 && ip_ct_tuple_equal(&a
->tuple
, &b
->tuple
)
917 && ip_ct_tuple_equal(&a
->mask
, &b
->mask
);
920 /* Generally a bad idea to call this: could have matched already. */
/* Remove a matching, still-pending expectation from the global list. */
921 void ip_conntrack_unexpect_related(struct ip_conntrack_expect
*exp
)
923 struct ip_conntrack_expect
*i
;
925 write_lock_bh(&ip_conntrack_lock
);
926 /* choose the the oldest expectation to evict */
927 list_for_each_entry_reverse(i
, &ip_conntrack_expect_list
, list
) {
928 if (expect_matches(i
, exp
) && del_timer(&i
->timeout
)) {
929 ip_ct_unlink_expect(i
);
930 write_unlock_bh(&ip_conntrack_lock
);
931 ip_conntrack_expect_put(i
);
935 write_unlock_bh(&ip_conntrack_lock
);
938 /* We don't increase the master conntrack refcount for non-fulfilled
939 * conntracks. During the conntrack destruction, the expectations are
940 * always killed before the conntrack itself */
/* Allocate an expectation; the caller owns the single initial ref. */
941 struct ip_conntrack_expect
*ip_conntrack_expect_alloc(struct ip_conntrack
*me
)
943 struct ip_conntrack_expect
*new;
945 new = kmem_cache_alloc(ip_conntrack_expect_cachep
, GFP_ATOMIC
);
947 DEBUGP("expect_related: OOM allocating expect\n");
951 atomic_set(&new->use
, 1);
/* Drop a reference; frees the expectation on the final put. */
955 void ip_conntrack_expect_put(struct ip_conntrack_expect
*exp
)
957 if (atomic_dec_and_test(&exp
->use
))
958 kmem_cache_free(ip_conntrack_expect_cachep
, exp
);
/* Link an expectation into the global list and arm its timeout (the
 * extra gets are for the list and the timer).  Caller holds
 * ip_conntrack_lock for writing. */
961 static void ip_conntrack_expect_insert(struct ip_conntrack_expect
*exp
)
963 atomic_inc(&exp
->use
);
964 exp
->master
->expecting
++;
965 list_add(&exp
->list
, &ip_conntrack_expect_list
);
967 init_timer(&exp
->timeout
);
968 exp
->timeout
.data
= (unsigned long)exp
;
969 exp
->timeout
.function
= expectation_timed_out
;
970 exp
->timeout
.expires
= jiffies
+ exp
->master
->helper
->timeout
* HZ
;
971 add_timer(&exp
->timeout
);
973 exp
->id
= ++ip_conntrack_expect_next_id
;
974 atomic_inc(&exp
->use
);
975 CONNTRACK_STAT_INC(expect_create
);
978 /* Race with expectations being used means we could have none to find; OK. */
/* Evict the oldest pending expectation belonging to 'master'. */
979 static void evict_oldest_expect(struct ip_conntrack
*master
)
981 struct ip_conntrack_expect
*i
;
983 list_for_each_entry_reverse(i
, &ip_conntrack_expect_list
, list
) {
984 if (i
->master
== master
) {
985 if (del_timer(&i
->timeout
)) {
986 ip_ct_unlink_expect(i
);
987 ip_conntrack_expect_put(i
);
/* Restart an expectation's timer; fails if it is already firing. */
994 static inline int refresh_timer(struct ip_conntrack_expect
*i
)
996 if (!del_timer(&i
->timeout
))
999 i
->timeout
.expires
= jiffies
+ i
->master
->helper
->timeout
*HZ
;
1000 add_timer(&i
->timeout
);
/*
 * Register an expectation: refresh the timer of an identical one when
 * present, bail on a clash, evict the oldest entry when the helper's
 * quota is reached, then insert and emit IPEXP_NEW.
 */
1004 int ip_conntrack_expect_related(struct ip_conntrack_expect
*expect
)
1006 struct ip_conntrack_expect
*i
;
1009 DEBUGP("ip_conntrack_expect_related %p\n", related_to
);
1010 DEBUGP("tuple: "); DUMP_TUPLE(&expect
->tuple
);
1011 DEBUGP("mask: "); DUMP_TUPLE(&expect
->mask
);
1013 write_lock_bh(&ip_conntrack_lock
);
1014 list_for_each_entry(i
, &ip_conntrack_expect_list
, list
) {
1015 if (expect_matches(i
, expect
)) {
1016 /* Refresh timer: if it's dying, ignore.. */
1017 if (refresh_timer(i
)) {
1021 } else if (expect_clash(i
, expect
)) {
1027 /* Will be over limit? */
1028 if (expect
->master
->helper
->max_expected
&&
1029 expect
->master
->expecting
>= expect
->master
->helper
->max_expected
)
1030 evict_oldest_expect(expect
->master
);
1032 ip_conntrack_expect_insert(expect
);
1033 ip_conntrack_expect_event(IPEXP_NEW
, expect
);
1036 write_unlock_bh(&ip_conntrack_lock
);
1040 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1041 implicitly racy: see __ip_conntrack_confirm */
1042 void ip_conntrack_alter_reply(struct ip_conntrack
*conntrack
,
1043 const struct ip_conntrack_tuple
*newreply
)
1045 write_lock_bh(&ip_conntrack_lock
);
1046 /* Should be unconfirmed, so not in hash table yet */
1047 IP_NF_ASSERT(!is_confirmed(conntrack
));
1049 DEBUGP("Altering reply tuple of %p to ", conntrack
);
1050 DUMP_TUPLE(newreply
);
1052 conntrack
->tuplehash
[IP_CT_DIR_REPLY
].tuple
= *newreply
;
/* Re-pick a helper from the new reply tuple unless this entry is an
 * expected child or already has expectations of its own. */
1053 if (!conntrack
->master
&& conntrack
->expecting
== 0)
1054 conntrack
->helper
= __ip_conntrack_helper_find(newreply
);
1055 write_unlock_bh(&ip_conntrack_lock
);
/* Add a helper to the global list; timeout must be non-zero because
 * expectation timers multiply by it. */
1058 int ip_conntrack_helper_register(struct ip_conntrack_helper
*me
)
1060 BUG_ON(me
->timeout
== 0);
1061 write_lock_bh(&ip_conntrack_lock
);
1062 list_add(&me
->list
, &helpers
);
1063 write_unlock_bh(&ip_conntrack_lock
);
/* Exact-name lookup in the helper list; caller holds the lock. */
1068 struct ip_conntrack_helper
*
1069 __ip_conntrack_helper_find_byname(const char *name
)
1071 struct ip_conntrack_helper
*h
;
1073 list_for_each_entry(h
, &helpers
, list
) {
1074 if (!strcmp(h
->name
, name
))
/* Detach helper 'me' from one conntrack, emitting a HELPER event. */
1081 static inline void unhelp(struct ip_conntrack_tuple_hash
*i
,
1082 const struct ip_conntrack_helper
*me
)
1084 if (tuplehash_to_ctrack(i
)->helper
== me
) {
1085 ip_conntrack_event(IPCT_HELPER
, tuplehash_to_ctrack(i
));
1086 tuplehash_to_ctrack(i
)->helper
= NULL
;
/*
 * Unregister a helper: take it off the list, kill expectations whose
 * master uses it, and clear the helper pointer from every conntrack
 * (unconfirmed list and all hash buckets).
 */
1090 void ip_conntrack_helper_unregister(struct ip_conntrack_helper
*me
)
1093 struct ip_conntrack_tuple_hash
*h
;
1094 struct ip_conntrack_expect
*exp
, *tmp
;
1096 /* Need write lock here, to delete helper. */
1097 write_lock_bh(&ip_conntrack_lock
);
1098 list_del(&me
->list
);
1100 /* Get rid of expectations */
1101 list_for_each_entry_safe(exp
, tmp
, &ip_conntrack_expect_list
, list
) {
1102 if (exp
->master
->helper
== me
&& del_timer(&exp
->timeout
)) {
1103 ip_ct_unlink_expect(exp
);
1104 ip_conntrack_expect_put(exp
);
1107 /* Get rid of expecteds, set helpers to NULL. */
1108 list_for_each_entry(h
, &unconfirmed
, list
)
1110 for (i
= 0; i
< ip_conntrack_htable_size
; i
++) {
1111 list_for_each_entry(h
, &ip_conntrack_hash
[i
], list
)
1114 write_unlock_bh(&ip_conntrack_lock
);
1116 /* Someone could be still looking at the helper in a bh. */
1120 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
/*
 * Extend a conntrack's timeout -- absolute for unconfirmed entries
 * (armed later at confirmation), jiffies-relative once the timer is
 * live -- optionally accumulate packet/byte counters, and emit the
 * resulting event after the lock is dropped.
 */
1121 void __ip_ct_refresh_acct(struct ip_conntrack
*ct
,
1122 enum ip_conntrack_info ctinfo
,
1123 const struct sk_buff
*skb
,
1124 unsigned long extra_jiffies
,
1129 IP_NF_ASSERT(ct
->timeout
.data
== (unsigned long)ct
);
1132 write_lock_bh(&ip_conntrack_lock
);
1134 /* Only update if this is not a fixed timeout */
1135 if (test_bit(IPS_FIXED_TIMEOUT_BIT
, &ct
->status
)) {
1136 write_unlock_bh(&ip_conntrack_lock
);
1140 /* If not in hash table, timer will not be active yet */
1141 if (!is_confirmed(ct
)) {
1142 ct
->timeout
.expires
= extra_jiffies
;
1143 event
= IPCT_REFRESH
;
1145 /* Need del_timer for race avoidance (may already be dying). */
1146 if (del_timer(&ct
->timeout
)) {
1147 ct
->timeout
.expires
= jiffies
+ extra_jiffies
;
1148 add_timer(&ct
->timeout
);
1149 event
= IPCT_REFRESH
;
1153 #ifdef CONFIG_IP_NF_CT_ACCT
/* Per-direction accounting; flag near-overflow so userspace can read
 * and reset the counters before they wrap. */
1155 ct
->counters
[CTINFO2DIR(ctinfo
)].packets
++;
1156 ct
->counters
[CTINFO2DIR(ctinfo
)].bytes
+=
1157 ntohs(skb
->nh
.iph
->tot_len
);
1158 if ((ct
->counters
[CTINFO2DIR(ctinfo
)].packets
& 0x80000000)
1159 || (ct
->counters
[CTINFO2DIR(ctinfo
)].bytes
& 0x80000000))
1160 event
|= IPCT_COUNTER_FILLING
;
1164 write_unlock_bh(&ip_conntrack_lock
);
1166 /* must be unlocked when calling event cache */
1168 ip_conntrack_event_cache(event
, skb
);
1171 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1172 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1173 /* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1174 * in ip_conntrack_core, since we don't want the protocols to autoload
1175 * or depend on ctnetlink */
1176 int ip_ct_port_tuple_to_nfattr(struct sk_buff
*skb
,
1177 const struct ip_conntrack_tuple
*tuple
)
1179 NFA_PUT(skb
, CTA_PROTO_SRC_PORT
, sizeof(__be16
),
1180 &tuple
->src
.u
.tcp
.port
);
1181 NFA_PUT(skb
, CTA_PROTO_DST_PORT
, sizeof(__be16
),
1182 &tuple
->dst
.u
.tcp
.port
);
1189 int ip_ct_port_nfattr_to_tuple(struct nfattr
*tb
[],
1190 struct ip_conntrack_tuple
*t
)
1192 if (!tb
[CTA_PROTO_SRC_PORT
-1] || !tb
[CTA_PROTO_DST_PORT
-1])
1196 *(__be16
*)NFA_DATA(tb
[CTA_PROTO_SRC_PORT
-1]);
1198 *(__be16
*)NFA_DATA(tb
[CTA_PROTO_DST_PORT
-1]);
1204 /* Returns new sk_buff, or NULL */
1206 ip_ct_gather_frags(struct sk_buff
*skb
, u_int32_t user
)
1211 skb
= ip_defrag(skb
, user
);
1215 ip_send_check(skb
->nh
.iph
);
1219 /* Used by ipt_REJECT. */
1220 static void ip_conntrack_attach(struct sk_buff
*nskb
, struct sk_buff
*skb
)
1222 struct ip_conntrack
*ct
;
1223 enum ip_conntrack_info ctinfo
;
1225 /* This ICMP is in reverse direction to the packet which caused it */
1226 ct
= ip_conntrack_get(skb
, &ctinfo
);
1228 if (CTINFO2DIR(ctinfo
) == IP_CT_DIR_ORIGINAL
)
1229 ctinfo
= IP_CT_RELATED
+ IP_CT_IS_REPLY
;
1231 ctinfo
= IP_CT_RELATED
;
1233 /* Attach to new skbuff, and increment count */
1234 nskb
->nfct
= &ct
->ct_general
;
1235 nskb
->nfctinfo
= ctinfo
;
1236 nf_conntrack_get(nskb
->nfct
);
1239 /* Bring out ya dead! */
1240 static struct ip_conntrack
*
1241 get_next_corpse(int (*iter
)(struct ip_conntrack
*i
, void *data
),
1242 void *data
, unsigned int *bucket
)
1244 struct ip_conntrack_tuple_hash
*h
;
1245 struct ip_conntrack
*ct
;
1247 write_lock_bh(&ip_conntrack_lock
);
1248 for (; *bucket
< ip_conntrack_htable_size
; (*bucket
)++) {
1249 list_for_each_entry(h
, &ip_conntrack_hash
[*bucket
], list
) {
1250 ct
= tuplehash_to_ctrack(h
);
1255 list_for_each_entry(h
, &unconfirmed
, list
) {
1256 ct
= tuplehash_to_ctrack(h
);
1260 write_unlock_bh(&ip_conntrack_lock
);
1264 atomic_inc(&ct
->ct_general
.use
);
1265 write_unlock_bh(&ip_conntrack_lock
);
1270 ip_ct_iterate_cleanup(int (*iter
)(struct ip_conntrack
*i
, void *), void *data
)
1272 struct ip_conntrack
*ct
;
1273 unsigned int bucket
= 0;
1275 while ((ct
= get_next_corpse(iter
, data
, &bucket
)) != NULL
) {
1276 /* Time to push up daises... */
1277 if (del_timer(&ct
->timeout
))
1278 death_by_timeout((unsigned long)ct
);
1279 /* ... else the timer will get him soon. */
1281 ip_conntrack_put(ct
);
1285 /* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
1287 /* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
1290 getorigdst(struct sock
*sk
, int optval
, void __user
*user
, int *len
)
1292 struct inet_sock
*inet
= inet_sk(sk
);
1293 struct ip_conntrack_tuple_hash
*h
;
1294 struct ip_conntrack_tuple tuple
;
1296 IP_CT_TUPLE_U_BLANK(&tuple
);
1297 tuple
.src
.ip
= inet
->rcv_saddr
;
1298 tuple
.src
.u
.tcp
.port
= inet
->sport
;
1299 tuple
.dst
.ip
= inet
->daddr
;
1300 tuple
.dst
.u
.tcp
.port
= inet
->dport
;
1301 tuple
.dst
.protonum
= IPPROTO_TCP
;
1303 /* We only do TCP at the moment: is there a better way? */
1304 if (strcmp(sk
->sk_prot
->name
, "TCP")) {
1305 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1306 return -ENOPROTOOPT
;
1309 if ((unsigned int) *len
< sizeof(struct sockaddr_in
)) {
1310 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1311 *len
, sizeof(struct sockaddr_in
));
1315 h
= ip_conntrack_find_get(&tuple
, NULL
);
1317 struct sockaddr_in sin
;
1318 struct ip_conntrack
*ct
= tuplehash_to_ctrack(h
);
1320 sin
.sin_family
= AF_INET
;
1321 sin
.sin_port
= ct
->tuplehash
[IP_CT_DIR_ORIGINAL
]
1322 .tuple
.dst
.u
.tcp
.port
;
1323 sin
.sin_addr
.s_addr
= ct
->tuplehash
[IP_CT_DIR_ORIGINAL
]
1325 memset(sin
.sin_zero
, 0, sizeof(sin
.sin_zero
));
1327 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1328 NIPQUAD(sin
.sin_addr
.s_addr
), ntohs(sin
.sin_port
));
1329 ip_conntrack_put(ct
);
1330 if (copy_to_user(user
, &sin
, sizeof(sin
)) != 0)
1335 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1336 NIPQUAD(tuple
.src
.ip
), ntohs(tuple
.src
.u
.tcp
.port
),
1337 NIPQUAD(tuple
.dst
.ip
), ntohs(tuple
.dst
.u
.tcp
.port
));
1341 static struct nf_sockopt_ops so_getorigdst
= {
1343 .get_optmin
= SO_ORIGINAL_DST
,
1344 .get_optmax
= SO_ORIGINAL_DST
+1,
1348 static int kill_all(struct ip_conntrack
*i
, void *data
)
1353 void ip_conntrack_flush(void)
1355 ip_ct_iterate_cleanup(kill_all
, NULL
);
1358 static void free_conntrack_hash(struct list_head
*hash
, int vmalloced
,int size
)
1363 free_pages((unsigned long)hash
,
1364 get_order(sizeof(struct list_head
) * size
));
1367 /* Mishearing the voices in his head, our hero wonders how he's
1368 supposed to kill the mall. */
1369 void ip_conntrack_cleanup(void)
1371 ip_ct_attach
= NULL
;
1373 /* This makes sure all current packets have passed through
1374 netfilter framework. Roll on, two-stage module
1378 ip_ct_event_cache_flush();
1380 ip_conntrack_flush();
1381 if (atomic_read(&ip_conntrack_count
) != 0) {
1383 goto i_see_dead_people
;
1385 /* wait until all references to ip_conntrack_untracked are dropped */
1386 while (atomic_read(&ip_conntrack_untracked
.ct_general
.use
) > 1)
1389 kmem_cache_destroy(ip_conntrack_cachep
);
1390 kmem_cache_destroy(ip_conntrack_expect_cachep
);
1391 free_conntrack_hash(ip_conntrack_hash
, ip_conntrack_vmalloc
,
1392 ip_conntrack_htable_size
);
1393 nf_unregister_sockopt(&so_getorigdst
);
1396 static struct list_head
*alloc_hashtable(int size
, int *vmalloced
)
1398 struct list_head
*hash
;
1402 hash
= (void*)__get_free_pages(GFP_KERNEL
,
1403 get_order(sizeof(struct list_head
)
1407 printk(KERN_WARNING
"ip_conntrack: falling back to vmalloc.\n");
1408 hash
= vmalloc(sizeof(struct list_head
) * size
);
1412 for (i
= 0; i
< size
; i
++)
1413 INIT_LIST_HEAD(&hash
[i
]);
1418 static int set_hashsize(const char *val
, struct kernel_param
*kp
)
1420 int i
, bucket
, hashsize
, vmalloced
;
1421 int old_vmalloced
, old_size
;
1423 struct list_head
*hash
, *old_hash
;
1424 struct ip_conntrack_tuple_hash
*h
;
1426 /* On boot, we can set this without any fancy locking. */
1427 if (!ip_conntrack_htable_size
)
1428 return param_set_int(val
, kp
);
1430 hashsize
= simple_strtol(val
, NULL
, 0);
1434 hash
= alloc_hashtable(hashsize
, &vmalloced
);
1438 /* We have to rehash for the new table anyway, so we also can
1439 * use a new random seed */
1440 get_random_bytes(&rnd
, 4);
1442 write_lock_bh(&ip_conntrack_lock
);
1443 for (i
= 0; i
< ip_conntrack_htable_size
; i
++) {
1444 while (!list_empty(&ip_conntrack_hash
[i
])) {
1445 h
= list_entry(ip_conntrack_hash
[i
].next
,
1446 struct ip_conntrack_tuple_hash
, list
);
1448 bucket
= __hash_conntrack(&h
->tuple
, hashsize
, rnd
);
1449 list_add_tail(&h
->list
, &hash
[bucket
]);
1452 old_size
= ip_conntrack_htable_size
;
1453 old_vmalloced
= ip_conntrack_vmalloc
;
1454 old_hash
= ip_conntrack_hash
;
1456 ip_conntrack_htable_size
= hashsize
;
1457 ip_conntrack_vmalloc
= vmalloced
;
1458 ip_conntrack_hash
= hash
;
1459 ip_conntrack_hash_rnd
= rnd
;
1460 write_unlock_bh(&ip_conntrack_lock
);
1462 free_conntrack_hash(old_hash
, old_vmalloced
, old_size
);
1466 module_param_call(hashsize
, set_hashsize
, param_get_uint
,
1467 &ip_conntrack_htable_size
, 0600);
1469 int __init
ip_conntrack_init(void)
1474 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1475 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1476 if (!ip_conntrack_htable_size
) {
1477 ip_conntrack_htable_size
1478 = (((num_physpages
<< PAGE_SHIFT
) / 16384)
1479 / sizeof(struct list_head
));
1480 if (num_physpages
> (1024 * 1024 * 1024 / PAGE_SIZE
))
1481 ip_conntrack_htable_size
= 8192;
1482 if (ip_conntrack_htable_size
< 16)
1483 ip_conntrack_htable_size
= 16;
1485 ip_conntrack_max
= 8 * ip_conntrack_htable_size
;
1487 printk("ip_conntrack version %s (%u buckets, %d max)"
1488 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION
,
1489 ip_conntrack_htable_size
, ip_conntrack_max
,
1490 sizeof(struct ip_conntrack
));
1492 ret
= nf_register_sockopt(&so_getorigdst
);
1494 printk(KERN_ERR
"Unable to register netfilter socket option\n");
1498 ip_conntrack_hash
= alloc_hashtable(ip_conntrack_htable_size
,
1499 &ip_conntrack_vmalloc
);
1500 if (!ip_conntrack_hash
) {
1501 printk(KERN_ERR
"Unable to create ip_conntrack_hash\n");
1502 goto err_unreg_sockopt
;
1505 ip_conntrack_cachep
= kmem_cache_create("ip_conntrack",
1506 sizeof(struct ip_conntrack
), 0,
1508 if (!ip_conntrack_cachep
) {
1509 printk(KERN_ERR
"Unable to create ip_conntrack slab cache\n");
1513 ip_conntrack_expect_cachep
= kmem_cache_create("ip_conntrack_expect",
1514 sizeof(struct ip_conntrack_expect
),
1516 if (!ip_conntrack_expect_cachep
) {
1517 printk(KERN_ERR
"Unable to create ip_expect slab cache\n");
1518 goto err_free_conntrack_slab
;
1521 /* Don't NEED lock here, but good form anyway. */
1522 write_lock_bh(&ip_conntrack_lock
);
1523 for (i
= 0; i
< MAX_IP_CT_PROTO
; i
++)
1524 ip_ct_protos
[i
] = &ip_conntrack_generic_protocol
;
1525 /* Sew in builtin protocols. */
1526 ip_ct_protos
[IPPROTO_TCP
] = &ip_conntrack_protocol_tcp
;
1527 ip_ct_protos
[IPPROTO_UDP
] = &ip_conntrack_protocol_udp
;
1528 ip_ct_protos
[IPPROTO_ICMP
] = &ip_conntrack_protocol_icmp
;
1529 write_unlock_bh(&ip_conntrack_lock
);
1531 /* For use by ipt_REJECT */
1532 ip_ct_attach
= ip_conntrack_attach
;
1534 /* Set up fake conntrack:
1535 - to never be deleted, not in any hashes */
1536 atomic_set(&ip_conntrack_untracked
.ct_general
.use
, 1);
1537 /* - and look it like as a confirmed connection */
1538 set_bit(IPS_CONFIRMED_BIT
, &ip_conntrack_untracked
.status
);
1542 err_free_conntrack_slab
:
1543 kmem_cache_destroy(ip_conntrack_cachep
);
1545 free_conntrack_hash(ip_conntrack_hash
, ip_conntrack_vmalloc
,
1546 ip_conntrack_htable_size
);
1548 nf_unregister_sockopt(&so_getorigdst
);