/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 * */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION    "2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif
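
/* Note: changing the "#if 0" above to "#if 1" turns every DEBUGP()
 * in this file into a printk(), which is how the debug output below
 * is normally enabled. */
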
DECLARE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
static kmem_cache_t *ip_conntrack_expect_cachep;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;

DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

void
ip_conntrack_put(struct ip_conntrack *ct)
{
        IP_NF_ASSERT(ct);
        nf_conntrack_put(&ct->ct_general);
}

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
        dump_tuple(tuple);
#endif
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}
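
/* The two directions of one connection hash differently, since source
 * and destination swap in the reply tuple; that is why functions below
 * such as clean_from_lists() compute two hashes.  A hypothetical sketch
 * of building a key and hashing it, for illustration only: */
#if 0
static void example_hash_usage(void)
{
        struct ip_conntrack_tuple t;

        memset(&t, 0, sizeof(t));
        t.src.ip = htonl(0xc0a80001);           /* 192.168.0.1 */
        t.dst.ip = htonl(0xc0a80002);           /* 192.168.0.2 */
        t.src.u.tcp.port = htons(1024);
        t.dst.u.tcp.port = htons(80);
        t.dst.protonum = IPPROTO_TCP;

        /* Index into ip_conntrack_hash[]; value depends on the
         * random seed ip_conntrack_hash_rnd. */
        printk("bucket=%u\n", hash_conntrack(&t));
}
#endif
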
int
ip_ct_get_tuple(const struct iphdr *iph,
                const struct sk_buff *skb,
                unsigned int dataoff,
                struct ip_conntrack_tuple *tuple,
                const struct ip_conntrack_protocol *protocol)
{
        /* Should never happen: fragments are gathered before we get here. */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
        tuple->dst.dir = IP_CT_DIR_ORIGINAL;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig,
                   const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;
        inverse->dst.dir = !orig->dst.dir;

        return protocol->invert_tuple(inverse, orig);
}
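
/* Inverting an ORIGINAL tuple this way yields the REPLY tuple:
 * src/dst addresses swap here, and the per-protocol part (e.g. TCP
 * ports) is swapped by protocol->invert_tuple().  So a connection
 * 192.168.0.1:1024 -> 192.168.0.2:80 inverts to
 * 192.168.0.2:80 -> 192.168.0.1:1024. */
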
/* ip_conntrack_expect helper functions */
static void destroy_expect(struct ip_conntrack_expect *exp)
{
        ip_conntrack_put(exp->master);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));
        kmem_cache_free(ip_conntrack_expect_cachep, exp);
        CONNTRACK_STAT_INC(expect_delete);
}

static void unlink_expect(struct ip_conntrack_expect *exp)
{
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
        list_del(&exp->list);
        /* Logically in destroy_expect, but we hold the lock here. */
        exp->master->expecting--;
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *exp = (void *)ul_expect;

        WRITE_LOCK(&ip_conntrack_lock);
        unlink_expect(exp);
        WRITE_UNLOCK(&ip_conntrack_lock);
        destroy_expect(exp);
}

/* If an expectation for this connection is found, it gets deleted from
 * the global list, then returned. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                /* If the master is not in the hash table yet (ie. packet
                   hasn't left this machine yet), how could the other end
                   know about the expected connection?  Hence these are
                   not the droids you are looking for (if the master ct
                   never got confirmed, we'd hold a reference to it and
                   weird things would happen to future packets). */
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
                    && is_confirmed(i->master)
                    && del_timer(&i->timeout)) {
                        unlink_expect(i);
                        return i;
                }
        }
        return NULL;
}

/* delete all expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct)
{
        struct ip_conntrack_expect *i, *tmp;

        /* Optimization: most connections never expect any others. */
        if (ct->expecting == 0)
                return;

        list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
                if (i->master == ct && del_timer(&i->timeout)) {
                        unlink_expect(i);
                        destroy_expect(i);
                }
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all pending expectations */
        remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before connection is in the list, so we need to clean here,
         * too. */
        remove_expectations(ct);

        /* We overload first tuple to link into unconfirmed list. */
        if (!is_confirmed(ct)) {
                BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        }

        CONNTRACK_STAT_INC(delete);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (ct->master)
                ip_conntrack_put(ct->master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
        atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Inside lock so preempt is disabled on module removal path.
         * Otherwise we can get spurious warnings. */
        CONNTRACK_STAT_INC(delete_list);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        return tuplehash_to_ctrack(i) != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
                if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
                        CONNTRACK_STAT_INC(found);
                        return h;
                }
                CONNTRACK_STAT_INC(searched);
        }

        return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}
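
/* A successful lookup hands the caller a reference, which must be
 * dropped with ip_conntrack_put() when done.  A minimal, hypothetical
 * usage sketch: */
#if 0
static void example_lookup(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_tuple_hash *h;

        h = ip_conntrack_find_get(tuple, NULL);
        if (h) {
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);

                /* ... inspect ct->status, ct->timeout, etc. ... */
                ip_conntrack_put(ct);   /* drop the reference we took */
        }
}
#endif
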
/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = ip_conntrack_get(*pskb, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in the other direction.  The actual
           packet which created the connection will be IP_CT_NEW or,
           for an expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        WRITE_LOCK(&ip_conntrack_lock);

        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                /* Remove from unconfirmed list */
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
                             &ct->tuplehash[IP_CT_DIR_REPLY]);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                CONNTRACK_STAT_INC(insert);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }

        CONNTRACK_STAT_INC(insert_failed);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return NF_DROP;
}
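
/* After confirmation the connection is reachable from two buckets:
 * ip_conntrack_hash[hash] via the ORIGINAL tuplehash and
 * ip_conntrack_hash[repl_hash] via the REPLY tuplehash, so packets
 * travelling in either direction find the same struct ip_conntrack. */
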
/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        READ_UNLOCK(&ip_conntrack_lock);

        return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
}

static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct = NULL;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h) {
                ct = tuplehash_to_ctrack(h);
                atomic_inc(&ct->ct_general.use);
        }
        READ_UNLOCK(&ip_conntrack_lock);

        if (!ct)
                return dropped;

        if (del_timer(&ct->timeout)) {
                death_by_timeout((unsigned long)ct);
                dropped = 1;
                CONNTRACK_STAT_INC(early_drop);
        }
        ip_conntrack_put(ct);
        return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash;
        struct ip_conntrack_expect *exp;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        hash = hash_conntrack(tuple);

        if (ip_conntrack_max
            && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                /* Try dropping from this hash chain. */
                if (!early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
        if (!protocol->new(conntrack, skb)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
        }
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        WRITE_LOCK(&ip_conntrack_lock);
        exp = find_expectation(tuple);

        if (exp) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, exp);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
                conntrack->mark = exp->master->mark;
#endif
                nf_conntrack_get(&conntrack->master->ct_general);
                CONNTRACK_STAT_INC(expect_new);
        } else {
                conntrack->helper = ip_ct_find_helper(&repl_tuple);

                CONNTRACK_STAT_INC(new);
        }

        /* Overload tuple linked list to put us in unconfirmed list. */
        list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (exp) {
                if (exp->expectfn)
                        exp->expectfn(conntrack, exp);
                destroy_expect(exp);
        }

        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
                             &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }
        ct = tuplehash_to_ctrack(h);

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &ct->ct_general;
        skb->nfctinfo = *ctinfo;
        return ct;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply;
        int ret;

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct) {
                CONNTRACK_STAT_INC(ignore);
                return NF_ACCEPT;
        }

        /* Should never happen */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                       (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

        /* FIXME: Do this right please. --RR */
        (*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

        /* It may be a special packet, error, unclean...
         * The inverse of the return code tells the netfilter
         * core what to do with the packet. */
        if (proto->error != NULL
            && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
                CONNTRACK_STAT_INC(error);
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
                /* Not valid part of a connection */
                CONNTRACK_STAT_INC(invalid);
                return NF_ACCEPT;
        }

        if (IS_ERR(ct)) {
                /* Too stressed to deal. */
                CONNTRACK_STAT_INC(drop);
                return NF_DROP;
        }

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret < 0) {
                /* Invalid: inverse of the return code tells
                 * the netfilter core what to do */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

        return ret;
}
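
/* This hook is not registered here: the nf_hook_ops live in
 * ip_conntrack_standalone.c.  Roughly (a sketch, not copied verbatim
 * from that file), the PRE_ROUTING attachment looks like: */
#if 0
static struct nf_hook_ops ip_conntrack_in_ops = {
        .hook           = ip_conntrack_in,
        .owner          = THIS_MODULE,
        .pf             = PF_INET,
        .hooknum        = NF_IP_PRE_ROUTING,
        .priority       = NF_IP_PRI_CONNTRACK,
};
/* ... followed by nf_register_hook(&ip_conntrack_in_ops) at init. */
#endif
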
int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return ip_ct_invert_tuple(inverse, orig,
                                  ip_ct_find_proto(orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *a,
                               const struct ip_conntrack_expect *b)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { a->mask.src.ip & b->mask.src.ip,
                      { a->mask.src.u.all & b->mask.src.u.all } },
                    { a->mask.dst.ip & b->mask.dst.ip,
                      { a->mask.dst.u.all & b->mask.dst.u.all },
                      a->mask.dst.protonum & b->mask.dst.protonum } };

        return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct ip_conntrack_expect *a,
                                 const struct ip_conntrack_expect *b)
{
        return a->master == b->master
                && ip_ct_tuple_equal(&a->tuple, &b->tuple)
                && ip_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
        struct ip_conntrack_expect *i;

        WRITE_LOCK(&ip_conntrack_lock);
        /* choose the oldest expectation to evict */
        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, exp) && del_timer(&i->timeout)) {
                        unlink_expect(i);
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        destroy_expect(i);
                        return;
                }
        }
        WRITE_UNLOCK(&ip_conntrack_lock);
}

struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
{
        struct ip_conntrack_expect *new;

        new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }

        new->master = NULL;
        return new;
}

void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
{
        kmem_cache_free(ip_conntrack_expect_cachep, expect);
}

static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
        atomic_inc(&exp->master->ct_general.use);
        exp->master->expecting++;
        list_add(&exp->list, &ip_conntrack_expect_list);

        if (exp->master->helper->timeout) {
                init_timer(&exp->timeout);
                exp->timeout.data = (unsigned long)exp;
                exp->timeout.function = expectation_timed_out;
                exp->timeout.expires
                        = jiffies + exp->master->helper->timeout * HZ;
                add_timer(&exp->timeout);
        } else
                exp->timeout.function = NULL;

        CONNTRACK_STAT_INC(expect_create);
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (i->master == master) {
                        if (del_timer(&i->timeout)) {
                                unlink_expect(i);
                                destroy_expect(i);
                        }
                        break;
                }
        }
}

static inline int refresh_timer(struct ip_conntrack_expect *i)
{
        if (!del_timer(&i->timeout))
                return 0;

        i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
        add_timer(&i->timeout);
        return 1;
}

int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
        struct ip_conntrack_expect *i;
        int ret;

        DEBUGP("ip_conntrack_expect_related %p\n", expect);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        WRITE_LOCK(&ip_conntrack_lock);
        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, expect)) {
                        /* Refresh timer: if it's dying, ignore.. */
                        if (refresh_timer(i)) {
                                ret = 0;
                                /* We don't need the one they've given us. */
                                ip_conntrack_expect_free(expect);
                                goto out;
                        }
                } else if (expect_clash(i, expect)) {
                        ret = -EBUSY;
                        goto out;
                }
        }

        /* Will be over limit? */
        if (expect->master->helper->max_expected &&
            expect->master->expecting >= expect->master->helper->max_expected)
                evict_oldest_expect(expect->master);

        ip_conntrack_expect_insert(expect);
        ret = 0;
out:
        WRITE_UNLOCK(&ip_conntrack_lock);
        return ret;
}
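
/* The typical caller is a helper's help() routine.  A hypothetical
 * sketch of how a helper such as FTP would set up an expectation for
 * a data connection (field values illustrative only): */
#if 0
static int example_expect(struct ip_conntrack *ct, u_int32_t newip,
                          u_int16_t port)
{
        struct ip_conntrack_expect *exp = ip_conntrack_expect_alloc();

        if (!exp)
                return -ENOMEM;

        memset(&exp->tuple, 0, sizeof(exp->tuple));
        memset(&exp->mask, 0, sizeof(exp->mask));

        exp->master = ct;
        exp->expectfn = NULL;
        exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
        exp->tuple.dst.ip = newip;
        exp->tuple.dst.u.tcp.port = htons(port);
        exp->tuple.dst.protonum = IPPROTO_TCP;
        /* Match any source port, exact everything else. */
        exp->mask.src.ip = 0xFFFFFFFF;
        exp->mask.dst.ip = 0xFFFFFFFF;
        exp->mask.dst.u.tcp.port = 0xFFFF;
        exp->mask.dst.protonum = 0xFF;

        return ip_conntrack_expect_related(exp);
}
#endif
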
/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                              const struct ip_conntrack_tuple *newreply)
{
        WRITE_LOCK(&ip_conntrack_lock);
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && conntrack->expecting == 0)
                conntrack->helper = ip_ct_find_helper(newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        BUG_ON(me->timeout == 0);
        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 0;
}
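
/* A helper is a statically-allocated struct describing which reply
 * tuples it wants to see.  A hypothetical minimal registration,
 * patterned after the in-tree FTP helper (example_help is an assumed
 * callback with the help() signature from ip_conntrack_helper.h): */
#if 0
static int example_help(struct sk_buff **pskb,
                        struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo)
{
        return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper;

static int __init example_init(void)
{
        example_helper.tuple.src.u.tcp.port = htons(12345);
        example_helper.tuple.dst.protonum = IPPROTO_TCP;
        example_helper.mask.src.u.tcp.port = 0xFFFF;
        example_helper.mask.dst.protonum = 0xFF;
        example_helper.max_expected = 1;
        example_helper.timeout = 5 * 60;  /* seconds; 0 trips BUG_ON above */
        example_helper.me = THIS_MODULE;
        example_helper.name = "example";
        example_helper.help = example_help;

        return ip_conntrack_helper_register(&example_helper);
}
#endif
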
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (tuplehash_to_ctrack(i)->helper == me)
                tuplehash_to_ctrack(i)->helper = NULL;
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;
        struct ip_conntrack_expect *exp, *tmp;

        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expectations */
        list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
                if (exp->master->helper == me && del_timer(&exp->timeout)) {
                        unlink_expect(exp);
                        destroy_expect(exp);
                }
        }
        /* Get rid of expecteds, set helpers to NULL. */
        LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
}

static inline void ct_add_counters(struct ip_conntrack *ct,
                                   enum ip_conntrack_info ctinfo,
                                   const struct sk_buff *skb)
{
#ifdef CONFIG_IP_NF_CT_ACCT
        if (skb) {
                ct->counters[CTINFO2DIR(ctinfo)].packets++;
                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
                                        ntohs(skb->nh.iph->tot_len);
        }
#endif
}

/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
void ip_ct_refresh_acct(struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo,
                        const struct sk_buff *skb,
                        unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
                ct_add_counters(ct, ctinfo, skb);
        } else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
                ct_add_counters(ct, ctinfo, skb);
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
}
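
/* Protocol trackers call this from their packet() handlers; e.g. the
 * UDP tracker does (roughly) ip_ct_refresh_acct(ct, ctinfo, skb,
 * ip_ct_udp_timeout) to push the expiry back on every packet seen. */
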
/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
        struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
#endif

        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }

        local_bh_disable();
        skb = ip_defrag(skb, user);
        local_bh_enable();

        if (!skb) {
                if (sk)
                        sock_put(sk);
                return skb;
        }

        if (sk) {
                skb_set_owner_w(skb, sk);
                sock_put(sk);
        }

        ip_send_check(skb->nh.iph);
        skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
        /* Packet path as if nothing had happened. */
        skb->nf_debug = olddebug;
#endif

        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = ip_conntrack_get(skb, &ctinfo);

        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nskb->nfct = &ct->ct_general;
        nskb->nfctinfo = ctinfo;
        nf_conntrack_get(nskb->nfct);
}

static inline int
do_iter(const struct ip_conntrack_tuple_hash *i,
        int (*iter)(struct ip_conntrack *i, void *data),
        void *data)
{
        return iter(tuplehash_to_ctrack(i), data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        WRITE_LOCK(&ip_conntrack_lock);
        for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
                                struct ip_conntrack_tuple_hash *, iter, data);
                if (h)
                        break;
        }
        if (!h)
                h = LIST_FIND_W(&unconfirmed, do_iter,
                                struct ip_conntrack_tuple_hash *, iter, data);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return h;
}

void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);
                /* Time to push up daisies... */
                if (del_timer(&ct->timeout))
                        death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(ct);
        }
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);

                sin.sin_family = AF_INET;
                sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(ct);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}
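
/* From userspace this surfaces as the SO_ORIGINAL_DST getsockopt,
 * used by transparent proxies to recover the pre-NAT destination.
 * A hypothetical caller (userspace, not kernel code): */
#if 0
        struct sockaddr_in dst;
        socklen_t dstlen = sizeof(dst);

        if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &dstlen) == 0)
                printf("original destination %s:%u\n",
                       inet_ntoa(dst.sin_addr), ntohs(dst.sin_port));
#endif
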
static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};

static int kill_all(struct ip_conntrack *i, void *data)
{
        return 1;
}

static void free_conntrack_hash(void)
{
        if (ip_conntrack_vmalloc)
                vfree(ip_conntrack_hash);
        else
                free_pages((unsigned long)ip_conntrack_hash,
                           get_order(sizeof(struct list_head)
                                     * ip_conntrack_htable_size));
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;
        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

 i_see_dead_people:
        ip_ct_iterate_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        kmem_cache_destroy(ip_conntrack_cachep);
        kmem_cache_destroy(ip_conntrack_expect_cachep);
        free_conntrack_hash();
        nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
module_param(hashsize, int, 0400);

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: a 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (hashsize) {
                ip_conntrack_htable_size = hashsize;
        } else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        /* AK: the hash table is twice as big as needed because it
           uses list_head; it would be much nicer to the caches to
           use a single-pointer list head here. */
        ip_conntrack_vmalloc = 0;
        ip_conntrack_hash
                = (void *)__get_free_pages(GFP_KERNEL,
                                           get_order(sizeof(struct list_head)
                                                     * ip_conntrack_htable_size));
        if (!ip_conntrack_hash) {
                ip_conntrack_vmalloc = 1;
                printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
                ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                            * ip_conntrack_htable_size);
        }
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                0, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }

        ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
                                        sizeof(struct ip_conntrack_expect),
                                        0, 0, NULL, NULL);
        if (!ip_conntrack_expect_cachep) {
                printk(KERN_ERR "Unable to create ip_expect slab cache\n");
                goto err_free_conntrack_slab;
        }

        /* Don't NEED lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
        for (i = 0; i < MAX_IP_CT_PROTO; i++)
                ip_ct_protos[i] = &ip_conntrack_generic_protocol;
        /* Sew in builtin protocols. */
        ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
        ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
        ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
        WRITE_UNLOCK(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

        return ret;

err_free_conntrack_slab:
        kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
        free_conntrack_hash();
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}