Merge branch 'tomato-ND-kernel-update' into tomato-ND-USBmod
[tomato.git] / release / src / linux / linux / net / ipv4 / netfilter / ip_conntrack_core.c
blob 31484a65834bb517e1c3c87e5b4bd7a376f62579
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
5 /* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
6 * Public Licence.
8 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
9 * - new API and handling of conntrack/nat helpers
10 * - now capable of multiple expectations for one master
11 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
12 * - add usage/reference counts to ip_conntrack_expect
13 * - export ip_conntrack[_expect]_{find_get,put} functions
14 * */
16 #include <linux/version.h>
17 #include <linux/config.h>
18 #include <linux/types.h>
19 #include <linux/ip.h>
20 #include <linux/netfilter.h>
21 #include <linux/netfilter_ipv4.h>
22 #include <linux/module.h>
23 #include <linux/skbuff.h>
24 #include <linux/proc_fs.h>
25 #include <linux/vmalloc.h>
26 #include <linux/brlock.h>
27 #include <net/checksum.h>
28 #include <linux/stddef.h>
29 #include <linux/sysctl.h>
30 #include <linux/slab.h>
31 #include <linux/random.h>
32 #include <linux/jhash.h>
33 /* For ERR_PTR(). Yeah, I know... --RR */
34 #include <linux/fs.h>
36 /* This rwlock protects the main hash table, protocol/helper/expected
37 registrations, conntrack timers*/
38 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
39 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
41 #include <linux/netfilter_ipv4/ip_conntrack.h>
42 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
43 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
44 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
45 #include <linux/netfilter_ipv4/listhelp.h>
47 #define IP_CONNTRACK_VERSION "2.1"
49 #if 0
50 #define DEBUGP printk
51 #else
52 #define DEBUGP(format, args...)
53 #endif
55 DECLARE_RWLOCK(ip_conntrack_lock);
56 DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
58 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
59 LIST_HEAD(ip_conntrack_expect_list);
60 LIST_HEAD(protocol_list);
61 static LIST_HEAD(helpers);
62 unsigned int ip_conntrack_htable_size = 0;
63 int ip_conntrack_max = 0;
64 int ip_conntrack_clear = 0;
65 static int kill_all(struct ip_conntrack *i, void *data);
66 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
67 struct list_head *ip_conntrack_hash;
68 static kmem_cache_t *ip_conntrack_cachep;
69 static LIST_HEAD(unconfirmed);
71 extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
73 static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
74 u_int8_t protocol)
76 return protocol == curr->proto;
79 struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
81 struct ip_conntrack_protocol *p;
83 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
84 p = LIST_FIND(&protocol_list, proto_cmpfn,
85 struct ip_conntrack_protocol *, protocol);
86 if (!p)
87 p = &ip_conntrack_generic_protocol;
89 return p;
92 struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
94 struct ip_conntrack_protocol *p;
96 READ_LOCK(&ip_conntrack_lock);
97 p = __ip_ct_find_proto(protocol);
98 READ_UNLOCK(&ip_conntrack_lock);
99 return p;
102 inline void
103 ip_conntrack_put(struct ip_conntrack *ct)
105 IP_NF_ASSERT(ct);
106 IP_NF_ASSERT(ct->infos[0].master);
107 /* nf_conntrack_put wants to go via an info struct, so feed it
108 one at random. */
109 nf_conntrack_put(&ct->infos[0]);
112 static int ip_conntrack_hash_rnd_initted;
113 static unsigned int ip_conntrack_hash_rnd;
115 static u_int32_t
116 hash_conntrack(const struct ip_conntrack_tuple *tuple)
118 #if 0
119 dump_tuple(tuple);
120 #endif
121 return (jhash_3words(tuple->src.ip,
122 (tuple->dst.ip ^ tuple->dst.protonum),
123 (tuple->src.u.all | (tuple->dst.u.all << 16)),
124 /* SpeedMod: Change modulo to AND */
125 //ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
126 ip_conntrack_hash_rnd) & (ip_conntrack_htable_size - 1));
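/* Editorial note: masking with (ip_conntrack_htable_size - 1) only matches the
   modulo it replaces when the table size is a power of two; the SpeedMod init
   code later in this file hardcodes ip_conntrack_htable_size to 16384, which
   satisfies that assumption. */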
130 inline int
131 ip_ct_get_tuple(const struct iphdr *iph, size_t len,
132 struct ip_conntrack_tuple *tuple,
133 struct ip_conntrack_protocol *protocol)
135 int ret;
137 /* Never happen */
138 if (iph->frag_off & htons(IP_OFFSET)) {
139 printk("ip_conntrack_core: Frag of proto %u.\n",
140 iph->protocol);
141 return 0;
143 /* Guarantee 8 protocol bytes: if more wanted, use len param */
144 else if (iph->ihl * 4 + 8 > len)
145 return 0;
147 tuple->src.ip = iph->saddr;
148 tuple->dst.ip = iph->daddr;
149 tuple->dst.protonum = iph->protocol;
151 tuple->src.u.all = tuple->dst.u.all = 0;
153 ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
154 len - 4*iph->ihl,
155 tuple);
156 return ret;
159 static int
160 invert_tuple(struct ip_conntrack_tuple *inverse,
161 const struct ip_conntrack_tuple *orig,
162 const struct ip_conntrack_protocol *protocol)
164 inverse->src.ip = orig->dst.ip;
165 inverse->dst.ip = orig->src.ip;
166 inverse->dst.protonum = orig->dst.protonum;
168 inverse->src.u.all = inverse->dst.u.all = 0;
170 return protocol->invert_tuple(inverse, orig);
174 /* ip_conntrack_expect helper functions */
176 /* Compare tuple parts depending on mask. */
177 static inline int expect_cmp(const struct ip_conntrack_expect *i,
178 const struct ip_conntrack_tuple *tuple)
180 MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
181 return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
184 static void
185 destroy_expect(struct ip_conntrack_expect *exp)
187 DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
188 IP_NF_ASSERT(atomic_read(&exp->use) == 0);
189 IP_NF_ASSERT(!timer_pending(&exp->timeout));
191 kfree(exp);
194 inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
196 IP_NF_ASSERT(exp);
198 if (atomic_dec_and_test(&exp->use)) {
199 /* usage count dropped to zero */
200 destroy_expect(exp);
204 static inline struct ip_conntrack_expect *
205 __ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
207 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
208 MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
209 return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
210 struct ip_conntrack_expect *, tuple);
213 /* Find an expectation corresponding to a tuple. */
214 struct ip_conntrack_expect *
215 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
217 struct ip_conntrack_expect *exp;
219 READ_LOCK(&ip_conntrack_lock);
220 READ_LOCK(&ip_conntrack_expect_tuple_lock);
221 exp = __ip_ct_expect_find(tuple);
222 if (exp)
223 atomic_inc(&exp->use);
224 READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
225 READ_UNLOCK(&ip_conntrack_lock);
227 return exp;
230 /* remove one specific expectation from all lists and drop refcount,
231 * does _NOT_ delete the timer. */
232 static void __unexpect_related(struct ip_conntrack_expect *expect)
234 DEBUGP("unexpect_related(%p)\n", expect);
235 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
237 /* we're not allowed to unexpect a confirmed expectation! */
238 IP_NF_ASSERT(!expect->sibling);
240 /* delete from global and local lists */
241 list_del(&expect->list);
242 list_del(&expect->expected_list);
244 /* decrement expect-count of master conntrack */
245 if (expect->expectant)
246 expect->expectant->expecting--;
248 ip_conntrack_expect_put(expect);
251 /* remove one specific expectation from all lists, drop refcount
252 * and expire timer.
253 * This function can _NOT_ be called for confirmed expects! */
254 static void unexpect_related(struct ip_conntrack_expect *expect)
256 IP_NF_ASSERT(expect->expectant);
257 /* if we are supposed to have a timer, but we can't delete
258 * it: race condition. __unexpect_related will
259 * be called by timeout function */
260 if (expect->expectant->helper
261 && expect->expectant->helper->timeout
262 && !del_timer(&expect->timeout))
263 return;
265 __unexpect_related(expect);
268 /* delete all unconfirmed expectations for this conntrack */
269 static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
271 struct list_head *exp_entry, *next;
272 struct ip_conntrack_expect *exp;
274 DEBUGP("remove_expectations(%p)\n", ct);
276 list_for_each_safe(exp_entry, next, &ct->sibling_list) {
277 exp = list_entry(exp_entry, struct ip_conntrack_expect,
278 expected_list);
280 /* we skip established expectations, as we want to delete
281 * the un-established ones only */
282 if (exp->sibling) {
283 DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
284 if (drop_refcount) {
285 /* Indicate that this expectation's parent is dead */
286 ip_conntrack_put(exp->expectant);
287 exp->expectant = NULL;
289 continue;
292 IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
293 IP_NF_ASSERT(exp->expectant == ct);
295 /* delete expectation from global and private lists */
296 unexpect_related(exp);
300 static void
301 clean_from_lists(struct ip_conntrack *ct)
303 unsigned int ho, hr;
305 DEBUGP("clean_from_lists(%p)\n", ct);
306 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
308 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
309 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
310 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
311 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
313 /* Destroy all un-established, pending expectations */
314 remove_expectations(ct, 1);
317 static void
318 destroy_conntrack(struct nf_conntrack *nfct)
320 struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
321 struct ip_conntrack_protocol *proto;
323 DEBUGP("destroy_conntrack(%p)\n", ct);
324 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
325 IP_NF_ASSERT(!timer_pending(&ct->timeout));
327 /* To make sure we don't get any weird locking issues here:
328 * destroy_conntrack() MUST NOT be called with a write lock
329 * to ip_conntrack_lock!!! -HW */
330 proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
331 if (proto && proto->destroy)
332 proto->destroy(ct);
334 if (ip_conntrack_destroyed)
335 ip_conntrack_destroyed(ct);
337 WRITE_LOCK(&ip_conntrack_lock);
338 /* Make sure we don't leave any orphaned expectations lying around */
339 if (ct->expecting)
340 remove_expectations(ct, 1);
342 /* We overload first tuple to link into unconfirmed list. */
343 if (!is_confirmed(ct)) {
344 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
345 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
348 /* Delete our master expectation */
349 if (ct->master) {
350 if (ct->master->expectant) {
351 /* can't call __unexpect_related here,
352 * since it would screw up expect_list */
353 list_del(&ct->master->expected_list);
354 master = ct->master->expectant;
356 kfree(ct->master);
359 #if defined(CONFIG_IP_NF_MATCH_LAYER7) || defined(CONFIG_IP_NF_MATCH_LAYER7_MODULE)
360 if(ct->layer7.app_proto)
361 kfree(ct->layer7.app_proto);
362 if(ct->layer7.app_data)
363 kfree(ct->layer7.app_data);
364 #endif
366 WRITE_UNLOCK(&ip_conntrack_lock);
368 if (master)
369 ip_conntrack_put(master);
371 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
372 kmem_cache_free(ip_conntrack_cachep, ct);
373 atomic_dec(&ip_conntrack_count);
376 static void death_by_timeout(unsigned long ul_conntrack)
378 struct ip_conntrack *ct = (void *)ul_conntrack;
380 WRITE_LOCK(&ip_conntrack_lock);
381 clean_from_lists(ct);
382 WRITE_UNLOCK(&ip_conntrack_lock);
383 ip_conntrack_put(ct);
386 static inline int
387 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
388 const struct ip_conntrack_tuple *tuple,
389 const struct ip_conntrack *ignored_conntrack)
391 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
392 return i->ctrack != ignored_conntrack
393 && ip_ct_tuple_equal(tuple, &i->tuple);
396 static struct ip_conntrack_tuple_hash *
397 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
398 const struct ip_conntrack *ignored_conntrack)
400 struct ip_conntrack_tuple_hash *h;
401 unsigned int hash = hash_conntrack(tuple);
403 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
404 h = LIST_FIND(&ip_conntrack_hash[hash],
405 conntrack_tuple_cmp,
406 struct ip_conntrack_tuple_hash *,
407 tuple, ignored_conntrack);
408 return h;
411 /* Find a connection corresponding to a tuple. */
412 struct ip_conntrack_tuple_hash *
413 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
414 const struct ip_conntrack *ignored_conntrack)
416 struct ip_conntrack_tuple_hash *h;
418 READ_LOCK(&ip_conntrack_lock);
419 h = __ip_conntrack_find(tuple, ignored_conntrack);
420 if (h)
421 atomic_inc(&h->ctrack->ct_general.use);
422 READ_UNLOCK(&ip_conntrack_lock);
424 return h;
427 static inline struct ip_conntrack *
428 __ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
430 struct ip_conntrack *ct
431 = (struct ip_conntrack *)nfct->master;
433 /* ctinfo is the index of the nfct inside the conntrack */
434 *ctinfo = nfct - ct->infos;
435 IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
436 return ct;
439 /* Return conntrack and conntrack_info given skb->nfct->master */
440 struct ip_conntrack *
441 ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
443 if (skb->nfct)
444 return __ip_conntrack_get(skb->nfct, ctinfo);
445 return NULL;
448 /* Confirm a connection given skb->nfct; places it in hash table */
450 __ip_conntrack_confirm(struct nf_ct_info *nfct)
452 unsigned int hash, repl_hash;
453 struct ip_conntrack *ct;
454 enum ip_conntrack_info ctinfo;
456 ct = __ip_conntrack_get(nfct, &ctinfo);
458 /* ipt_REJECT uses ip_conntrack_attach to attach related
459 ICMP/TCP RST packets in other direction. Actual packet
460 which created connection will be IP_CT_NEW or for an
461 expected connection, IP_CT_RELATED. */
462 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
463 return NF_ACCEPT;
465 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
466 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
468 /* We're not in hash table, and we refuse to set up related
469 connections for unconfirmed conns. But packet copies and
470 REJECT will give spurious warnings here. */
471 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
473 /* No external references means no one else could have
474 confirmed us. */
475 IP_NF_ASSERT(!is_confirmed(ct));
476 DEBUGP("Confirming conntrack %p\n", ct);
478 WRITE_LOCK(&ip_conntrack_lock);
479 /* See if there's one in the list already, including reverse:
480 NAT could have grabbed it without realizing, since we're
481 not in the hash. If there is, we lost race. */
482 if (!LIST_FIND(&ip_conntrack_hash[hash],
483 conntrack_tuple_cmp,
484 struct ip_conntrack_tuple_hash *,
485 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
486 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
487 conntrack_tuple_cmp,
488 struct ip_conntrack_tuple_hash *,
489 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
490 /* Remove from unconfirmed list */
491 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
493 list_prepend(&ip_conntrack_hash[hash],
494 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
495 list_prepend(&ip_conntrack_hash[repl_hash],
496 &ct->tuplehash[IP_CT_DIR_REPLY]);
497 /* Timer relative to confirmation time, not original
498 setting time, otherwise we'd get timer wrap in
499 weird delay cases. */
500 ct->timeout.expires += jiffies;
501 add_timer(&ct->timeout);
502 atomic_inc(&ct->ct_general.use);
503 set_bit(IPS_CONFIRMED_BIT, &ct->status);
504 WRITE_UNLOCK(&ip_conntrack_lock);
505 return NF_ACCEPT;
508 WRITE_UNLOCK(&ip_conntrack_lock);
509 return NF_DROP;
512 /* Returns true if a connection corresponds to the tuple (required
513 for NAT). */
515 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
516 const struct ip_conntrack *ignored_conntrack)
518 struct ip_conntrack_tuple_hash *h;
520 READ_LOCK(&ip_conntrack_lock);
521 h = __ip_conntrack_find(tuple, ignored_conntrack);
522 READ_UNLOCK(&ip_conntrack_lock);
524 return h != NULL;
527 /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
528 struct ip_conntrack *
529 icmp_error_track(struct sk_buff *skb,
530 enum ip_conntrack_info *ctinfo,
531 unsigned int hooknum)
533 const struct iphdr *iph = skb->nh.iph;
534 struct icmphdr *hdr;
535 struct ip_conntrack_tuple innertuple, origtuple;
536 struct iphdr *inner;
537 size_t datalen;
538 struct ip_conntrack_protocol *innerproto;
539 struct ip_conntrack_tuple_hash *h;
541 IP_NF_ASSERT(iph->protocol == IPPROTO_ICMP);
542 IP_NF_ASSERT(skb->nfct == NULL);
544 hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
545 inner = (struct iphdr *)(hdr + 1);
546 datalen = skb->len - iph->ihl*4 - sizeof(*hdr);
548 if (skb->len < iph->ihl * 4 + sizeof(*hdr) + sizeof(*iph)) {
549 DEBUGP("icmp_error_track: too short\n");
550 return NULL;
553 if (hdr->type != ICMP_DEST_UNREACH
554 && hdr->type != ICMP_SOURCE_QUENCH
555 && hdr->type != ICMP_TIME_EXCEEDED
556 && hdr->type != ICMP_PARAMETERPROB
557 && hdr->type != ICMP_REDIRECT)
558 return NULL;
560 /* Ignore ICMP's containing fragments (shouldn't happen) */
561 if (inner->frag_off & htons(IP_OFFSET)) {
562 DEBUGP("icmp_error_track: fragment of proto %u\n",
563 inner->protocol);
564 return NULL;
567 /* Ignore it if the checksum's bogus. */
568 if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) {
569 DEBUGP("icmp_error_track: bad csum\n");
570 return NULL;
573 innerproto = ip_ct_find_proto(inner->protocol);
574 /* Are they talking about one of our connections? */
575 if (inner->ihl * 4 + 8 > datalen
576 || !ip_ct_get_tuple(inner, datalen, &origtuple, innerproto)) {
577 DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n",
578 inner->protocol, inner->ihl, 8,
579 datalen);
580 return NULL;
583 /* Ordinarily, we'd expect the inverted tupleproto, but it's
584 been preserved inside the ICMP. */
585 if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
586 DEBUGP("icmp_error_track: Can't invert tuple\n");
587 return NULL;
590 *ctinfo = IP_CT_RELATED;
592 h = ip_conntrack_find_get(&innertuple, NULL);
593 if (!h) {
594 /* Locally generated ICMPs will match inverted if they
595 haven't been SNAT'ed yet */
596 /* FIXME: NAT code has to handle half-done double NAT --RR */
597 if (hooknum == NF_IP_LOCAL_OUT)
598 h = ip_conntrack_find_get(&origtuple, NULL);
600 if (!h) {
601 DEBUGP("icmp_error_track: no match\n");
602 return NULL;
604 /* Reverse direction from that found */
605 if (DIRECTION(h) != IP_CT_DIR_REPLY)
606 *ctinfo += IP_CT_IS_REPLY;
607 } else {
608 if (DIRECTION(h) == IP_CT_DIR_REPLY)
609 *ctinfo += IP_CT_IS_REPLY;
612 /* Update skb to refer to this connection */
613 skb->nfct = &h->ctrack->infos[*ctinfo];
614 return h->ctrack;
617 /* There's a small race here where we may free a just-assured
618 connection. Too bad: we're in trouble anyway. */
619 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
621 return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
624 static int early_drop(struct list_head *chain)
626 /* Traverse backwards: gives us oldest, which is roughly LRU */
627 struct ip_conntrack_tuple_hash *h;
628 int dropped = 0;
630 READ_LOCK(&ip_conntrack_lock);
631 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
632 if (h)
633 atomic_inc(&h->ctrack->ct_general.use);
634 READ_UNLOCK(&ip_conntrack_lock);
636 if (!h)
637 return dropped;
639 if (del_timer(&h->ctrack->timeout)) {
640 death_by_timeout((unsigned long)h->ctrack);
641 dropped = 1;
643 ip_conntrack_put(h->ctrack);
644 return dropped;
647 static inline int helper_cmp(const struct ip_conntrack_helper *i,
648 const struct ip_conntrack_tuple *rtuple)
650 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
653 struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
655 return LIST_FIND(&helpers, helper_cmp,
656 struct ip_conntrack_helper *,
657 tuple);
660 /* Allocate a new conntrack: we return -ENOMEM if classification
661 failed due to stress. Otherwise it really is unclassifiable. */
662 static struct ip_conntrack_tuple_hash *
663 init_conntrack(const struct ip_conntrack_tuple *tuple,
664 struct ip_conntrack_protocol *protocol,
665 struct sk_buff *skb)
667 struct ip_conntrack *conntrack;
668 struct ip_conntrack_tuple repl_tuple;
669 size_t hash;
670 struct ip_conntrack_expect *expected;
671 int i;
672 static unsigned int drop_next = 0;
674 if (!ip_conntrack_hash_rnd_initted) {
675 get_random_bytes(&ip_conntrack_hash_rnd, 4);
676 ip_conntrack_hash_rnd_initted = 1;
679 hash = hash_conntrack(tuple);
681 if (ip_conntrack_max &&
682 atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
683 /* Try dropping from random chain, or else from the
684 chain about to put into (in case they're trying to
685 bomb one hash chain). */
686 unsigned int next = (drop_next++)%ip_conntrack_htable_size;
688 if (!early_drop(&ip_conntrack_hash[next])
689 && !early_drop(&ip_conntrack_hash[hash])) {
690 if (net_ratelimit())
691 printk(KERN_WARNING
692 "ip_conntrack: table full, dropping"
693 " packet.\n");
694 return ERR_PTR(-ENOMEM);
698 if (!invert_tuple(&repl_tuple, tuple, protocol)) {
699 DEBUGP("Can't invert tuple.\n");
700 return NULL;
703 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
704 if (!conntrack) {
705 DEBUGP("Can't allocate conntrack.\n");
706 return ERR_PTR(-ENOMEM);
709 memset(conntrack, 0, sizeof(*conntrack));
710 atomic_set(&conntrack->ct_general.use, 1);
711 conntrack->ct_general.destroy = destroy_conntrack;
712 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
713 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
714 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
715 conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
716 for (i=0; i < IP_CT_NUMBER; i++)
717 conntrack->infos[i].master = &conntrack->ct_general;
719 if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
720 kmem_cache_free(ip_conntrack_cachep, conntrack);
721 return NULL;
723 /* Don't set timer yet: wait for confirmation */
724 init_timer(&conntrack->timeout);
725 conntrack->timeout.data = (unsigned long)conntrack;
726 conntrack->timeout.function = death_by_timeout;
728 INIT_LIST_HEAD(&conntrack->sibling_list);
730 WRITE_LOCK(&ip_conntrack_lock);
731 /* Need finding and deleting of expected ONLY if we win race */
732 READ_LOCK(&ip_conntrack_expect_tuple_lock);
733 expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
734 struct ip_conntrack_expect *, tuple);
735 READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
737 /* If master is not in hash table yet (ie. packet hasn't left
738 this machine yet), how can other end know about expected?
739 Hence these are not the droids you are looking for (if
740 master ct never got confirmed, we'd hold a reference to it
741 and weird things would happen to future packets). */
742 if (expected && !is_confirmed(expected->expectant))
743 expected = NULL;
745 /* Look up the conntrack helper for master connections only */
746 if (!expected)
747 conntrack->helper = ip_ct_find_helper(&repl_tuple);
749 /* If the expectation is dying, then this is a loser. */
750 if (expected
751 && expected->expectant->helper
752 && expected->expectant->helper->timeout
753 && ! del_timer(&expected->timeout))
754 expected = NULL;
756 if (expected) {
757 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
758 conntrack, expected);
759 /* Welcome, Mr. Bond. We've been expecting you... */
760 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
761 conntrack->master = expected;
762 expected->sibling = conntrack;
763 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
764 conntrack->mark = expected->expectant->mark;
765 #endif
766 LIST_DELETE(&ip_conntrack_expect_list, expected);
767 expected->expectant->expecting--;
768 nf_conntrack_get(&master_ct(conntrack)->infos[0]);
770 /* Overload tuple linked list to put us in unconfirmed list. */
771 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list,
772 &unconfirmed);
774 atomic_inc(&ip_conntrack_count);
775 WRITE_UNLOCK(&ip_conntrack_lock);
777 if (expected && expected->expectfn)
778 expected->expectfn(conntrack);
779 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
782 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
783 static inline struct ip_conntrack *
784 resolve_normal_ct(struct sk_buff *skb,
785 struct ip_conntrack_protocol *proto,
786 int *set_reply,
787 unsigned int hooknum,
788 enum ip_conntrack_info *ctinfo)
790 struct ip_conntrack_tuple tuple;
791 struct ip_conntrack_tuple_hash *h;
793 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
795 if (!ip_ct_get_tuple(skb->nh.iph, skb->len, &tuple, proto))
796 return NULL;
798 /* look for tuple match */
799 h = ip_conntrack_find_get(&tuple, NULL);
800 if (!h) {
801 h = init_conntrack(&tuple, proto, skb);
802 if (!h)
803 return NULL;
804 if (IS_ERR(h))
805 return (void *)h;
808 /* It exists; we have (non-exclusive) reference. */
809 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
810 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
811 /* Please set reply bit if this packet OK */
812 *set_reply = 1;
813 } else {
814 /* Once we've had two way comms, always ESTABLISHED. */
815 if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
816 DEBUGP("ip_conntrack_in: normal packet for %p\n",
817 h->ctrack);
818 *ctinfo = IP_CT_ESTABLISHED;
819 } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
820 DEBUGP("ip_conntrack_in: related packet for %p\n",
821 h->ctrack);
822 *ctinfo = IP_CT_RELATED;
823 } else {
824 DEBUGP("ip_conntrack_in: new packet for %p\n",
825 h->ctrack);
826 *ctinfo = IP_CT_NEW;
828 *set_reply = 0;
830 skb->nfct = &h->ctrack->infos[*ctinfo];
831 return h->ctrack;
834 /* Netfilter hook itself. */
835 unsigned int ip_conntrack_in(unsigned int hooknum,
836 struct sk_buff **pskb,
837 const struct net_device *in,
838 const struct net_device *out,
839 int (*okfn)(struct sk_buff *))
841 struct ip_conntrack *ct;
842 enum ip_conntrack_info ctinfo;
843 struct ip_conntrack_protocol *proto;
844 int set_reply;
845 int ret;
847 if (ip_conntrack_clear != 0) {
849 ip_ct_iterate_cleanup(kill_all, NULL);
850 ip_conntrack_clear = 0;
851 }
853 /* FIXME: Do this right please. --RR */
854 (*pskb)->nfcache |= NFC_UNKNOWN;
856 /* Doesn't cover locally-generated broadcast, so not worth it. */
857 #if 0
858 /* Ignore broadcast: no `connection'. */
859 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
860 printk("Broadcast packet!\n");
861 return NF_ACCEPT;
862 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
863 == htonl(0x000000FF)) {
864 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
865 NIPQUAD((*pskb)->nh.iph->saddr),
866 NIPQUAD((*pskb)->nh.iph->daddr),
867 (*pskb)->sk, (*pskb)->pkt_type);
869 #endif
871 /* Previously seen (loopback)? Ignore. Do this before
872 fragment check. */
873 if ((*pskb)->nfct)
874 return NF_ACCEPT;
876 /* Gather fragments. */
877 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
878 *pskb = ip_ct_gather_frags(*pskb,
879 hooknum == NF_IP_PRE_ROUTING ?
880 IP_DEFRAG_CONNTRACK_IN :
881 IP_DEFRAG_CONNTRACK_OUT);
882 if (!*pskb)
883 return NF_STOLEN;
886 proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
888 /* It may be an icmp error... */
889 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
890 && icmp_error_track(*pskb, &ctinfo, hooknum))
891 return NF_ACCEPT;
893 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
894 /* Not valid part of a connection */
895 return NF_ACCEPT;
897 if (IS_ERR(ct))
898 /* Too stressed to deal. */
899 return NF_DROP;
901 IP_NF_ASSERT((*pskb)->nfct);
903 ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
904 if (ret == -1) {
905 /* Invalid */
906 nf_conntrack_put((*pskb)->nfct);
907 (*pskb)->nfct = NULL;
908 return NF_ACCEPT;
911 if (ret != NF_DROP && ct->helper) {
912 ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len,
913 ct, ctinfo);
914 if (ret == -1) {
915 /* Invalid */
916 nf_conntrack_put((*pskb)->nfct);
917 (*pskb)->nfct = NULL;
918 return NF_ACCEPT;
921 if (set_reply)
922 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
924 return ret;
927 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
928 const struct ip_conntrack_tuple *orig)
930 return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
933 static inline int resent_expect(const struct ip_conntrack_expect *i,
934 const struct ip_conntrack_tuple *tuple,
935 const struct ip_conntrack_tuple *mask)
937 DEBUGP("resent_expect\n");
938 DEBUGP(" tuple: "); DUMP_TUPLE(&i->tuple);
939 DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple);
940 DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
941 return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
942 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
943 && ip_ct_tuple_equal(&i->mask, mask));
946 /* Would two expected things clash? */
947 static inline int expect_clash(const struct ip_conntrack_expect *i,
948 const struct ip_conntrack_tuple *tuple,
949 const struct ip_conntrack_tuple *mask)
951 /* Part covered by intersection of masks must be unequal,
952 otherwise they clash */
953 struct ip_conntrack_tuple intersect_mask
954 = { { i->mask.src.ip & mask->src.ip,
955 { i->mask.src.u.all & mask->src.u.all } },
956 { i->mask.dst.ip & mask->dst.ip,
957 { i->mask.dst.u.all & mask->dst.u.all },
958 i->mask.dst.protonum & mask->dst.protonum } };
960 return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
963 inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
965 WRITE_LOCK(&ip_conntrack_lock);
966 unexpect_related(expect);
967 WRITE_UNLOCK(&ip_conntrack_lock);
970 static void expectation_timed_out(unsigned long ul_expect)
972 struct ip_conntrack_expect *expect = (void *) ul_expect;
974 DEBUGP("expectation %p timed out\n", expect);
975 WRITE_LOCK(&ip_conntrack_lock);
976 __unexpect_related(expect);
977 WRITE_UNLOCK(&ip_conntrack_lock);
980 /* Add a related connection. */
981 int ip_conntrack_expect_related(struct ip_conntrack *related_to,
982 struct ip_conntrack_expect *expect)
984 struct ip_conntrack_expect *old, *new;
985 int ret = 0;
987 WRITE_LOCK(&ip_conntrack_lock);
988 /* Because of the write lock, no reader can walk the lists,
989 * so there is no need to use the tuple lock too */
991 DEBUGP("ip_conntrack_expect_related %p\n", related_to);
992 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
993 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
995 old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
996 struct ip_conntrack_expect *, &expect->tuple,
997 &expect->mask);
998 if (old) {
999 /* Helper private data may contain offsets but no pointers
1000 pointing into the payload - otherwise we should have to copy
1001 the data filled out by the helper over the old one */
1002 DEBUGP("expect_related: resent packet\n");
1003 if (old->expectant == related_to &&
1004 related_to->helper && related_to->helper->timeout) {
1005 if (!del_timer(&old->timeout)) {
1006 /* expectation is dying. Fall through */
1007 old = NULL;
1008 } else {
1009 old->timeout.expires = jiffies +
1010 related_to->helper->timeout * HZ;
1011 add_timer(&old->timeout);
1015 if (old) {
1016 WRITE_UNLOCK(&ip_conntrack_lock);
1017 return -EEXIST;
1019 } else if (related_to->helper && related_to->helper->max_expected &&
1020 related_to->expecting >= related_to->helper->max_expected) {
1021 /* old == NULL */
1022 if (!(related_to->helper->flags &
1023 IP_CT_HELPER_F_REUSE_EXPECT)) {
1024 WRITE_UNLOCK(&ip_conntrack_lock);
1025 if (net_ratelimit())
1026 printk(KERN_WARNING
1027 "ip_conntrack: max number of expected "
1028 "connections %i of %s reached for "
1029 "%u.%u.%u.%u->%u.%u.%u.%u\n",
1030 related_to->helper->max_expected,
1031 related_to->helper->name,
1032 NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1033 NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
1034 return -EPERM;
1036 DEBUGP("ip_conntrack: max number of expected "
1037 "connections %i of %s reached for "
1038 "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
1039 related_to->helper->max_expected,
1040 related_to->helper->name,
1041 NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1042 NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
1044 /* choose the oldest expectation to evict */
1045 list_for_each_entry(old, &related_to->sibling_list,
1046 expected_list)
1047 if (old->sibling == NULL)
1048 break;
1050 /* We cannot fail since related_to->expecting is the number
1051 * of unconfirmed expectations */
1052 IP_NF_ASSERT(old && old->sibling == NULL);
1054 /* newnat14 does not reuse the real allocated memory
1055 * structures but rather unexpects the old and
1056 * allocates a new. unexpect_related will decrement
1057 * related_to->expecting.
1059 unexpect_related(old);
1060 ret = -EPERM;
1061 } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1062 struct ip_conntrack_expect *, &expect->tuple,
1063 &expect->mask)) {
1064 WRITE_UNLOCK(&ip_conntrack_lock);
1065 DEBUGP("expect_related: busy!\n");
1066 return -EBUSY;
1069 new = (struct ip_conntrack_expect *)
1070 kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
1071 if (!new) {
1072 WRITE_UNLOCK(&ip_conntrack_lock);
1073 DEBUGP("expect_relaed: OOM allocating expect\n");
1074 return -ENOMEM;
1077 DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
1078 memcpy(new, expect, sizeof(*expect));
1079 new->expectant = related_to;
1080 new->sibling = NULL;
1081 atomic_set(&new->use, 1);
1083 /* add to expected list for this connection */
1084 list_add_tail(&new->expected_list, &related_to->sibling_list);
1085 /* add to global list of expectations */
1086 list_prepend(&ip_conntrack_expect_list, &new->list);
1087 /* add and start timer if required */
1088 if (related_to->helper &&
1089 related_to->helper->timeout) {
1090 init_timer(&new->timeout);
1091 new->timeout.data = (unsigned long)new;
1092 new->timeout.function = expectation_timed_out;
1093 new->timeout.expires = jiffies +
1094 related_to->helper->timeout * HZ;
1095 add_timer(&new->timeout);
1097 related_to->expecting++;
1099 WRITE_UNLOCK(&ip_conntrack_lock);
1101 return ret;
1104 /* Change tuple in an existing expectation */
1105 int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
1106 struct ip_conntrack_tuple *newtuple)
1108 int ret;
1110 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
1111 WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
1113 DEBUGP("change_expect:\n");
1114 DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
1115 DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask);
1116 DEBUGP("newtuple: "); DUMP_TUPLE(newtuple);
1117 if (expect->ct_tuple.dst.protonum == 0) {
1118 /* Never seen before */
1119 DEBUGP("change expect: never seen before\n");
1120 if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
1121 && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1122 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
1123 /* Force NAT to find an unused tuple */
1124 ret = -1;
1125 } else {
1126 memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
1127 memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
1128 ret = 0;
1130 } else {
1131 /* Resent packet */
1132 DEBUGP("change expect: resent packet\n");
1133 if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
1134 ret = 0;
1135 } else {
1136 /* Force NAT to choose again the same port */
1137 ret = -1;
1140 WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
1142 return ret;
1145 /* Alter reply tuple (maybe alter helper). If it's already taken,
1146 return 0 and don't do alteration. */
1147 int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1148 const struct ip_conntrack_tuple *newreply)
1150 WRITE_LOCK(&ip_conntrack_lock);
1151 if (__ip_conntrack_find(newreply, conntrack)) {
1152 WRITE_UNLOCK(&ip_conntrack_lock);
1153 return 0;
1155 /* Should be unconfirmed, so not in hash table yet */
1156 IP_NF_ASSERT(!is_confirmed(conntrack));
1158 DEBUGP("Altering reply tuple of %p to ", conntrack);
1159 DUMP_TUPLE(newreply);
1161 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1162 if (!conntrack->master && list_empty(&conntrack->sibling_list))
1163 conntrack->helper = ip_ct_find_helper(newreply);
1164 WRITE_UNLOCK(&ip_conntrack_lock);
1166 return 1;
1169 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1171 MOD_INC_USE_COUNT;
1173 WRITE_LOCK(&ip_conntrack_lock);
1174 list_prepend(&helpers, me);
1175 WRITE_UNLOCK(&ip_conntrack_lock);
1177 return 0;
1180 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1181 const struct ip_conntrack_helper *me)
1183 if (i->ctrack->helper == me) {
1184 /* Get rid of any expected. */
1185 remove_expectations(i->ctrack, 0);
1186 /* And *then* set helper to NULL */
1187 i->ctrack->helper = NULL;
1189 return 0;
1192 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1194 unsigned int i;
1196 /* Need write lock here, to delete helper. */
1197 WRITE_LOCK(&ip_conntrack_lock);
1198 LIST_DELETE(&helpers, me);
1200 /* Get rid of expecteds, set helpers to NULL. */
1201 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1202 for (i = 0; i < ip_conntrack_htable_size; i++)
1203 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1204 struct ip_conntrack_tuple_hash *, me);
1205 WRITE_UNLOCK(&ip_conntrack_lock);
1207 /* Someone could be still looking at the helper in a bh. */
1208 br_write_lock_bh(BR_NETPROTO_LOCK);
1209 br_write_unlock_bh(BR_NETPROTO_LOCK);
1211 MOD_DEC_USE_COUNT;
1213 static inline void ct_add_counters(struct ip_conntrack *ct,
1214 enum ip_conntrack_info ctinfo,
1215 const struct iphdr *iph)
1217 #if defined(CONFIG_IP_NF_CT_ACCT) || \
1218 defined(CONFIG_IP_NF_CT_ACCT_MODULE)
1219 if (iph) {
1220 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1221 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1222 ntohs(iph->tot_len);
1224 #endif
1227 /* Refresh conntrack for this many jiffies. */
1228 void ip_ct_refresh_acct(struct ip_conntrack *ct,
1229 enum ip_conntrack_info ctinfo,
1230 const struct iphdr *iph,
1231 unsigned long extra_jiffies)
1233 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1235 WRITE_LOCK(&ip_conntrack_lock);
1236 /* If not in hash table, timer will not be active yet */
1237 if (!is_confirmed(ct)) {
1238 ct->timeout.expires = extra_jiffies;
1239 ct_add_counters(ct, ctinfo, iph);
1240 } else {
1241 /* Need del_timer for race avoidance (may already be dying). */
1242 if (del_timer(&ct->timeout)) {
1243 ct->timeout.expires = jiffies + extra_jiffies;
1244 add_timer(&ct->timeout);
1246 ct_add_counters(ct, ctinfo, iph);
1248 WRITE_UNLOCK(&ip_conntrack_lock);
1251 /* Returns new sk_buff, or NULL */
1252 struct sk_buff *
1253 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1255 struct sock *sk = skb->sk;
1256 #ifdef CONFIG_NETFILTER_DEBUG
1257 unsigned int olddebug = skb->nf_debug;
1258 #endif
1260 if (sk) {
1261 sock_hold(sk);
1262 skb_orphan(skb);
1265 local_bh_disable();
1266 skb = ip_defrag(skb, user);
1267 local_bh_enable();
1269 if (!skb) {
1270 if (sk) sock_put(sk);
1271 return skb;
1272 } else if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
1273 kfree_skb(skb);
1274 if (sk) sock_put(sk);
1275 return NULL;
1278 if (sk) {
1279 skb_set_owner_w(skb, sk);
1280 sock_put(sk);
1283 ip_send_check(skb->nh.iph);
1284 skb->nfcache |= NFC_ALTERED;
1285 #ifdef CONFIG_NETFILTER_DEBUG
1286 /* Packet path as if nothing had happened. */
1287 skb->nf_debug = olddebug;
1288 #endif
1289 return skb;
1292 /* Used by ipt_REJECT. */
1293 static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
1295 struct ip_conntrack *ct;
1296 enum ip_conntrack_info ctinfo;
1298 ct = __ip_conntrack_get(nfct, &ctinfo);
1300 /* This ICMP is in reverse direction to the packet which
1301 caused it */
1302 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1303 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1304 else
1305 ctinfo = IP_CT_RELATED;
1307 /* Attach new skbuff, and increment count */
1308 nskb->nfct = &ct->infos[ctinfo];
1309 atomic_inc(&ct->ct_general.use);
1312 static inline int
1313 do_iter(const struct ip_conntrack_tuple_hash *i,
1314 int (*iter)(struct ip_conntrack *i, void *data),
1315 void *data)
1317 return iter(i->ctrack, data);
1320 /* Bring out ya dead! */
1321 static struct ip_conntrack_tuple_hash *
1322 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1323 void *data, unsigned int *bucket)
1325 struct ip_conntrack_tuple_hash *h = NULL;
1327 WRITE_LOCK(&ip_conntrack_lock);
1328 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1329 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1330 struct ip_conntrack_tuple_hash *, iter, data);
1331 if (h)
1332 break;
1334 if (!h)
1335 h = LIST_FIND_W(&unconfirmed, do_iter,
1336 struct ip_conntrack_tuple_hash *, iter, data);
1337 if (h)
1338 atomic_inc(&h->ctrack->ct_general.use);
1339 WRITE_UNLOCK(&ip_conntrack_lock);
1341 return h;
1344 void
1345 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1347 struct ip_conntrack_tuple_hash *h;
1348 unsigned int bucket = 0;
1350 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1351 /* Time to push up daisies... */
1352 if (del_timer(&h->ctrack->timeout))
1353 death_by_timeout((unsigned long)h->ctrack);
1354 /* ... else the timer will get him soon. */
1356 ip_conntrack_put(h->ctrack);
1360 /* Fast function for those who don't want to parse /proc (and I don't
1361 blame them). */
1362 /* Reversing the socket's dst/src point of view gives us the reply
1363 mapping. */
1364 static int
1365 getorigdst(struct sock *sk, int optval, void *user, int *len)
1367 struct ip_conntrack_tuple_hash *h;
1368 struct ip_conntrack_tuple tuple;
1370 IP_CT_TUPLE_U_BLANK(&tuple);
1371 tuple.src.ip = sk->rcv_saddr;
1372 tuple.src.u.tcp.port = sk->sport;
1373 tuple.dst.ip = sk->daddr;
1374 tuple.dst.u.tcp.port = sk->dport;
1375 tuple.dst.protonum = IPPROTO_TCP;
1377 /* We only do TCP at the moment: is there a better way? */
1378 if (strcmp(sk->prot->name, "TCP") != 0) {
1379 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1380 return -ENOPROTOOPT;
1383 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1384 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1385 *len, sizeof(struct sockaddr_in));
1386 return -EINVAL;
1389 h = ip_conntrack_find_get(&tuple, NULL);
1390 if (h) {
1391 struct sockaddr_in sin;
1393 sin.sin_family = AF_INET;
1394 sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1395 .tuple.dst.u.tcp.port;
1396 sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1397 .tuple.dst.ip;
1398 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1400 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1401 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1402 ip_conntrack_put(h->ctrack);
1403 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1404 return -EFAULT;
1405 else
1406 return 0;
1408 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1409 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1410 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1411 return -ENOENT;
1414 static struct nf_sockopt_ops so_getorigdst
1415 = { { NULL, NULL }, PF_INET,
1416 0, 0, NULL, /* Setsockopts */
1417 SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst,
1418 0, NULL };
1420 static int kill_all(struct ip_conntrack *i, void *data)
1422 return 1;
1425 /* Mishearing the voices in his head, our hero wonders how he's
1426 supposed to kill the mall. */
1427 void ip_conntrack_cleanup(void)
1429 ip_ct_attach = NULL;
1430 /* This makes sure all current packets have passed through
1431 netfilter framework. Roll on, two-stage module
1432 delete... */
1433 br_write_lock_bh(BR_NETPROTO_LOCK);
1434 br_write_unlock_bh(BR_NETPROTO_LOCK);
1436 i_see_dead_people:
1437 ip_ct_iterate_cleanup(kill_all, NULL);
1438 if (atomic_read(&ip_conntrack_count) != 0) {
1439 schedule();
1440 goto i_see_dead_people;
1443 kmem_cache_destroy(ip_conntrack_cachep);
1444 vfree(ip_conntrack_hash);
1445 nf_unregister_sockopt(&so_getorigdst);
1448 static int hashsize = 0;
1449 MODULE_PARM(hashsize, "i");
1451 int __init ip_conntrack_init(void)
1453 unsigned int i;
1454 int ret;
1456 /* SpeedMod: Hashtable size */
1457 #if 0
1458 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1459 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1460 if (hashsize) {
1461 ip_conntrack_htable_size = hashsize;
1462 } else {
1463 ip_conntrack_htable_size
1464 = (((num_physpages << PAGE_SHIFT) / 16384)
1465 / sizeof(struct list_head));
1466 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1467 ip_conntrack_htable_size = 8192;
1468 if (ip_conntrack_htable_size < 16)
1469 ip_conntrack_htable_size = 16;
1471 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1472 #else
1473 ip_conntrack_htable_size = 16384;
1474 ip_conntrack_max = 8192;
1475 #endif
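/* Editorial note: SpeedMod replaces the memory-based sizing in the #if 0 block
   above with fixed values: 16384 buckets (keeping the power-of-two mask in
   hash_conntrack() valid) and a conntrack limit of 8192, rather than the usual
   8 * ip_conntrack_htable_size. */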
1477 printk("ip_conntrack version %s (%u buckets, %d max)"
1478 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1479 ip_conntrack_htable_size, ip_conntrack_max,
1480 sizeof(struct ip_conntrack));
1482 ret = nf_register_sockopt(&so_getorigdst);
1483 if (ret != 0) {
1484 printk(KERN_ERR "Unable to register netfilter socket option\n");
1485 return ret;
1488 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1489 * ip_conntrack_htable_size);
1490 if (!ip_conntrack_hash) {
1491 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1492 goto err_unreg_sockopt;
1495 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1496 sizeof(struct ip_conntrack), 0,
1497 SLAB_HWCACHE_ALIGN, NULL, NULL);
1498 if (!ip_conntrack_cachep) {
1499 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1500 goto err_free_hash;
1502 /* Don't NEED lock here, but good form anyway. */
1503 WRITE_LOCK(&ip_conntrack_lock);
1504 /* Sew in builtin protocols. */
1505 list_append(&protocol_list, &ip_conntrack_protocol_tcp);
1506 list_append(&protocol_list, &ip_conntrack_protocol_udp);
1507 list_append(&protocol_list, &ip_conntrack_protocol_icmp);
1508 WRITE_UNLOCK(&ip_conntrack_lock);
1510 for (i = 0; i < ip_conntrack_htable_size; i++)
1511 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1513 /* For use by ipt_REJECT */
1514 ip_ct_attach = ip_conntrack_attach;
1515 return ret;
1517 err_free_hash:
1518 vfree(ip_conntrack_hash);
1519 err_unreg_sockopt:
1520 nf_unregister_sockopt(&so_getorigdst);
1522 return -ENOMEM;