Linux-2.6.12-rc2
net/ipv4/netfilter/ip_nat_core.c
/* NAT for netfilter; shared with compatibility layer. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/tcp.h>  /* For tcp_prot in getorigdst */
#include <linux/icmp.h>
#include <linux/udp.h>
#include <linux/jhash.h>

#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
#include <linux/netfilter_ipv4/ip_nat_core.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/listhelp.h>

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif
DECLARE_RWLOCK(ip_nat_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

static struct list_head *bysource;
struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
hash_by_src(const struct ip_conntrack_tuple *tuple)
{
        /* Original src, to ensure we map it consistently if poss. */
        return jhash_3words(tuple->src.ip, tuple->src.u.all,
                            tuple->dst.protonum, 0) % ip_nat_htable_size;
}
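/* Note: the hash covers only the original source address, source port and
 * protocol (not the destination), so all connections from the same source
 * triple land in the same bysource bucket.  find_appropriate_src() below
 * scans that bucket when trying to reuse an existing source mapping. */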
/* No one is using the conntrack by the time this is called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
        if (!(conn->status & IPS_NAT_DONE_MASK))
                return;

        WRITE_LOCK(&ip_nat_lock);
        list_del(&conn->nat.info.bysource);
        WRITE_UNLOCK(&ip_nat_lock);
}
/* We do checksum mangling, so if they were wrong before they're still
 * wrong.  Also works for incomplete packets (eg. ICMP dest
 * unreachables.) */
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
        u_int32_t diffs[] = { oldvalinv, newval };
        return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
                                      oldcheck ^ 0xFFFF));
}
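/* This is the usual incremental-checksum trick: fold the one's complement
 * of the old 32-bit value and the new value into the existing checksum
 * instead of recomputing it over the whole packet.  For example,
 * manip_pkt() below updates the IP header checksum when it rewrites the
 * source address:
 *
 *      iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
 *                                      iph->check);
 */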
/* Is this tuple already taken? (not by us) */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
                  const struct ip_conntrack *ignored_conntrack)
{
        /* Conntrack tracking doesn't keep track of outgoing tuples; only
           incoming ones.  NAT means they don't have a fixed mapping,
           so we invert the tuple and look for the incoming reply.

           We could keep a separate hash if this proves too slow. */
        struct ip_conntrack_tuple reply;

        invert_tuplepr(&reply, tuple);
        return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}
/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range. */
static int
in_range(const struct ip_conntrack_tuple *tuple,
         const struct ip_nat_range *range)
{
        struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);

        /* If we are supposed to map IPs, then we must be in the
           range specified, otherwise let this drag us onto a new src IP. */
        if (range->flags & IP_NAT_RANGE_MAP_IPS) {
                if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
                    || ntohl(tuple->src.ip) > ntohl(range->max_ip))
                        return 0;
        }

        if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
            || proto->in_range(tuple, IP_NAT_MANIP_SRC,
                               &range->min, &range->max))
                return 1;

        return 0;
}
static inline int
same_src(const struct ip_conntrack *ct,
         const struct ip_conntrack_tuple *tuple)
{
        return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
                == tuple->dst.protonum
                && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
                == tuple->src.ip
                && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
                == tuple->src.u.all);
}
/* Only called for SRC manip */
static int
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
                     struct ip_conntrack_tuple *result,
                     const struct ip_nat_range *range)
{
        unsigned int h = hash_by_src(tuple);
        struct ip_conntrack *ct;

        READ_LOCK(&ip_nat_lock);
        list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
                if (same_src(ct, tuple)) {
                        /* Copy source part from reply tuple. */
                        invert_tuplepr(result,
                                       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
                        result->dst = tuple->dst;

                        if (in_range(result, range)) {
                                READ_UNLOCK(&ip_nat_lock);
                                return 1;
                        }
                }
        }
        READ_UNLOCK(&ip_nat_lock);
        return 0;
}
/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.
*/
static void
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
                    const struct ip_nat_range *range,
                    const struct ip_conntrack *conntrack,
                    enum ip_nat_manip_type maniptype)
{
        u_int32_t *var_ipp;
        /* Host order */
        u_int32_t minip, maxip, j;

        /* No IP mapping?  Do nothing. */
        if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
                return;

        if (maniptype == IP_NAT_MANIP_SRC)
                var_ipp = &tuple->src.ip;
        else
                var_ipp = &tuple->dst.ip;

        /* Fast path: only one choice. */
        if (range->min_ip == range->max_ip) {
                *var_ipp = range->min_ip;
                return;
        }

        /* Hashing source and destination IPs gives a fairly even
         * spread in practice (if there are a small number of IPs
         * involved, there usually aren't that many connections
         * anyway).  The consistency means that servers see the same
         * client coming from the same IP (some Internet Banking sites
         * like this), even across reboots. */
        minip = ntohl(range->min_ip);
        maxip = ntohl(range->max_ip);
        j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
        *var_ipp = htonl(minip + j % (maxip - minip + 1));
}
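/* A quick worked example (illustrative only): with a range of
 * 192.168.0.10 - 192.168.0.13 there are maxip - minip + 1 = 4 candidate
 * addresses, so the hash j is reduced modulo 4 and the chosen address is
 * minip + (j % 4).  Since j depends only on the connection's source and
 * destination IPs, a given client is always mapped to the same address. */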
/* Manipulate the tuple into the range given.  For NF_IP_POST_ROUTING,
 * we change the source to map into the range.  For NF_IP_PRE_ROUTING
 * and NF_IP_LOCAL_OUT, we change the destination to map into the
 * range.  It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct ip_conntrack_tuple *tuple,
                 const struct ip_conntrack_tuple *orig_tuple,
                 const struct ip_nat_range *range,
                 struct ip_conntrack *conntrack,
                 enum ip_nat_manip_type maniptype)
{
        struct ip_nat_protocol *proto
                = ip_nat_find_proto(orig_tuple->dst.protonum);

        /* 1) If this srcip/proto/src-proto-part is currently mapped,
           and that same mapping gives a unique tuple within the given
           range, use that.

           This is only required for source (ie. NAT/masq) mappings.
           So far, we don't do local source mappings, so multiple
           manips not an issue. */
        if (maniptype == IP_NAT_MANIP_SRC) {
                if (find_appropriate_src(orig_tuple, tuple, range)) {
                        DEBUGP("get_unique_tuple: Found current src map\n");
                        if (!ip_nat_used_tuple(tuple, conntrack))
                                return;
                }
        }

        /* 2) Select the least-used IP/proto combination in the given
           range. */
        *tuple = *orig_tuple;
        find_best_ips_proto(tuple, range, conntrack, maniptype);

        /* 3) The per-protocol part of the manip is made to map into
           the range to make a unique tuple. */

        /* Only bother mapping if it's not already in range and unique */
        if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
             || proto->in_range(tuple, maniptype, &range->min, &range->max))
            && !ip_nat_used_tuple(tuple, conntrack))
                return;

        /* Last chance: get protocol to try to obtain unique tuple. */
        proto->unique_tuple(tuple, range, maniptype, conntrack);
}
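/* Establish the NAT binding for a freshly-seen connection.  HOOK2MANIP()
 * turns the hook number into the manipulation type: NF_IP_POST_ROUTING
 * gives a source manipulation (roughly, SNAT/masquerade), while the other
 * hooks give a destination manipulation (roughly, DNAT/redirect), matching
 * the description above get_unique_tuple(). */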
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
                  const struct ip_nat_range *range,
                  unsigned int hooknum)
{
        struct ip_conntrack_tuple curr_tuple, new_tuple;
        struct ip_nat_info *info = &conntrack->nat.info;
        int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
        enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);

        IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
                     || hooknum == NF_IP_POST_ROUTING
                     || hooknum == NF_IP_LOCAL_IN
                     || hooknum == NF_IP_LOCAL_OUT);
        BUG_ON(ip_nat_initialized(conntrack, maniptype));

        /* What we've got will look like inverse of reply.  Normally
           this is what is in the conntrack, except for prior
           manipulations (future optimization: if num_manips == 0,
           orig_tp =
           conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
        invert_tuplepr(&curr_tuple,
                       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

        get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);

        if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
                struct ip_conntrack_tuple reply;

                /* Alter conntrack table so will recognize replies. */
                invert_tuplepr(&reply, &new_tuple);
                ip_conntrack_alter_reply(conntrack, &reply);

                /* Non-atomic: we own this at the moment. */
                if (maniptype == IP_NAT_MANIP_SRC)
                        conntrack->status |= IPS_SRC_NAT;
                else
                        conntrack->status |= IPS_DST_NAT;
        }

        /* Place in source hash if this is the first time. */
        if (have_to_hash) {
                unsigned int srchash
                        = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                                      .tuple);
                WRITE_LOCK(&ip_nat_lock);
                list_add(&info->bysource, &bysource[srchash]);
                WRITE_UNLOCK(&ip_nat_lock);
        }

        /* It's done. */
        if (maniptype == IP_NAT_MANIP_DST)
                set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
        else
                set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);

        return NF_ACCEPT;
}
/* Returns true if succeeded. */
static int
manip_pkt(u_int16_t proto,
          struct sk_buff **pskb,
          unsigned int iphdroff,
          const struct ip_conntrack_tuple *target,
          enum ip_nat_manip_type maniptype)
{
        struct iphdr *iph;

        (*pskb)->nfcache |= NFC_ALTERED;
        if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
                return 0;

        iph = (void *)(*pskb)->data + iphdroff;

        /* Manipulate protocol part. */
        if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
                                                 target, maniptype))
                return 0;

        iph = (void *)(*pskb)->data + iphdroff;

        if (maniptype == IP_NAT_MANIP_SRC) {
                iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
                                                iph->check);
                iph->saddr = target->src.ip;
        } else {
                iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
                                                iph->check);
                iph->daddr = target->dst.ip;
        }
        return 1;
}
/* Do packet manipulations according to ip_nat_setup_info. */
unsigned int nat_packet(struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo,
                        unsigned int hooknum,
                        struct sk_buff **pskb)
{
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        unsigned long statusbit;
        enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);

        if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
            && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) {
                DEBUGP("ip_nat_core: adjusting sequence number\n");
                /* future: put this in a l4-proto specific function,
                 * and call this function here. */
                if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
                        return NF_DROP;
        }

        if (mtype == IP_NAT_MANIP_SRC)
                statusbit = IPS_SRC_NAT;
        else
                statusbit = IPS_DST_NAT;

        /* Invert if this is reply dir. */
        if (dir == IP_CT_DIR_REPLY)
                statusbit ^= IPS_NAT_MASK;

        /* Non-atomic: these bits don't change. */
        if (ct->status & statusbit) {
                struct ip_conntrack_tuple target;

                /* We are aiming to look like inverse of other direction. */
                invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

                if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
                        return NF_DROP;
        }
        return NF_ACCEPT;
}
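/* Why the XOR with IPS_NAT_MASK above: a binding set up in one direction
 * must be undone in the other.  For example, a connection source-NATed on
 * the way out sets IPS_SRC_NAT; when its reply arrives at PREROUTING (a
 * destination-manip hook) the XOR flips the bit being tested from
 * IPS_DST_NAT to IPS_SRC_NAT, so the reply's destination address is
 * rewritten back to the original client. */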
/* Dir is direction ICMP is coming from (opposite to packet it contains) */
int icmp_reply_translation(struct sk_buff **pskb,
                           struct ip_conntrack *ct,
                           enum ip_nat_manip_type manip,
                           enum ip_conntrack_dir dir)
{
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } *inside;
        struct ip_conntrack_tuple inner, target;
        int hdrlen = (*pskb)->nh.iph->ihl * 4;

        if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
                return 0;

        inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

        /* We're actually going to mangle it beyond trivial checksum
           adjustment, so make sure the current checksum is correct. */
        if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
                hdrlen = (*pskb)->nh.iph->ihl * 4;
                if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
                                                (*pskb)->len - hdrlen, 0)))
                        return 0;
        }

        /* Must be RELATED */
        IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
                     (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);

        /* Redirects on non-null nats must be dropped, else they'll
           start talking to each other without our translation, and be
           confused... --RR */
        if (inside->icmp.type == ICMP_REDIRECT) {
                /* If NAT isn't finished, assume it and drop. */
                if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
                        return 0;

                if (ct->status & IPS_NAT_MASK)
                        return 0;
        }

        DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
               *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");

        if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
                             sizeof(struct icmphdr) + inside->ip.ihl*4,
                             &inner, ip_ct_find_proto(inside->ip.protocol)))
                return 0;

        /* Change inner back to look like incoming packet.  We do the
           opposite manip on this hook to normal, because it might not
           pass all hooks (locally-generated ICMP).  Consider incoming
           packet: PREROUTING (DST manip), routing produces ICMP, goes
           through POSTROUTING (which must correct the DST manip). */
        if (!manip_pkt(inside->ip.protocol, pskb,
                       (*pskb)->nh.iph->ihl*4
                       + sizeof(inside->icmp),
                       &ct->tuplehash[!dir].tuple,
                       !manip))
                return 0;

        /* Reload "inside": manip_pkt on the inner packet may have
           reallocated the skb data. */
        inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

        inside->icmp.checksum = 0;
        inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
                                                       (*pskb)->len - hdrlen,
                                                       0));

        /* Change outer to look like the reply to an incoming packet
         * (proto 0 means don't invert per-proto part). */

        /* Obviously, we need to NAT destination IP, but source IP
           should be NAT'ed only if it is from a NAT'd host.

           Explanation: some people use NAT for anonymizing.  Also,
           CERT recommends dropping all packets from private IP
           addresses (although ICMP errors from internal links with
           such addresses are not too uncommon, as Alan Cox points
           out) */
        if (manip != IP_NAT_MANIP_SRC
            || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
                invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
                if (!manip_pkt(0, pskb, 0, &target, manip))
                        return 0;
        }

        return 1;
}
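/* Note on the final manip_pkt() call above: since protocol number 0 is never
 * registered, it falls through to ip_nat_unknown_protocol (see ip_nat_init()
 * below), which is expected to leave the per-protocol part alone.  So, as the
 * comment says, only the IP addresses of the outer header are rewritten; the
 * outer ICMP error itself has no ports to translate. */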
/* Protocol registration. */
int ip_nat_protocol_register(struct ip_nat_protocol *proto)
{
        int ret = 0;

        WRITE_LOCK(&ip_nat_lock);
        if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
                ret = -EBUSY;
                goto out;
        }
        ip_nat_protos[proto->protonum] = proto;
 out:
        WRITE_UNLOCK(&ip_nat_lock);
        return ret;
}
/* No one stores the protocol anywhere; simply delete it. */
void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
{
        WRITE_LOCK(&ip_nat_lock);
        ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
        WRITE_UNLOCK(&ip_nat_lock);

        /* Someone could be still looking at the proto in a bh. */
        synchronize_net();
}
int __init ip_nat_init(void)
{
        size_t i;

        /* Leave them the same for the moment. */
        ip_nat_htable_size = ip_conntrack_htable_size;

        /* Allocate the bysource hash table. */
        bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
        if (!bysource)
                return -ENOMEM;

        /* Sew in builtin protocols. */
        WRITE_LOCK(&ip_nat_lock);
        for (i = 0; i < MAX_IP_NAT_PROTO; i++)
                ip_nat_protos[i] = &ip_nat_unknown_protocol;
        ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
        ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
        ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
        WRITE_UNLOCK(&ip_nat_lock);

        for (i = 0; i < ip_nat_htable_size; i++) {
                INIT_LIST_HEAD(&bysource[i]);
        }

        /* FIXME: Man, this is a hack.  <SIGH> */
        IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
        ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

        /* Initialize fake conntrack so that NAT will skip it */
        ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
        return 0;
}
/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(struct ip_conntrack *i, void *data)
{
        memset(&i->nat, 0, sizeof(i->nat));
        i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
        return 0;
}
/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
        ip_ct_iterate_cleanup(&clean_nat, NULL);
        ip_conntrack_destroyed = NULL;
        vfree(bysource);
}