[linux-2.6.9-moxart.git] / net / ipv4 / netfilter / ip_nat_core.c
/* NAT for netfilter; shared with compatibility layer. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/tcp.h> /* For tcp_prot in getorigdst */
#include <linux/icmp.h>
#include <linux/udp.h>

#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
#include <linux/netfilter_ipv4/ip_nat_core.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/listhelp.h>

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_nat_lock);
DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

static struct list_head *bysource;
static struct list_head *byipsproto;
struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
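
/* The two tables above hold auxiliary hash chains linked through each
 * conntrack's nat.info: 'bysource' indexes connections by original
 * source manip, so an existing SNAT mapping can be reused
 * (find_appropriate_src() below), while 'byipsproto' indexes them by
 * src-ip/dst-ip/proto triple, so the least-used mapping can be found
 * quickly (count_maps() below). */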

/* We keep extra hashes for each conntrack, for fast searching. */
static inline size_t
hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
{
        /* Modified src and dst, to ensure we don't create two
           identical streams. */
        return (src + dst + proto) % ip_nat_htable_size;
}

static inline size_t
hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
{
        /* Original src, to ensure we map it consistently if poss. */
        return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
}

/* No one is using the conntrack by the time this is called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
        struct ip_nat_info *info = &conn->nat.info;
        unsigned int hs, hp;

        if (!info->initialized)
                return;

        hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
                         conn->tuplehash[IP_CT_DIR_ORIGINAL]
                         .tuple.dst.protonum);

        hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
                              conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
                              conn->tuplehash[IP_CT_DIR_REPLY]
                              .tuple.dst.protonum);

        WRITE_LOCK(&ip_nat_lock);
        list_del(&info->bysource);
        list_del(&info->byipsproto);
        WRITE_UNLOCK(&ip_nat_lock);
}

/* We do checksum mangling, so if they were wrong before they're still
 * wrong.  Also works for incomplete packets (eg. ICMP dest
 * unreachables.) */
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
        u_int32_t diffs[] = { oldvalinv, newval };
        return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
                                      oldcheck^0xFFFF));
}
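
/* The folding above is the standard incremental checksum update
 * (RFC 1071/1624): the Internet checksum is a one's-complement sum,
 * so replacing old value m by m' only requires
 *
 *     new_check = ~(~old_check + ~m + m')
 *
 * which is exactly what summing { ~m, m' } onto ~old_check computes.
 * Worked example with 16-bit values for brevity: old_check = 0x1234,
 * m = 0x0a00 -> m' = 0x0b00 gives ~(0xedcb + 0xf5ff + 0x0b00 with
 * end-around carry) = ~0xeecb = 0x1134, i.e. the checksum drops by
 * exactly the 0x0100 added to the data. */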

/* Is this tuple already taken? (not by us) */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
                  const struct ip_conntrack *ignored_conntrack)
{
        /* Conntrack tracking doesn't keep track of outgoing tuples; only
           incoming ones.  NAT means they don't have a fixed mapping,
           so we invert the tuple and look for the incoming reply.

           We could keep a separate hash if this proves too slow. */
        struct ip_conntrack_tuple reply;

        invert_tuplepr(&reply, tuple);
        return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}

/* Does tuple + the source manip come within the range mr */
static int
in_range(const struct ip_conntrack_tuple *tuple,
         const struct ip_conntrack_manip *manip,
         const struct ip_nat_multi_range *mr)
{
        struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
        unsigned int i;
        struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };

        for (i = 0; i < mr->rangesize; i++) {
                /* If we are allowed to map IPs, then we must be in the
                   range specified, otherwise we must be unchanged. */
                if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
                        if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
                            || (ntohl(newtuple.src.ip)
                                > ntohl(mr->range[i].max_ip)))
                                continue;
                } else {
                        if (newtuple.src.ip != tuple->src.ip)
                                continue;
                }

                if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
                    || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
                                       &mr->range[i].min, &mr->range[i].max))
                        return 1;
        }

        return 0;
}

static inline int
src_cmp(const struct ip_conntrack *ct,
        const struct ip_conntrack_tuple *tuple,
        const struct ip_nat_multi_range *mr)
{
        return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
                == tuple->dst.protonum
                && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
                == tuple->src.ip
                && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
                == tuple->src.u.all
                && in_range(tuple,
                            &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src, mr));
}

/* Only called for SRC manip */
static struct ip_conntrack_manip *
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
                     const struct ip_nat_multi_range *mr)
{
        unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
        struct ip_conntrack *ct;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        list_for_each_entry(ct, &bysource[h], nat.info.bysource)
                if (src_cmp(ct, tuple, mr))
                        return &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
        return NULL;
}

#ifdef CONFIG_IP_NF_NAT_LOCAL
/* If it's really a local destination manip, it may need to do a
   source manip too. */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
        struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
        struct rtable *rt;

        /* FIXME: IPTOS_TOS(iph->tos) --RR */
        if (ip_route_output_key(&rt, &fl) != 0) {
                DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
                       NIPQUAD(var_ip));
                return 0;
        }

        *other_ipp = rt->rt_src;
        ip_rt_put(rt);
        return 1;
}
#endif
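
/* The point of the routing lookup above: when a LOCAL_OUT packet has
 * its destination rewritten, the source address the local stack chose
 * may no longer match the route to the new destination, so we ask the
 * routing table which source it would pick (rt_src) and rewrite that
 * too.  This is also why it can override an explicit socket binding,
 * as the comments at the call sites note. */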

/* Simple way to iterate through all. */
static inline int fake_cmp(const struct ip_conntrack *ct,
                           u_int32_t src, u_int32_t dst, u_int16_t protonum,
                           unsigned int *score, const struct ip_conntrack *ct2)
{
        /* Compare backwards: we're dealing with OUTGOING tuples, and
           inside the conntrack is the REPLY tuple.  Don't count this
           conntrack. */
        if (ct != ct2
            && ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
            && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
            && (ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum == protonum))
                (*score)++;
        return 0;
}

static inline unsigned int
count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
           const struct ip_conntrack *conntrack)
{
        struct ip_conntrack *ct;
        unsigned int score = 0;
        unsigned int h;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        h = hash_by_ipsproto(src, dst, protonum);
        list_for_each_entry(ct, &byipsproto[h], nat.info.byipsproto)
                fake_cmp(ct, src, dst, protonum, &score, conntrack);

        return score;
}

/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.

   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
   range), we eliminate that and try again.  This is not the most
   efficient approach, but if you're worried about that, don't hand us
   ranges you don't really have. */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
                    const struct ip_nat_multi_range *mr,
                    const struct ip_conntrack *conntrack,
                    unsigned int hooknum)
{
        unsigned int i;
        struct {
                const struct ip_nat_range *range;
                unsigned int score;
                struct ip_conntrack_tuple tuple;
        } best = { NULL, 0xFFFFFFFF };
        u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
        static unsigned int randomness;

        if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
                var_ipp = &tuple->src.ip;
                saved_ip = tuple->dst.ip;
                other_ipp = &tuple->dst.ip;
        } else {
                var_ipp = &tuple->dst.ip;
                saved_ip = tuple->src.ip;
                other_ipp = &tuple->src.ip;
        }
        /* Don't do do_extra_mangle unless necessary (overrides
           explicit socket bindings, for example) */
        orig_dstip = tuple->dst.ip;

        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++) {
                /* Host order */
                u_int32_t minip, maxip, j;

                /* Don't do ranges which are already eliminated. */
                if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
                        continue;
                }

                if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
                        minip = ntohl(mr->range[i].min_ip);
                        maxip = ntohl(mr->range[i].max_ip);
                } else
                        minip = maxip = ntohl(*var_ipp);

                randomness++;
                for (j = 0; j < maxip - minip + 1; j++) {
                        unsigned int score;

                        *var_ipp = htonl(minip + (randomness + j)
                                         % (maxip - minip + 1));

                        /* Reset the other ip in case it was mangled by
                         * do_extra_mangle last time. */
                        *other_ipp = saved_ip;

#ifdef CONFIG_IP_NF_NAT_LOCAL
                        if (hooknum == NF_IP_LOCAL_OUT
                            && *var_ipp != orig_dstip
                            && !do_extra_mangle(*var_ipp, other_ipp)) {
                                DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
                                       i, NIPQUAD(*var_ipp));
                                /* Can't route?  This whole range part is
                                 * probably screwed, but keep trying
                                 * anyway. */
                                continue;
                        }
#endif

                        /* Count how many others map onto this. */
                        score = count_maps(tuple->src.ip, tuple->dst.ip,
                                           tuple->dst.protonum, conntrack);
                        if (score < best.score) {
                                /* Optimization: doesn't get any better than
                                   this. */
                                if (score == 0)
                                        return (struct ip_nat_range *)
                                                &mr->range[i];

                                best.score = score;
                                best.tuple = *tuple;
                                best.range = &mr->range[i];
                        }
                }
        }
        *tuple = best.tuple;

        /* Discard const. */
        return (struct ip_nat_range *)best.range;
}

/* Fast version doesn't iterate through hash chains, but only handles
   common case of single IP address (null NAT, masquerade) */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
                         const struct ip_nat_multi_range *mr,
                         const struct ip_conntrack *conntrack,
                         unsigned int hooknum)
{
        if (mr->rangesize != 1
            || (mr->range[0].flags & IP_NAT_RANGE_FULL)
            || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
                && mr->range[0].min_ip != mr->range[0].max_ip))
                return find_best_ips_proto(tuple, mr, conntrack, hooknum);

        if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
                if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
                        tuple->src.ip = mr->range[0].min_ip;
                else {
                        /* Only do extra mangle when required (breaks
                           socket binding) */
#ifdef CONFIG_IP_NF_NAT_LOCAL
                        if (tuple->dst.ip != mr->range[0].min_ip
                            && hooknum == NF_IP_LOCAL_OUT
                            && !do_extra_mangle(mr->range[0].min_ip,
                                                &tuple->src.ip))
                                return NULL;
#endif
                        tuple->dst.ip = mr->range[0].min_ip;
                }
        }

        /* Discard const. */
        return (struct ip_nat_range *)&mr->range[0];
}
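
/* Design note: with a single candidate address there is nothing to
 * score, so the fast path skips the byipsproto hash walk entirely;
 * any per-proto port clash is handled later in get_unique_tuple(). */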

static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
                 const struct ip_conntrack_tuple *orig_tuple,
                 const struct ip_nat_multi_range *mrr,
                 struct ip_conntrack *conntrack,
                 unsigned int hooknum)
{
        struct ip_nat_protocol *proto
                = ip_nat_find_proto(orig_tuple->dst.protonum);
        struct ip_nat_range *rptr;
        unsigned int i;
        int ret;

        /* We temporarily use flags for marking full parts, but we
           always clean up afterwards */
        struct ip_nat_multi_range *mr = (void *)mrr;

        /* 1) If this srcip/proto/src-proto-part is currently mapped,
           and that same mapping gives a unique tuple within the given
           range, use that.

           This is only required for source (ie. NAT/masq) mappings.
           So far, we don't do local source mappings, so multiple
           manips not an issue. */
        if (hooknum == NF_IP_POST_ROUTING) {
                struct ip_conntrack_manip *manip;

                manip = find_appropriate_src(orig_tuple, mr);
                if (manip) {
                        /* Apply same source manipulation. */
                        *tuple = ((struct ip_conntrack_tuple)
                                  { *manip, orig_tuple->dst });
                        DEBUGP("get_unique_tuple: Found current src map\n");
                        if (!ip_nat_used_tuple(tuple, conntrack))
                                return 1;
                }
        }

        /* 2) Select the least-used IP/proto combination in the given
           range. */
        *tuple = *orig_tuple;
        while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
               != NULL) {
                DEBUGP("Found best for "); DUMP_TUPLE(tuple);
                /* 3) The per-protocol part of the manip is made to
                   map into the range to make a unique tuple. */

                /* Only bother mapping if it's not already in range
                   and unique */
                if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
                     || proto->in_range(tuple, HOOK2MANIP(hooknum),
                                        &rptr->min, &rptr->max))
                    && !ip_nat_used_tuple(tuple, conntrack)) {
                        ret = 1;
                        goto clear_fulls;
                } else {
                        if (proto->unique_tuple(tuple, rptr,
                                                HOOK2MANIP(hooknum),
                                                conntrack)) {
                                /* Must be unique. */
                                IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
                                                                conntrack));
                                ret = 1;
                                goto clear_fulls;
                        } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
                                /* Try implicit source NAT; protocol
                                   may be able to play with ports to
                                   make it unique. */
                                struct ip_nat_range r
                                        = { IP_NAT_RANGE_MAP_IPS,
                                            tuple->src.ip, tuple->src.ip,
                                            { 0 }, { 0 } };
                                DEBUGP("Trying implicit mapping\n");
                                if (proto->unique_tuple(tuple, &r,
                                                        IP_NAT_MANIP_SRC,
                                                        conntrack)) {
                                        /* Must be unique. */
                                        IP_NF_ASSERT(!ip_nat_used_tuple
                                                     (tuple, conntrack));
                                        ret = 1;
                                        goto clear_fulls;
                                }
                        }
                        DEBUGP("Protocol can't get unique tuple %u.\n",
                               hooknum);
                }

                /* Eliminate that from range, and try again. */
                rptr->flags |= IP_NAT_RANGE_FULL;
                *tuple = *orig_tuple;
        }
        ret = 0;

 clear_fulls:
        /* Clear full flags. */
        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++)
                mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

        return ret;
}
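
/* Illustrative walk-through of the three steps above, with made-up
 * values: suppose SNAT to 1.2.3.4 and an outgoing tuple
 * 10.0.0.1:4321 -> 5.6.7.8:80.  Step 1: if 10.0.0.1 already maps to
 * 1.2.3.4:4321 for another connection and that tuple is free here,
 * reuse it.  Step 2: otherwise pick the least-used NAT address
 * (trivial for a single-address range).  Step 3: keep the source port
 * if 1.2.3.4:4321 -> 5.6.7.8:80 is unused; if it collides, the
 * protocol's unique_tuple() picks another port within the range. */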

/* Where to manip the reply packets (will be reverse manip). */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
#endif
};

unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
                  const struct ip_nat_multi_range *mr,
                  unsigned int hooknum)
{
        struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
        struct ip_conntrack_tuple orig_tp;
        struct ip_nat_info *info = &conntrack->nat.info;
        int in_hashes = info->initialized;

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
                     || hooknum == NF_IP_POST_ROUTING
                     || hooknum == NF_IP_LOCAL_IN
                     || hooknum == NF_IP_LOCAL_OUT);
        IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
        IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));

        /* What we've got will look like inverse of reply.  Normally
           this is what is in the conntrack, except for prior
           manipulations (future optimization: if num_manips == 0,
           orig_tp = conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
        invert_tuplepr(&orig_tp,
                       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

#if 0
        {
        unsigned int i;

        DEBUGP("Hook %u (%s), ", hooknum,
               HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
        DUMP_TUPLE(&orig_tp);
        DEBUGP("Range %p: ", mr);
        for (i = 0; i < mr->rangesize; i++) {
                DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
                       i,
                       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
                       ? " MAP_IPS" : "",
                       (mr->range[i].flags
                        & IP_NAT_RANGE_PROTO_SPECIFIED)
                       ? " PROTO_SPECIFIED" : "",
                       (mr->range[i].flags & IP_NAT_RANGE_FULL)
                       ? " FULL" : "",
                       NIPQUAD(mr->range[i].min_ip),
                       NIPQUAD(mr->range[i].max_ip),
                       mr->range[i].min.all,
                       mr->range[i].max.all);
        }
        }
#endif

        do {
                if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
                                      hooknum)) {
                        DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
                               conntrack);
                        return NF_DROP;
                }

#if 0
                DEBUGP("Hook %u (%s) %p\n", hooknum,
                       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
                       conntrack);
                DEBUGP("Original: ");
                DUMP_TUPLE(&orig_tp);
                DEBUGP("New: ");
                DUMP_TUPLE(&new_tuple);
#endif

                /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
                   the original (A/B/C/D') and the mangled one (E/F/G/H').

                   We're only allowed to work with the SRC per-proto
                   part, so we create inverses of both to start, then
                   derive the other fields we need. */

                /* Reply connection: simply invert the new tuple
                   (G/H/E/F') */
                invert_tuplepr(&reply, &new_tuple);

                /* Alter conntrack table so it recognizes replies.
                   If we lose this race (reply tuple now used), repeat. */
        } while (!ip_conntrack_alter_reply(conntrack, &reply));

        /* FIXME: We can simply use the existing conntrack reply tuple
           here --RR */
        /* Create inverse of original: C/D/A/B' */
        invert_tuplepr(&inv_tuple, &orig_tp);

        /* Has source changed? */
        if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_SRC, new_tuple.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a destination manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_DST, orig_tp.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* Has destination changed? */
        if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a destination manip */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_DST, reply.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_SRC, inv_tuple.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* If there's a helper, assign it; based on new tuple. */
        if (!conntrack->master)
                info->helper = __ip_nat_find_helper(&reply);

        /* It's done. */
        info->initialized |= (1 << HOOK2MANIP(hooknum));

        if (in_hashes)
                replace_in_hashes(conntrack, info);
        else
                place_in_hashes(conntrack, info);

        return NF_ACCEPT;
}
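
/* Bookkeeping note: each ip_nat_setup_info() call records at most two
 * manips (one per direction), and it runs at most once for the SRC
 * and once for the DST manip type per connection, so at most four
 * entries are ever recorded; the IP_NF_ASSERTs against
 * IP_NAT_MAX_MANIPS above guard the array bounds. */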

void replace_in_hashes(struct ip_conntrack *conntrack,
                       struct ip_nat_info *info)
{
        /* Source has changed, so replace in hashes. */
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place the packet as seen OUTGOING in the byipsproto hash
           (ie. reverse dst and src of reply packet). */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        list_move(&info->bysource, &bysource[srchash]);
        list_move(&info->byipsproto, &byipsproto[ipsprotohash]);
}

void place_in_hashes(struct ip_conntrack *conntrack,
                     struct ip_nat_info *info)
{
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place the packet as seen OUTGOING in the byipsproto hash
           (ie. reverse dst and src of reply packet). */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        list_add(&info->bysource, &bysource[srchash]);
        list_add(&info->byipsproto, &byipsproto[ipsprotohash]);
}

/* Returns true if succeeded. */
static int
manip_pkt(u_int16_t proto,
          struct sk_buff **pskb,
          unsigned int iphdroff,
          const struct ip_conntrack_manip *manip,
          enum ip_nat_manip_type maniptype)
{
        struct iphdr *iph;

        (*pskb)->nfcache |= NFC_ALTERED;
        /* The whole IP header must be writable: we touch the checksum
           and an address field below. */
        if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
                return 0;

        iph = (void *)(*pskb)->data + iphdroff;

        /* Manipulate protocol part. */
        if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff + iph->ihl*4,
                                                 manip, maniptype))
                return 0;

        iph = (void *)(*pskb)->data + iphdroff;

        if (maniptype == IP_NAT_MANIP_SRC) {
                iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
                                                iph->check);
                iph->saddr = manip->ip;
        } else {
                iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
                                                iph->check);
                iph->daddr = manip->ip;
        }
        return 1;
}
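
/* Note the rewrite order above: the per-protocol handler runs first,
 * rewriting the port and folding both the port and the pending address
 * change into the transport checksum (whose pseudo-header covers the
 * addresses); only then are the IP addresses themselves and the IP
 * header checksum updated incrementally via ip_nat_cheat_check(). */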

static inline int exp_for_packet(struct ip_conntrack_expect *exp,
                                 struct sk_buff *skb)
{
        struct ip_conntrack_protocol *proto;
        int ret = 1;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        proto = ip_ct_find_proto(skb->nh.iph->protocol);
        if (proto->exp_matches_pkt)
                ret = proto->exp_matches_pkt(exp, skb);

        return ret;
}

/* Do packet manipulations according to binding. */
unsigned int
do_bindings(struct ip_conntrack *ct,
            enum ip_conntrack_info ctinfo,
            struct ip_nat_info *info,
            unsigned int hooknum,
            struct sk_buff **pskb)
{
        unsigned int i;
        struct ip_nat_helper *helper;
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        int proto = (*pskb)->nh.iph->protocol;

        /* Need the nat lock to protect against modification, but neither
           the conntrack (referenced) nor the helper (deleted with
           synchronize_bh()) can vanish. */
        READ_LOCK(&ip_nat_lock);
        for (i = 0; i < info->num_manips; i++) {
                if (info->manips[i].direction == dir
                    && info->manips[i].hooknum == hooknum) {
                        DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
                               *pskb,
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "SRC" : "DST",
                               NIPQUAD(info->manips[i].manip.ip),
                               htons(info->manips[i].manip.u.all));
                        if (!manip_pkt(proto, pskb, 0,
                                       &info->manips[i].manip,
                                       info->manips[i].maniptype)) {
                                READ_UNLOCK(&ip_nat_lock);
                                return NF_DROP;
                        }
                }
        }
        helper = info->helper;
        READ_UNLOCK(&ip_nat_lock);

        if (helper) {
                struct ip_conntrack_expect *exp = NULL;
                struct list_head *cur_item;
                int ret = NF_ACCEPT;
                int helper_called = 0;

                DEBUGP("do_bindings: helper exists for (%p)\n", ct);

                /* Always defragged for helpers */
                IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
                               & htons(IP_MF|IP_OFFSET)));

                /* Have to grab read lock before sibling_list traversal */
                READ_LOCK(&ip_conntrack_lock);
                list_for_each_prev(cur_item, &ct->sibling_list) {
                        exp = list_entry(cur_item, struct ip_conntrack_expect,
                                         expected_list);

                        /* if this expectation is already established, skip */
                        if (exp->sibling)
                                continue;

                        if (exp_for_packet(exp, *pskb)) {
                                /* FIXME: May be true multiple times in the
                                 * case of UDP!! */
                                DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
                                ret = helper->help(ct, exp, info, ctinfo,
                                                   hooknum, pskb);
                                if (ret != NF_ACCEPT) {
                                        READ_UNLOCK(&ip_conntrack_lock);
                                        return ret;
                                }
                                helper_called = 1;
                        }
                }
                /* Helper might want to manip the packet even when there is no
                   matching expectation for this packet */
                if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
                        DEBUGP("calling nat helper for packet without expectation\n");
                        ret = helper->help(ct, NULL, info, ctinfo,
                                           hooknum, pskb);
                        if (ret != NF_ACCEPT) {
                                READ_UNLOCK(&ip_conntrack_lock);
                                return ret;
                        }
                }
                READ_UNLOCK(&ip_conntrack_lock);

                /* Adjust sequence number only once per packet
                 * (helper is called at all hooks) */
                if (proto == IPPROTO_TCP
                    && (hooknum == NF_IP_POST_ROUTING
                        || hooknum == NF_IP_LOCAL_IN)) {
                        DEBUGP("ip_nat_core: adjusting sequence number\n");
                        /* future: put this in a l4-proto specific function,
                         * and call this function here. */
                        if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
                                ret = NF_DROP;
                }

                return ret;
        } else
                return NF_ACCEPT;

        /* not reached */
}
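
/* The manips recorded by ip_nat_setup_info() are replayed here for
 * every packet: only entries whose direction and hook match the
 * current traversal are applied, so e.g. an SNAT binding rewrites the
 * source at POST_ROUTING in the original direction and the
 * destination at PRE_ROUTING (its opposite_hook) in the reply
 * direction. */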

int
icmp_reply_translation(struct sk_buff **pskb,
                       struct ip_conntrack *conntrack,
                       unsigned int hooknum,
                       int dir)
{
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } *inside;
        unsigned int i;
        struct ip_nat_info *info = &conntrack->nat.info;
        int hdrlen;

        if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4 + sizeof(*inside)))
                return 0;
        inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

        /* We're actually going to mangle it beyond trivial checksum
           adjustment, so make sure the current checksum is correct. */
        if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
                hdrlen = (*pskb)->nh.iph->ihl * 4;
                if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
                                                (*pskb)->len - hdrlen, 0)))
                        return 0;
        }

        /* Must be RELATED */
        IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
                     (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);

        /* Redirects on non-null nats must be dropped, else they'll
           start talking to each other without our translation, and be
           confused... --RR */
        if (inside->icmp.type == ICMP_REDIRECT) {
                /* Don't care about races here. */
                if (info->initialized
                    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
                    || info->num_manips != 0)
                        return 0;
        }

        DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
               *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
        /* Note: May not be from a NAT'd host, but probably safest to
           do translation always as if it came from the host itself
           (even though a "host unreachable" coming from the host
           itself is a bit weird).

           More explanation: some people use NAT for anonymizing.
           Also, CERT recommends dropping all packets from private IP
           addresses (although ICMP errors from internal links with
           such addresses are not too uncommon, as Alan Cox points
           out) */

        READ_LOCK(&ip_nat_lock);
        for (i = 0; i < info->num_manips; i++) {
                DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
                       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
                       "ORIG" : "REPLY", info->manips[i].hooknum);

                if (info->manips[i].direction != dir)
                        continue;

                /* Mapping the inner packet is just like a normal
                   packet, except it was never src/dst reversed, so
                   where we would normally apply a dst manip, we apply
                   a src, and vice versa. */
                if (info->manips[i].hooknum == hooknum) {
                        DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "DST" : "SRC",
                               NIPQUAD(info->manips[i].manip.ip),
                               ntohs(info->manips[i].manip.u.udp.port));
                        if (!manip_pkt(inside->ip.protocol, pskb,
                                       (*pskb)->nh.iph->ihl*4
                                       + sizeof(inside->icmp),
                                       &info->manips[i].manip,
                                       !info->manips[i].maniptype))
                                goto unlock_fail;

                        /* Outer packet needs to have IP header NATed like
                           it's a reply. */

                        /* Use mapping to map outer packet: 0 gives no
                           per-proto mapping */
                        DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "SRC" : "DST",
                               NIPQUAD(info->manips[i].manip.ip));
                        if (!manip_pkt(0, pskb, 0,
                                       &info->manips[i].manip,
                                       info->manips[i].maniptype))
                                goto unlock_fail;
                }
        }
        READ_UNLOCK(&ip_nat_lock);

        hdrlen = (*pskb)->nh.iph->ihl * 4;

        inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

        inside->icmp.checksum = 0;
        inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
                                                       (*pskb)->len - hdrlen,
                                                       0));
        return 1;

 unlock_fail:
        READ_UNLOCK(&ip_nat_lock);
        return 0;
}
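
/* Worked example (illustrative addresses): host 10.0.0.1 is SNAT'd to
 * 1.2.3.4 and sends to 5.6.7.8, which answers with an ICMP error.  The
 * error arrives with outer dst 1.2.3.4 and embeds the packet as seen
 * on the wire: inner src 1.2.3.4, inner dst 5.6.7.8.  The outer header
 * is translated like any reply (dst -> 10.0.0.1), while the embedded
 * header was never direction-reversed, so the same manip is applied
 * with its type inverted (the '!maniptype' above): inner src
 * 1.2.3.4 -> 10.0.0.1.  The ICMP checksum is then recomputed from
 * scratch, since the payload was mangled. */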

int __init ip_nat_init(void)
{
        size_t i;

        /* Leave them the same for the moment. */
        ip_nat_htable_size = ip_conntrack_htable_size;

        /* One vmalloc for both hash tables */
        bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
        if (!bysource) {
                return -ENOMEM;
        }
        byipsproto = bysource + ip_nat_htable_size;

        /* Sew in builtin protocols. */
        WRITE_LOCK(&ip_nat_lock);
        for (i = 0; i < MAX_IP_NAT_PROTO; i++)
                ip_nat_protos[i] = &ip_nat_unknown_protocol;
        ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
        ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
        ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
        WRITE_UNLOCK(&ip_nat_lock);

        for (i = 0; i < ip_nat_htable_size; i++) {
                INIT_LIST_HEAD(&bysource[i]);
                INIT_LIST_HEAD(&byipsproto[i]);
        }

        /* FIXME: Man, this is a hack.  <SIGH> */
        IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
        ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

        /* Initialize fake conntrack so that NAT will skip it */
        ip_conntrack_untracked.nat.info.initialized |=
                (1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST);

        return 0;
}

/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(const struct ip_conntrack *i, void *data)
{
        memset((void *)&i->nat, 0, sizeof(i->nat));
        return 0;
}

/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
        ip_ct_selective_cleanup(&clean_nat, NULL);
        ip_conntrack_destroyed = NULL;
        vfree(bysource);
}