release/src-rt/linux/linux-2.6/net/ipv4/netfilter/nf_nat_core.c

   1 /* NAT for netfilter; shared with compatibility layer. */
   2
   3 /* (C) 1999-2001 Paul `Rusty' Russell
   4  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 as
   8  * published by the Free Software Foundation.
   9  */
  10
  11 #include <linux/module.h>
  12 #include <linux/types.h>
  13 #include <linux/timer.h>
  14 #include <linux/skbuff.h>
  15 #include <linux/vmalloc.h>
  16 #include <net/checksum.h>
  17 #include <net/icmp.h>
  18 #include <net/ip.h>
  19 #include <net/tcp.h>  /* For tcp_prot in getorigdst */
  20 #include <linux/icmp.h>
  21 #include <linux/udp.h>
  22 #include <linux/jhash.h>
  23
  24 #include <linux/netfilter_ipv4.h>
  25 #include <net/netfilter/nf_conntrack.h>
  26 #include <net/netfilter/nf_conntrack_core.h>
  27 #include <net/netfilter/nf_nat.h>
  28 #include <net/netfilter/nf_nat_protocol.h>
  29 #include <net/netfilter/nf_nat_core.h>
  30 #include <net/netfilter/nf_nat_helper.h>
  31 #include <net/netfilter/nf_conntrack_helper.h>
  32 #include <net/netfilter/nf_conntrack_l3proto.h>
  33 #include <net/netfilter/nf_conntrack_l4proto.h>
  34 #include <linux/netfilter_ipv4/ipt_cone.h>
  35
  36 #ifdef HNDCTF
  37 #include <linux/if.h>
  38 #include <linux/if_vlan.h>
  39 #include <typedefs.h>
  40 #include <osl.h>
  41 #include <ctf/hndctf.h>
  42
  43 #define NFC_CTF_ENABLED (1 << 31)
  44 #endif /* HNDCTF */
  45
  46 #if 0
  47 #define DEBUGP printk
  48 #else
  49 #define DEBUGP(format, args...)
  50 #endif
  51
  52 static DEFINE_RWLOCK(nf_nat_lock);
  53
  54 static struct nf_conntrack_l3proto *l3proto = NULL;
  55
  56 /* Calculated at init based on memory size */
  57 static unsigned int nf_nat_htable_size;
  58
  59 static struct list_head *bysource;
  60
  61 #define MAX_IP_NAT_PROTO 256
  62 static struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO];
  63
  64 static inline struct nf_nat_protocol *
  65 __nf_nat_proto_find(u_int8_t protonum)
  66 {
  67         return rcu_dereference(nf_nat_protos[protonum]);
  68 }
  69
  70 struct nf_nat_protocol *
  71 nf_nat_proto_find_get(u_int8_t protonum)
  72 {
  73         struct nf_nat_protocol *p;
  74
  75         rcu_read_lock();
  76         p = __nf_nat_proto_find(protonum);
  77         if (!try_module_get(p->me))
  78                 p = &nf_nat_unknown_protocol;
  79         rcu_read_unlock();
  80
  81         return p;
  82 }
  83 EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
  84
  85 void
  86 nf_nat_proto_put(struct nf_nat_protocol *p)
  87 {
  88         module_put(p->me);
  89 }
  90 EXPORT_SYMBOL_GPL(nf_nat_proto_put);
  91
  92 /* We keep an extra hash for each conntrack, for fast searching. */
  93 static inline unsigned int
  94 hash_by_src(const struct nf_conntrack_tuple *tuple)
  95 {
  96         unsigned int hash;
  97
  98         /* Original src, to ensure we map it consistently if poss. */
  99         hash = jhash_3words((__force u32)tuple->src.u3.ip,
 100                             (__force u32)tuple->src.u.all,
 101                             tuple->dst.protonum, 0);
 102         return ((u64)hash * nf_nat_htable_size) >> 32;
 103 }
 104
 105 #ifdef HNDCTF
 106 extern int ipv4_conntrack_fastnat;
 107
 108 bool
 109 ip_conntrack_is_ipc_allowed(struct sk_buff *skb, u_int32_t hooknum)
 110 {
 111         struct net_device *dev;
 112
 113         if (!ipv4_conntrack_fastnat || !CTF_ENAB(kcih))
 114                 return FALSE;
 115
 116         if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_POST_ROUTING) {
 117                 dev = skb->dev;
 118                 if (dev->priv_flags & IFF_802_1Q_VLAN)
 119                         dev = VLAN_DEV_INFO(dev)->real_dev;
 120
 121                 /* Add ipc entry if packet is received on ctf enabled interface
 122                  * and the packet is not a defrag'd one.
 123                  */
 124                 if (ctf_isenabled(kcih, dev) && (skb->len <= dev->mtu))
 125                         skb->nfcache |= NFC_CTF_ENABLED;
 126         }
 127
 128         /* Add the cache entries only if the device has registered and
 129          * enabled ctf.
 130          */
 131         if (skb->nfcache & NFC_CTF_ENABLED)
 132                 return TRUE;
 133
 134         return FALSE;
 135 }
 136 #ifdef CONFIG_BCM_NAT_MODULE
 137 EXPORT_SYMBOL(ip_conntrack_is_ipc_allowed);
 138 #endif
 139
 140 void
 141 ip_conntrack_ipct_add(struct sk_buff *skb, u_int32_t hooknum,
 142                       struct nf_conn *ct, enum ip_conntrack_info ci,
 143                       struct nf_conntrack_tuple *manip)
 144 {
 145         ctf_ipc_t ipc_entry;
 146         struct hh_cache *hh;
 147         struct ethhdr *eth;
 148         struct iphdr *iph;
 149         struct tcphdr *tcph;
 150         u_int32_t daddr;
 151         struct rtable *rt;
 152         struct nf_conn_help *help;
 153         enum ip_conntrack_dir dir;
 154
 155         if ((skb == NULL) || (ct == NULL))
 156                 return;
 157
 158         /* Check CTF enabled */
 159         if (!ip_conntrack_is_ipc_allowed(skb, hooknum))
 160                 return;
 161
 162         /* We only add cache entires for non-helper connections and at
 163          * pre or post routing hooks.
 164          */
 165         help = nfct_help(ct);
 166         if ((help && help->helper) || (ct->ctf_flags & CTF_FLAGS_EXCLUDED) ||
 167             ((hooknum != NF_IP_PRE_ROUTING) && (hooknum != NF_IP_POST_ROUTING)))
 168                 return;
 169
 170         /* Add ipc entries for connections in established state only */
 171         if ((ci != IP_CT_ESTABLISHED) && (ci != (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)))
 172                 return;
 173
 174         iph = ip_hdr(skb);
 175         if (iph->version != 4 ||
 176             (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP))
 177                 return;
 178
 179         if (iph->protocol == IPPROTO_TCP &&
 180             ct->proto.tcp.state >= TCP_CONNTRACK_FIN_WAIT &&
 181             ct->proto.tcp.state <= TCP_CONNTRACK_TIME_WAIT)
 182                 return;
 183
 184         dir = CTINFO2DIR(ci);
 185         if (ct->ctf_flags & (1 << dir))
 186                 return;
 187
 188         /* Do route lookup for alias address if we are doing DNAT in this
 189          * direction.
 190          */
 191         daddr = iph->daddr;
 192         if ((manip != NULL) && (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST))
 193                 daddr = manip->dst.u3.ip;
 194
 195         /* Find the destination interface */
 196         if (skb->dst == NULL)
 197                 ip_route_input(skb, daddr, iph->saddr, iph->tos, skb->dev);
 198
 199         /* Ensure the packet belongs to a forwarding connection and it is
 200          * destined to an unicast address.
 201          */
 202         rt = (struct rtable *)skb->dst;
 203         if ((rt == NULL) || (rt->u.dst.input != ip_forward) ||
 204             (rt->rt_type != RTN_UNICAST) || (rt->u.dst.neighbour == NULL) ||
 205             ((rt->u.dst.neighbour->nud_state &
 206              (NUD_PERMANENT|NUD_REACHABLE|NUD_STALE|NUD_DELAY|NUD_PROBE)) == 0))
 207                 return;
 208
 209         memset(&ipc_entry, 0, sizeof(ipc_entry));
 210
 211         /* Init the neighboring sender address */
 212         memcpy(ipc_entry.sa.octet, eth_hdr(skb)->h_source, ETH_ALEN);
 213
 214         /* If the packet is received on a bridge device then save
 215          * the bridge cache entry pointer in the ip cache entry.
 216          * This will be referenced in the data path to update the
 217          * live counter of brc entry whenever a received packet
 218          * matches corresponding ipc entry matches.
 219          */
 220         if ((skb->dev != NULL) && ctf_isbridge(kcih, skb->dev))
 221                 ipc_entry.brcp = ctf_brc_lkup(kcih, eth_hdr(skb)->h_source);
 222
 223         hh = skb->dst->hh;
 224         if (hh != NULL) {
 225                 eth = (struct ethhdr *)(((unsigned char *)hh->hh_data) + 2);
 226                 memcpy(ipc_entry.dhost.octet, eth->h_dest, ETH_ALEN);
 227                 memcpy(ipc_entry.shost.octet, eth->h_source, ETH_ALEN);
 228         } else {
 229                 memcpy(ipc_entry.dhost.octet, rt->u.dst.neighbour->ha, ETH_ALEN);
 230                 memcpy(ipc_entry.shost.octet, skb->dst->dev->dev_addr, ETH_ALEN);
 231         }
 232
 233         tcph = ((struct tcphdr *)(((__u8 *)iph) + (iph->ihl << 2)));
 234
 235         /* Add ctf ipc entry for this direction */
 236         ipc_entry.tuple.sip = iph->saddr;
 237         ipc_entry.tuple.dip = iph->daddr;
 238         ipc_entry.tuple.proto = iph->protocol;
 239         ipc_entry.tuple.sp = tcph->source;
 240         ipc_entry.tuple.dp = tcph->dest;
 241
 242         ipc_entry.next = NULL;
 243
 244         /* For vlan interfaces fill the vlan id and the tag/untag actions */
 245         if (skb->dst->dev->priv_flags & IFF_802_1Q_VLAN) {
 246                 ipc_entry.txif = (void *)(VLAN_DEV_INFO(skb->dst->dev)->real_dev);
 247                 ipc_entry.vid = VLAN_DEV_INFO(skb->dst->dev)->vlan_id;
 248                 ipc_entry.action = ((VLAN_DEV_INFO(skb->dst->dev)->flags & 1) ?
 249                                     CTF_ACTION_TAG : CTF_ACTION_UNTAG);
 250         } else {
 251                 ipc_entry.txif = skb->dst->dev;
 252                 ipc_entry.action = CTF_ACTION_UNTAG;
 253         }
 254
 255         /* Update the manip ip and port */
 256         if (manip != NULL) {
 257                 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
 258                         ipc_entry.nat.ip = manip->src.u3.ip;
 259                         ipc_entry.nat.port = manip->src.u.tcp.port;
 260                         ipc_entry.action |= CTF_ACTION_SNAT;
 261                 } else {
 262                         ipc_entry.nat.ip = manip->dst.u3.ip;
 263                         ipc_entry.nat.port = manip->dst.u.tcp.port;
 264                         ipc_entry.action |= CTF_ACTION_DNAT;
 265                 }
 266         }
 267
 268         /* Do bridge cache lookup to determine outgoing interface
 269          * and any vlan tagging actions if needed.
 270          */
 271         if (ctf_isbridge(kcih, ipc_entry.txif)) {
 272                 ctf_brc_t *brcp;
 273
 274                 brcp = ctf_brc_lkup(kcih, ipc_entry.dhost.octet);
 275
 276                 if (brcp == NULL)
 277                         return;
 278                 else {
 279                         ipc_entry.action |= brcp->action;
 280                         ipc_entry.txif = brcp->txifp;
 281                         ipc_entry.vid = brcp->vid;
 282                 }
 283         }
 284
 285 #ifdef DEBUG
 286         printk("%s: Adding ipc entry for [%d]%u.%u.%u.%u:%u - %u.%u.%u.%u:%u\n", __FUNCTION__,
 287                         ipc_entry.tuple.proto,
 288                         NIPQUAD(ipc_entry.tuple.sip), ntohs(ipc_entry.tuple.sp),
 289                         NIPQUAD(ipc_entry.tuple.dip), ntohs(ipc_entry.tuple.dp));
 290         printk("sa %02x:%02x:%02x:%02x:%02x:%02x\n",
 291                         ipc_entry.shost.octet[0], ipc_entry.shost.octet[1],
 292                         ipc_entry.shost.octet[2], ipc_entry.shost.octet[3],
 293                         ipc_entry.shost.octet[4], ipc_entry.shost.octet[5]);
 294         printk("da %02x:%02x:%02x:%02x:%02x:%02x\n",
 295                         ipc_entry.dhost.octet[0], ipc_entry.dhost.octet[1],
 296                         ipc_entry.dhost.octet[2], ipc_entry.dhost.octet[3],
 297                         ipc_entry.dhost.octet[4], ipc_entry.dhost.octet[5]);
 298         printk("[%d] vid: %d action %x\n", hooknum, ipc_entry.vid, ipc_entry.action);
 299         if (manip != NULL)
 300                 printk("manip_ip: %u.%u.%u.%u manip_port %u\n",
 301                         NIPQUAD(ipc_entry.nat.ip), ntohs(ipc_entry.nat.port));
 302         printk("txif: %s\n", ((struct net_device *)ipc_entry.txif)->name);
 303 #endif
 304
 305         ctf_ipc_add(kcih, &ipc_entry);
 306
 307         /* Update the attributes flag to indicate a CTF conn */
 308         ct->ctf_flags |= (CTF_FLAGS_CACHED | (1 << dir));
 309
 310 }
 311 #ifdef CONFIG_BCM_NAT_MODULE
 312 EXPORT_SYMBOL(ip_conntrack_ipct_add);
 313 #endif
 314
 315 int
 316 ip_conntrack_ipct_delete(struct nf_conn *ct, int ct_timeout)
 317 {
 318         ctf_ipc_t *ipct;
 319         struct nf_conntrack_tuple *orig, *repl;
 320
 321         if (!CTF_ENAB(kcih))
 322                 return (0);
 323
 324         orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 325
 326         if ((orig->dst.protonum != IPPROTO_TCP) && (orig->dst.protonum != IPPROTO_UDP))
 327                 return (0);
 328
 329         repl = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
 330
 331         /* If the refresh counter of ipc entry is non zero, it indicates
 332          * that the packet transfer is active and we should not delete
 333          * the conntrack entry.
 334          */
 335         if (ct_timeout) {
 336                 ipct = ctf_ipc_lkup(kcih, orig->src.u3.ip, orig->dst.u3.ip,
 337                                     orig->dst.protonum, orig->src.u.tcp.port,
 338                                     orig->dst.u.tcp.port);
 339
 340                 /* Postpone the deletion of ct entry if there are frames
 341                  * flowing in this direction.
 342                  */
 343                 if ((ipct != NULL) && (ipct->live > 0)) {
 344                         ipct->live = 0;
 345                         ct->timeout.expires = jiffies + ct->expire_jiffies;
 346                         add_timer(&ct->timeout);
 347                         return (-1);
 348                 }
 349
 350                 ipct = ctf_ipc_lkup(kcih, repl->src.u3.ip, repl->dst.u3.ip,
 351                                     repl->dst.protonum, repl->src.u.tcp.port,
 352                                     repl->dst.u.tcp.port);
 353
 354                 if ((ipct != NULL) && (ipct->live > 0)) {
 355                         ipct->live = 0;
 356                         ct->timeout.expires = jiffies + ct->expire_jiffies;
 357                         add_timer(&ct->timeout);
 358                         return (-1);
 359                 }
 360         }
 361
 362         /* If there are no packets over this connection for timeout period
 363          * delete the entries.
 364          */
 365         ctf_ipc_delete(kcih, orig->src.u3.ip, orig->dst.u3.ip, orig->dst.protonum,
 366                        orig->src.u.tcp.port, orig->dst.u.tcp.port);
 367
 368         ctf_ipc_delete(kcih, repl->src.u3.ip, repl->dst.u3.ip, repl->dst.protonum,
 369                        repl->src.u.tcp.port, repl->dst.u.tcp.port);
 370
 371 #ifdef DEBUG
 372         printk("%s: Deleting the tuple %x %x %d %d %d\n",
 373                __FUNCTION__, orig->src.u3.ip, orig->dst.u3.ip, orig->dst.protonum,
 374                orig->src.u.tcp.port, orig->dst.u.tcp.port);
 375         printk("%s: Deleting the tuple %x %x %d %d %d\n",
 376                __FUNCTION__, repl->dst.u3.ip, repl->src.u3.ip, repl->dst.protonum,
 377                repl->dst.u.tcp.port, repl->src.u.tcp.port);
 378 #endif
 379
 380         return (0);
 381 }
 382 #endif /* HNDCTF */
 383
 384 /* Noone using conntrack by the time this called. */
 385 static void nf_nat_cleanup_conntrack(struct nf_conn *conn)
 386 {
 387         struct nf_conn_nat *nat;
 388         if (!(conn->status & IPS_NAT_DONE_MASK))
 389                 return;
 390
 391         nat = nfct_nat(conn);
 392         write_lock_bh(&nf_nat_lock);
 393         list_del(&nat->info.bysource);
 394         write_unlock_bh(&nf_nat_lock);
 395
 396         /* Detach from cone list */
 397         ipt_cone_cleanup_conntrack(nat);
 398 }
 399
 400 /* Is this tuple already taken? (not by us) */
 401 int
 402 nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
 403                   const struct nf_conn *ignored_conntrack)
 404 {
 405         /* Conntrack tracking doesn't keep track of outgoing tuples; only
 406            incoming ones.  NAT means they don't have a fixed mapping,
 407            so we invert the tuple and look for the incoming reply.
 408
 409            We could keep a separate hash if this proves too slow. */
 410         struct nf_conntrack_tuple reply;
 411
 412         nf_ct_invert_tuplepr(&reply, tuple);
 413         return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
 414 }
 415 EXPORT_SYMBOL(nf_nat_used_tuple);
 416
 417 /* If we source map this tuple so reply looks like reply_tuple, will
 418  * that meet the constraints of range. */
 419 static int
 420 in_range(const struct nf_conntrack_tuple *tuple,
 421          const struct nf_nat_range *range)
 422 {
 423         struct nf_nat_protocol *proto;
 424         int ret = 0;
 425
 426         /* If we are supposed to map IPs, then we must be in the
 427            range specified, otherwise let this drag us onto a new src IP. */
 428         if (range->flags & IP_NAT_RANGE_MAP_IPS) {
 429                 if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) ||
 430                     ntohl(tuple->src.u3.ip) > ntohl(range->max_ip))
 431                         return 0;
 432         }
 433
 434         rcu_read_lock();
 435         proto = __nf_nat_proto_find(tuple->dst.protonum);
 436         if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
 437             proto->in_range(tuple, IP_NAT_MANIP_SRC,
 438                             &range->min, &range->max))
 439                 ret = 1;
 440         rcu_read_unlock();
 441
 442         return ret;
 443 }
 444
 445 static inline int
 446 same_src(const struct nf_conn *ct,
 447          const struct nf_conntrack_tuple *tuple)
 448 {
 449         const struct nf_conntrack_tuple *t;
 450
 451         t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 452         return (t->dst.protonum == tuple->dst.protonum &&
 453                 t->src.u3.ip == tuple->src.u3.ip &&
 454                 t->src.u.all == tuple->src.u.all);
 455 }
 456
 457 /* Only called for SRC manip */
 458 static int
 459 find_appropriate_src(const struct nf_conntrack_tuple *tuple,
 460                      struct nf_conntrack_tuple *result,
 461                      const struct nf_nat_range *range)
 462 {
 463         unsigned int h = hash_by_src(tuple);
 464         struct nf_conn_nat *nat;
 465         struct nf_conn *ct;
 466
 467         read_lock_bh(&nf_nat_lock);
 468         list_for_each_entry(nat, &bysource[h], info.bysource) {
 469                 ct = (struct nf_conn *)((char *)nat - offsetof(struct nf_conn, data));
 470                 if (same_src(ct, tuple)) {
 471                         /* Copy source part from reply tuple. */
 472                         nf_ct_invert_tuplepr(result,
 473                                        &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 474                         result->dst = tuple->dst;
 475
 476                         if (in_range(result, range)) {
 477                                 read_unlock_bh(&nf_nat_lock);
 478                                 return 1;
 479                         }
 480                 }
 481         }
 482         read_unlock_bh(&nf_nat_lock);
 483         return 0;
 484 }
 485
 486 /* For [FUTURE] fragmentation handling, we want the least-used
 487    src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 488    if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 489    1-65535, we don't do pro-rata allocation based on ports; we choose
 490    the ip with the lowest src-ip/dst-ip/proto usage.
 491 */
 492 static void
 493 find_best_ips_proto(struct nf_conntrack_tuple *tuple,
 494                     const struct nf_nat_range *range,
 495                     const struct nf_conn *ct,
 496                     enum nf_nat_manip_type maniptype)
 497 {
 498         __be32 *var_ipp;
 499         /* Host order */
 500         u_int32_t minip, maxip, j;
 501
 502         /* No IP mapping?  Do nothing. */
 503         if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
 504                 return;
 505
 506         if (maniptype == IP_NAT_MANIP_SRC)
 507                 var_ipp = &tuple->src.u3.ip;
 508         else
 509                 var_ipp = &tuple->dst.u3.ip;
 510
 511         /* Fast path: only one choice. */
 512         if (range->min_ip == range->max_ip) {
 513                 *var_ipp = range->min_ip;
 514                 return;
 515         }
 516
 517         /* Hashing source and destination IPs gives a fairly even
 518          * spread in practice (if there are a small number of IPs
 519          * involved, there usually aren't that many connections
 520          * anyway).  The consistency means that servers see the same
 521          * client coming from the same IP (some Internet Banking sites
 522          * like this), even across reboots. */
 523         minip = ntohl(range->min_ip);
 524         maxip = ntohl(range->max_ip);
 525         j = jhash_2words((__force u32)tuple->src.u3.ip,
 526                          (__force u32)tuple->dst.u3.ip, 0);
 527         j = ((u64)j * (maxip - minip + 1)) >> 32;
 528         *var_ipp = htonl(minip + j);
 529 }
 530
 531 /* Manipulate the tuple into the range given.  For NF_IP_POST_ROUTING,
 532  * we change the source to map into the range.  For NF_IP_PRE_ROUTING
 533  * and NF_IP_LOCAL_OUT, we change the destination to map into the
 534  * range.  It might not be possible to get a unique tuple, but we try.
 535  * At worst (or if we race), we will end up with a final duplicate in
 536  * __ip_conntrack_confirm and drop the packet. */
 537 static void
 538 get_unique_tuple(struct nf_conntrack_tuple *tuple,
 539                  const struct nf_conntrack_tuple *orig_tuple,
 540                  const struct nf_nat_range *range,
 541                  struct nf_conn *ct,
 542                  enum nf_nat_manip_type maniptype)
 543 {
 544         struct nf_nat_protocol *proto;
 545
 546         /* 1) If this srcip/proto/src-proto-part is currently mapped,
 547            and that same mapping gives a unique tuple within the given
 548            range, use that.
 549
 550            This is only required for source (ie. NAT/masq) mappings.
 551            So far, we don't do local source mappings, so multiple
 552            manips not an issue.  */
 553         if (maniptype == IP_NAT_MANIP_SRC) {
 554                 if (find_appropriate_src(orig_tuple, tuple, range)) {
 555                         DEBUGP("get_unique_tuple: Found current src map\n");
 556                         if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
 557                                 if (!nf_nat_used_tuple(tuple, ct))
 558                                         return;
 559                 }
 560         }
 561
 562         /* 2) Select the least-used IP/proto combination in the given
 563            range. */
 564         *tuple = *orig_tuple;
 565         find_best_ips_proto(tuple, range, ct, maniptype);
 566
 567         /* 3) The per-protocol part of the manip is made to map into
 568            the range to make a unique tuple. */
 569
 570         rcu_read_lock();
 571         proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
 572
 573         /* Only bother mapping if it's not already in range and unique */
 574         if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
 575                 if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
 576                         if (proto->in_range(tuple, maniptype, &range->min,
 577                                             &range->max) &&
 578                             (range->min.all == range->max.all ||
 579                              !nf_nat_used_tuple(tuple, ct)))
 580                                 goto out;
 581                 } else if (!nf_nat_used_tuple(tuple, ct)) {
 582                         goto out;
 583                 }
 584         }
 585
 586         /* Last change: get protocol to try to obtain unique tuple. */
 587         proto->unique_tuple(tuple, range, maniptype, ct);
 588 out:
 589         rcu_read_unlock();
 590 }
 591
 592 unsigned int
 593 nf_nat_setup_info(struct nf_conn *ct,
 594                   const struct nf_nat_range *range,
 595                   unsigned int hooknum)
 596 {
 597         struct nf_conntrack_tuple curr_tuple, new_tuple;
 598         struct nf_conn_nat *nat = nfct_nat(ct);
 599         struct nf_nat_info *info = &nat->info;
 600         int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
 601         enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
 602
 603         NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
 604                      hooknum == NF_IP_POST_ROUTING ||
 605                      hooknum == NF_IP_LOCAL_IN ||
 606                      hooknum == NF_IP_LOCAL_OUT);
 607         BUG_ON(nf_nat_initialized(ct, maniptype));
 608
 609         /* What we've got will look like inverse of reply. Normally
 610            this is what is in the conntrack, except for prior
 611            manipulations (future optimization: if num_manips == 0,
 612            orig_tp =
 613            conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
 614         nf_ct_invert_tuplepr(&curr_tuple,
 615                              &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 616
 617         get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
 618
 619         if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
 620                 struct nf_conntrack_tuple reply;
 621
 622                 /* Alter conntrack table so will recognize replies. */
 623                 nf_ct_invert_tuplepr(&reply, &new_tuple);
 624                 nf_conntrack_alter_reply(ct, &reply);
 625
 626                 /* Non-atomic: we own this at the moment. */
 627                 if (maniptype == IP_NAT_MANIP_SRC)
 628                         ct->status |= IPS_SRC_NAT;
 629                 else
 630                         ct->status |= IPS_DST_NAT;
 631         }
 632
 633         /* Place in source hash if this is the first time. */
 634         if (have_to_hash) {
 635                 unsigned int srchash;
 636
 637                 srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 638                 write_lock_bh(&nf_nat_lock);
 639                 list_add(&info->bysource, &bysource[srchash]);
 640                 write_unlock_bh(&nf_nat_lock);
 641         }
 642
 643         /* It's done. */
 644         if (maniptype == IP_NAT_MANIP_DST)
 645                 set_bit(IPS_DST_NAT_DONE_BIT, &ct->status);
 646         else
 647                 set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
 648
 649         return NF_ACCEPT;
 650 }
 651 EXPORT_SYMBOL(nf_nat_setup_info);
 652
 653 /* Returns true if succeeded. */
 654 static int
 655 manip_pkt(u_int16_t proto,
 656           struct sk_buff *skb,
 657           unsigned int iphdroff,
 658           const struct nf_conntrack_tuple *target,
 659           enum nf_nat_manip_type maniptype)
 660 {
 661         struct iphdr *iph;
 662         struct nf_nat_protocol *p;
 663
 664         if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
 665                 return 0;
 666
 667         iph = (void *)skb->data + iphdroff;
 668
 669         /* Manipulate protcol part. */
 670
 671         /* rcu_read_lock()ed by nf_hook_slow */
 672         p = __nf_nat_proto_find(proto);
 673         if (!p->manip_pkt(skb, iphdroff, target, maniptype))
 674                 return 0;
 675
 676         iph = (void *)skb->data + iphdroff;
 677
 678         if (maniptype == IP_NAT_MANIP_SRC) {
 679                 nf_csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
 680                 iph->saddr = target->src.u3.ip;
 681         } else {
 682                 nf_csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
 683                 iph->daddr = target->dst.u3.ip;
 684         }
 685         return 1;
 686 }
 687
 688 #if defined(CONFIG_BCM_NAT) || defined(CONFIG_BCM_NAT_MODULE)
 689 #ifndef CONFIG_BCM_NAT_MODULE
 690 inline
 691 #endif
 692 int bcm_manip_pkt(u_int16_t proto,
 693           struct sk_buff *skb,
 694           unsigned int iphdroff,
 695           const struct nf_conntrack_tuple *target,
 696           enum nf_nat_manip_type maniptype)
 697 {
 698         return manip_pkt(proto, skb, iphdroff, target, maniptype);
 699 }
 700 #ifdef CONFIG_BCM_NAT_MODULE
 701 EXPORT_SYMBOL(bcm_manip_pkt);
 702 #endif
 703 #endif
 704
 705 /* Do packet manipulations according to nf_nat_setup_info. */
 706 unsigned int nf_nat_packet(struct nf_conn *ct,
 707                            enum ip_conntrack_info ctinfo,
 708                            unsigned int hooknum,
 709                            struct sk_buff *skb)
 710 {
 711         enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 712         unsigned long statusbit;
 713         enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
 714
 715         if (mtype == IP_NAT_MANIP_SRC)
 716                 statusbit = IPS_SRC_NAT;
 717         else
 718                 statusbit = IPS_DST_NAT;
 719
 720         /* Invert if this is reply dir. */
 721         if (dir == IP_CT_DIR_REPLY)
 722                 statusbit ^= IPS_NAT_MASK;
 723
 724         /* Non-atomic: these bits don't change. */
 725         if (ct->status & statusbit) {
 726                 struct nf_conntrack_tuple target;
 727
 728                 /* We are aiming to look like inverse of other direction. */
 729                 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
 730 #ifdef HNDCTF
 731                 ip_conntrack_ipct_add(skb, hooknum, ct, ctinfo, &target);
 732 #endif /* HNDCTF */
 733                 if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
 734                         return NF_DROP;
 735         } else {
 736 #ifdef HNDCTF
 737 #endif /* HNDCTF */
 738         }
 739
 740         return NF_ACCEPT;
 741 }
 742 EXPORT_SYMBOL_GPL(nf_nat_packet);
 743
 744 /* Dir is direction ICMP is coming from (opposite to packet it contains) */
 745 int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 746                                   enum ip_conntrack_info ctinfo,
 747                                   unsigned int hooknum,
 748                                   struct sk_buff *skb)
 749 {
 750         struct {
 751                 struct icmphdr icmp;
 752                 struct iphdr ip;
 753         } *inside;
 754         struct nf_conntrack_l4proto *l4proto;
 755         struct nf_conntrack_tuple inner, target;
 756         int hdrlen = ip_hdrlen(skb);
 757         enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 758         unsigned long statusbit;
 759         enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
 760
 761         if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
 762                 return 0;
 763
 764         inside = (void *)skb->data + hdrlen;
 765
 766         /* We're actually going to mangle it beyond trivial checksum
 767            adjustment, so make sure the current checksum is correct. */
 768         if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
 769                 return 0;
 770
 771         /* Must be RELATED */
 772         NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
 773                      skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
 774
 775         /* Redirects on non-null nats must be dropped, else they'll
 776            start talking to each other without our translation, and be
 777            confused... --RR */
 778         if (inside->icmp.type == ICMP_REDIRECT) {
 779                 /* If NAT isn't finished, assume it and drop. */
 780                 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
 781                         return 0;
 782
 783                 if (ct->status & IPS_NAT_MASK)
 784                         return 0;
 785         }
 786
 787         DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
 788                skb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
 789
 790         /* rcu_read_lock()ed by nf_hook_slow */
 791         l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
 792
 793         if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
 794                              (hdrlen +
 795                               sizeof(struct icmphdr) + inside->ip.ihl * 4),
 796                              (u_int16_t)AF_INET, inside->ip.protocol,
 797                              &inner, l3proto, l4proto))
 798                 return 0;
 799
 800         /* Change inner back to look like incoming packet.  We do the
 801            opposite manip on this hook to normal, because it might not
 802            pass all hooks (locally-generated ICMP).  Consider incoming
 803            packet: PREROUTING (DST manip), routing produces ICMP, goes
 804            through POSTROUTING (which must correct the DST manip). */
 805         if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
 806                        &ct->tuplehash[!dir].tuple, !manip))
 807                 return 0;
 808
 809         if (skb->ip_summed != CHECKSUM_PARTIAL) {
 810                 /* Reloading "inside" here since manip_pkt inner. */
 811                 inside = (void *)skb->data + hdrlen;
 812                 inside->icmp.checksum = 0;
 813                 inside->icmp.checksum =
 814                         csum_fold(skb_checksum(skb, hdrlen,
 815                                                skb->len - hdrlen, 0));
 816         }
 817
 818         /* Change outer to look the reply to an incoming packet
 819          * (proto 0 means don't invert per-proto part). */
 820         if (manip == IP_NAT_MANIP_SRC)
 821                 statusbit = IPS_SRC_NAT;
 822         else
 823                 statusbit = IPS_DST_NAT;
 824
 825         /* Invert if this is reply dir. */
 826         if (dir == IP_CT_DIR_REPLY)
 827                 statusbit ^= IPS_NAT_MASK;
 828
 829         if (ct->status & statusbit) {
 830                 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
 831                 if (!manip_pkt(0, skb, 0, &target, manip))
 832                         return 0;
 833         }
 834
 835         return 1;
 836 }
 837 EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
 838
 839 /* Protocol registration. */
 840 int nf_nat_protocol_register(struct nf_nat_protocol *proto)
 841 {
 842         int ret = 0;
 843
 844         write_lock_bh(&nf_nat_lock);
 845         if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) {
 846                 ret = -EBUSY;
 847                 goto out;
 848         }
 849         rcu_assign_pointer(nf_nat_protos[proto->protonum], proto);
 850  out:
 851         write_unlock_bh(&nf_nat_lock);
 852         return ret;
 853 }
 854 EXPORT_SYMBOL(nf_nat_protocol_register);
 855
 856 /* Noone stores the protocol anywhere; simply delete it. */
 857 void nf_nat_protocol_unregister(struct nf_nat_protocol *proto)
 858 {
 859         write_lock_bh(&nf_nat_lock);
 860         rcu_assign_pointer(nf_nat_protos[proto->protonum],
 861                            &nf_nat_unknown_protocol);
 862         write_unlock_bh(&nf_nat_lock);
 863         synchronize_rcu();
 864 }
 865 EXPORT_SYMBOL(nf_nat_protocol_unregister);
 866
 867 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
 868 int
 869 nf_nat_port_range_to_nfattr(struct sk_buff *skb,
 870                             const struct nf_nat_range *range)
 871 {
 872         NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16),
 873                 &range->min.tcp.port);
 874         NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16),
 875                 &range->max.tcp.port);
 876
 877         return 0;
 878
 879 nfattr_failure:
 880         return -1;
 881 }
 882 EXPORT_SYMBOL_GPL(nf_nat_port_nfattr_to_range);
 883
 884 int
 885 nf_nat_port_nfattr_to_range(struct nfattr *tb[], struct nf_nat_range *range)
 886 {
 887         int ret = 0;
 888
 889         /* we have to return whether we actually parsed something or not */
 890
 891         if (tb[CTA_PROTONAT_PORT_MIN-1]) {
 892                 ret = 1;
 893                 range->min.tcp.port =
 894                         *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
 895         }
 896
 897         if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
 898                 if (ret)
 899                         range->max.tcp.port = range->min.tcp.port;
 900         } else {
 901                 ret = 1;
 902                 range->max.tcp.port =
 903                         *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
 904         }
 905
 906         return ret;
 907 }
 908 EXPORT_SYMBOL_GPL(nf_nat_port_range_to_nfattr);
 909 #endif
 910
 911 static int __init nf_nat_init(void)
 912 {
 913         size_t i;
 914
 915         /* Leave them the same for the moment. */
 916         nf_nat_htable_size = nf_conntrack_htable_size;
 917
 918         /* One vmalloc for both hash tables */
 919         bysource = vmalloc(sizeof(struct list_head) * nf_nat_htable_size);
 920         if (!bysource)
 921                 return -ENOMEM;
 922
 923         /* Sew in builtin protocols. */
 924         write_lock_bh(&nf_nat_lock);
 925         for (i = 0; i < MAX_IP_NAT_PROTO; i++)
 926                 rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol);
 927         rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
 928         rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
 929         rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
 930         write_unlock_bh(&nf_nat_lock);
 931
 932         for (i = 0; i < nf_nat_htable_size; i++) {
 933                 INIT_LIST_HEAD(&bysource[i]);
 934         }
 935
 936         /* FIXME: Man, this is a hack.  <SIGH> */
 937         NF_CT_ASSERT(rcu_dereference(nf_conntrack_destroyed) == NULL);
 938         rcu_assign_pointer(nf_conntrack_destroyed, nf_nat_cleanup_conntrack);
 939
 940         NF_CT_ASSERT(rcu_dereference(nf_ct_nat_offset) == NULL);
 941         rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset);
 942
 943         /* Initialize fake conntrack so that NAT will skip it */
 944         nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
 945
 946         l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
 947         return 0;
 948 }
 949
 950 /* Clear NAT section of all conntracks, in case we're loaded again. */
 951 static int clean_nat(struct nf_conn *i, void *data)
 952 {
 953         struct nf_conn_nat *nat = nfct_nat(i);
 954
 955         if (!nat)
 956                 return 0;
 957         memset(nat, 0, sizeof(*nat));
 958         i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
 959         return 0;
 960 }
 961
 962 static void __exit nf_nat_cleanup(void)
 963 {
 964         nf_ct_iterate_cleanup(&clean_nat, NULL);
 965         rcu_assign_pointer(nf_conntrack_destroyed, NULL);
 966         rcu_assign_pointer(nf_ct_nat_offset, NULL);
 967         synchronize_rcu();
 968         vfree(bysource);
 969         nf_ct_l3proto_put(l3proto);
 970 }
 971
 972 MODULE_LICENSE("GPL");
 973
 974 module_init(nf_nat_init);
 975 module_exit(nf_nat_cleanup);