tomato.git: release/src-rt-6.x.4708/linux/linux-2.6.36/net/netfilter/nf_conntrack_core.c
1 /* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */
2 /* Connection state tracking for netfilter. This is separated from,
3 but required by, the NAT layer; it can also be used by an iptables
4 extension. */
6 /* (C) 1999-2001 Paul `Rusty' Russell
7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
15 #include <linux/types.h>
16 #include <linux/netfilter.h>
17 #include <linux/module.h>
18 #include <linux/sched.h>
19 #include <linux/skbuff.h>
20 #include <linux/proc_fs.h>
21 #include <linux/vmalloc.h>
22 #include <linux/stddef.h>
23 #include <linux/slab.h>
24 #include <linux/random.h>
25 #include <linux/jhash.h>
26 #include <linux/err.h>
27 #include <linux/percpu.h>
28 #include <linux/moduleparam.h>
29 #include <linux/notifier.h>
30 #include <linux/kernel.h>
31 #include <linux/netdevice.h>
32 #include <linux/socket.h>
33 #include <linux/mm.h>
34 #include <linux/nsproxy.h>
35 #include <linux/rculist_nulls.h>
37 #include <net/netfilter/nf_conntrack.h>
38 #include <net/netfilter/nf_conntrack_l3proto.h>
39 #include <net/netfilter/nf_conntrack_l4proto.h>
40 #include <net/netfilter/nf_conntrack_expect.h>
41 #include <net/netfilter/nf_conntrack_helper.h>
42 #include <net/netfilter/nf_conntrack_core.h>
43 #include <net/netfilter/nf_conntrack_extend.h>
44 #include <net/netfilter/nf_conntrack_acct.h>
45 #include <net/netfilter/nf_conntrack_ecache.h>
46 #include <net/netfilter/nf_conntrack_zones.h>
47 #include <net/netfilter/nf_nat.h>
48 #include <net/netfilter/nf_nat_core.h>
50 #define NF_CONNTRACK_VERSION "0.5.0"
52 #ifdef HNDCTF
53 #include <linux/if.h>
54 #include <linux/if_vlan.h>
55 #include <linux/if_pppox.h>
56 #include <linux/in.h>
57 #include <linux/ip.h>
58 #include <linux/tcp.h>
60 #ifdef CONFIG_IPV6
61 #include <linux/ipv6.h>
62 #include <net/ipv6.h>
63 #include <net/ip6_route.h>
64 #define IPVERSION_IS_4(ipver) ((ipver) == 4)
65 #else
66 #define IPVERSION_IS_4(ipver) 1
67 #endif /* CONFIG_IPV6 */
69 #include <net/ip.h>
70 #include <net/route.h>
71 #include <typedefs.h>
72 #include <osl.h>
73 #include <ctf/hndctf.h>
75 #define NFC_CTF_ENABLED (1 << 31)
76 #endif /* HNDCTF */
78 int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
79 enum nf_nat_manip_type manip,
80 const struct nlattr *attr) __read_mostly;
81 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
83 DEFINE_SPINLOCK(nf_conntrack_lock);
84 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
86 unsigned int nf_conntrack_htable_size __read_mostly;
87 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
89 unsigned int nf_conntrack_max __read_mostly;
90 EXPORT_SYMBOL_GPL(nf_conntrack_max);
92 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
93 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
95 #ifdef HNDCTF
97 * Display an IP address in readable format.
99 bool
100 ip_conntrack_is_ipc_allowed(struct sk_buff *skb, u_int32_t hooknum)
102 struct net_device *dev;
104 if (!CTF_ENAB(kcih))
105 return FALSE;
107 if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_POST_ROUTING) {
108 dev = skb->dev;
109 if (dev->priv_flags & IFF_802_1Q_VLAN)
110 dev = vlan_dev_real_dev(dev);
112 /* Add ipc entry if packet is received on ctf enabled interface
113 * and the packet is not a defrag'd one.
115 if (ctf_isenabled(kcih, dev) && (skb->len <= dev->mtu))
116 skb->nfcache |= NFC_CTF_ENABLED;
119 /* Add the cache entries only if the device has registered and
120 * enabled ctf.
122 if (skb->nfcache & NFC_CTF_ENABLED)
123 return TRUE;
125 return FALSE;
128 void
129 ip_conntrack_ipct_add(struct sk_buff *skb, u_int32_t hooknum,
130 struct nf_conn *ct, enum ip_conntrack_info ci,
131 struct nf_conntrack_tuple *manip)
133 ctf_ipc_t ipc_entry;
134 struct hh_cache *hh;
135 struct ethhdr *eth;
136 struct iphdr *iph;
137 struct tcphdr *tcph;
138 struct rtable *rt;
139 struct nf_conn_help *help;
140 enum ip_conntrack_dir dir;
141 uint8 ipver, protocol;
142 #ifdef CONFIG_IPV6
143 struct ipv6hdr *ip6h = NULL;
144 #endif /* CONFIG_IPV6 */
145 uint32 nud_flags;
147 if ((skb == NULL) || (ct == NULL))
148 return;
150 /* Check CTF enabled */
151 if (!ip_conntrack_is_ipc_allowed(skb, hooknum))
152 return;
153 /* We only add cache entires for non-helper connections and at
154 * pre or post routing hooks.
156 help = nfct_help(ct);
157 if ((help && help->helper) || (ct->ctf_flags & CTF_FLAGS_EXCLUDED) ||
158 ((hooknum != NF_INET_PRE_ROUTING) && (hooknum != NF_INET_POST_ROUTING)))
159 return;
161 iph = ip_hdr(skb);
162 ipver = iph->version;
164 /* Support both IPv4 and IPv6 */
165 if (ipver == 4) {
166 tcph = ((struct tcphdr *)(((__u8 *)iph) + (iph->ihl << 2)));
167 protocol = iph->protocol;
169 #ifdef CONFIG_IPV6
170 else if (ipver == 6) {
171 ip6h = (struct ipv6hdr *)iph;
172 tcph = (struct tcphdr *)ctf_ipc_lkup_l4proto(kcih, ip6h, &protocol);
173 if (tcph == NULL)
174 return;
176 #endif /* CONFIG_IPV6 */
177 else
178 return;
180 /* Only TCP and UDP are supported */
181 if (protocol == IPPROTO_TCP) {
182 /* Add ipc entries for connections in established state only */
183 if ((ci != IP_CT_ESTABLISHED) && (ci != (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)))
184 return;
186 if (ct->proto.tcp.state >= TCP_CONNTRACK_FIN_WAIT &&
187 ct->proto.tcp.state <= TCP_CONNTRACK_TIME_WAIT)
188 return;
190 else if (protocol != IPPROTO_UDP)
191 return;
193 dir = CTINFO2DIR(ci);
194 if (ct->ctf_flags & (1 << dir))
195 return;
197 /* Do route lookup for alias address if we are doing DNAT in this
198 * direction.
200 if (skb_dst(skb) == NULL) {
201 /* Find the destination interface */
202 if (IPVERSION_IS_4(ipver)) {
203 u_int32_t daddr;
205 if ((manip != NULL) && (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST))
206 daddr = manip->dst.u3.ip;
207 else
208 daddr = iph->daddr;
209 ip_route_input(skb, daddr, iph->saddr, iph->tos, skb->dev);
211 #ifdef CONFIG_IPV6
212 else
213 ip6_route_input(skb);
214 #endif /* CONFIG_IPV6 */
217 /* Ensure the packet belongs to a forwarding connection and it is
218 * destined to an unicast address.
220 rt = (struct rtable *)skb_dst(skb);
222 nud_flags = NUD_PERMANENT | NUD_REACHABLE | NUD_STALE | NUD_DELAY | NUD_PROBE;
223 #ifdef CTF_PPPOE
224 if ((skb_dst(skb) != NULL) && (skb_dst(skb)->dev != NULL) &&
225 (skb_dst(skb)->dev->flags & IFF_POINTOPOINT))
226 nud_flags |= NUD_NOARP;
227 #endif
229 if ((rt == NULL) || (
230 #ifdef CONFIG_IPV6
231 !IPVERSION_IS_4(ipver) ?
232 ((rt->dst.input != ip6_forward) ||
233 !(ipv6_addr_type(&ip6h->daddr) & IPV6_ADDR_UNICAST)) :
234 #endif /* CONFIG_IPV6 */
235 ((rt->dst.input != ip_forward) || (rt->rt_type != RTN_UNICAST))) ||
236 (rt->dst.neighbour == NULL) ||
237 ((rt->dst.neighbour->nud_state & nud_flags) == 0))
238 return;
240 memset(&ipc_entry, 0, sizeof(ipc_entry));
242 /* Init the neighboring sender address */
243 memcpy(ipc_entry.sa.octet, eth_hdr(skb)->h_source, ETH_ALEN);
245 /* If the packet is received on a bridge device then save
246 * the bridge cache entry pointer in the ip cache entry.
247 * This will be referenced in the data path to update the
248 * live counter of brc entry whenever a received packet
249 * matches corresponding ipc entry matches.
251 if ((skb->dev != NULL) && ctf_isbridge(kcih, skb->dev))
252 ipc_entry.brcp = ctf_brc_lkup(kcih, eth_hdr(skb)->h_source);
254 hh = skb_dst(skb)->hh;
255 if (hh != NULL) {
256 eth = (struct ethhdr *)(((unsigned char *)hh->hh_data) + 2);
257 memcpy(ipc_entry.dhost.octet, eth->h_dest, ETH_ALEN);
258 memcpy(ipc_entry.shost.octet, eth->h_source, ETH_ALEN);
259 } else {
260 memcpy(ipc_entry.dhost.octet, rt->dst.neighbour->ha, ETH_ALEN);
261 memcpy(ipc_entry.shost.octet, skb_dst(skb)->dev->dev_addr, ETH_ALEN);
264 /* Add ctf ipc entry for this direction */
265 if (IPVERSION_IS_4(ipver)) {
266 ipc_entry.tuple.sip[0] = iph->saddr;
267 ipc_entry.tuple.dip[0] = iph->daddr;
268 #ifdef CONFIG_IPV6
269 } else {
270 memcpy(ipc_entry.tuple.sip, &ip6h->saddr, sizeof(ipc_entry.tuple.sip));
271 memcpy(ipc_entry.tuple.dip, &ip6h->daddr, sizeof(ipc_entry.tuple.dip));
272 #endif /* CONFIG_IPV6 */
274 ipc_entry.tuple.proto = protocol;
275 ipc_entry.tuple.sp = tcph->source;
276 ipc_entry.tuple.dp = tcph->dest;
278 ipc_entry.next = NULL;
280 /* For vlan interfaces fill the vlan id and the tag/untag actions */
282 if(!CTFQOS_ULDL_DIFFIF(kcih)){
283 if (skb_dst(skb)->dev->priv_flags & IFF_802_1Q_VLAN) {
284 ipc_entry.txif = (void *)vlan_dev_real_dev(skb_dst(skb)->dev);
285 ipc_entry.vid = vlan_dev_vlan_id(skb_dst(skb)->dev);
286 ipc_entry.action = ((vlan_dev_vlan_flags(skb_dst(skb)->dev) & 1) ?
287 CTF_ACTION_TAG : CTF_ACTION_UNTAG);
288 } else {
289 ipc_entry.txif = skb_dst(skb)->dev;
290 ipc_entry.action = CTF_ACTION_UNTAG;
293 else{
294 ipc_entry.txif = skb_dst(skb)->dev;
295 ipc_entry.action = CTF_ACTION_UNTAG;
297 #ifdef CTF_PPPOE
298 const char *vars = NULL, *dev_name = NULL;
300 /* For pppoe interfaces fill the session id and header add/del actions */
301 if (skb_dst(skb)->dev->flags & IFF_POINTOPOINT) {
302 /* Transmit interface and sid will be populated by pppoe module */
303 ipc_entry.ppp_ifp = skb_dst(skb)->dev;
304 dev_name = skb_dst(skb)->dev->name;
305 } else if (skb->dev->flags & IFF_POINTOPOINT) {
306 ipc_entry.ppp_ifp = skb->dev;
307 dev_name = skb->dev->name;
308 } else{
309 ipc_entry.ppp_ifp = NULL;
310 ipc_entry.pppoe_sid = 0xffff;
313 if (ipc_entry.ppp_ifp){
314 struct net_device *pppox_tx_dev=NULL;
315 ctf_ppp_t ctfppp;
318 if (ppp_get_conn_pkt_info(ipc_entry.ppp_ifp,&ctfppp))
319 return;
320 else {
321 if(ctfppp.psk.pppox_protocol == PX_PROTO_OE){
322 if (skb_dst(skb)->dev->flags & IFF_POINTOPOINT) {
323 ipc_entry.action |= CTF_ACTION_PPPOE_ADD;
324 pppox_tx_dev = ctfppp.psk.po->pppoe_dev;
325 memcpy(ipc_entry.dhost.octet, ctfppp.psk.dhost.octet, ETH_ALEN);
326 memcpy(ipc_entry.shost.octet, ctfppp.psk.po->pppoe_dev->dev_addr, ETH_ALEN);
328 else{
329 ipc_entry.action |= CTF_ACTION_PPPOE_DEL;
331 ipc_entry.pppoe_sid = ctfppp.pppox_id;
333 else
334 return;
336 /* For vlan interfaces fill the vlan id and the tag/untag actions */
337 if(pppox_tx_dev){
338 if(!CTFQOS_ULDL_DIFFIF(kcih)){
339 if (pppox_tx_dev ->priv_flags & IFF_802_1Q_VLAN) {
340 ipc_entry.txif = (void *)vlan_dev_real_dev(pppox_tx_dev);
341 ipc_entry.vid = vlan_dev_vlan_id(pppox_tx_dev);
342 ipc_entry.action |= ((vlan_dev_vlan_flags(pppox_tx_dev) & 1) ?
343 CTF_ACTION_TAG : CTF_ACTION_UNTAG);
344 } else {
345 ipc_entry.txif = pppox_tx_dev;
346 ipc_entry.action |= CTF_ACTION_UNTAG;
349 else{
350 ipc_entry.txif = pppox_tx_dev;
351 ipc_entry.action |= CTF_ACTION_UNTAG;
357 #endif /* CTF_PPPOE */
359 if (kcih->ipc_suspend) {
360 /* The default action is suspend */
361 ipc_entry.action |= CTF_ACTION_SUSPEND;
364 /* Copy the DSCP value. ECN bits must be cleared. */
365 if (IPVERSION_IS_4(ipver))
366 ipc_entry.tos = IPV4_TOS(iph);
367 #ifdef CONFIG_IPV6
368 else
369 ipc_entry.tos = IPV6_TRAFFIC_CLASS(ip6h);
370 #endif /* CONFIG_IPV6 */
371 ipc_entry.tos &= IPV4_TOS_DSCP_MASK;
372 if (ipc_entry.tos)
373 ipc_entry.action |= CTF_ACTION_TOS;
375 #ifdef CONFIG_NF_CONNTRACK_MARK
376 /* Initialize the mark for this connection */
377 if (ct->mark != 0) {
378 ipc_entry.mark.value = ct->mark;
379 ipc_entry.action |= CTF_ACTION_MARK;
381 #endif /* CONFIG_NF_CONNTRACK_MARK */
383 /* Update the manip ip and port */
384 if (manip != NULL) {
385 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
386 ipc_entry.nat.ip = manip->src.u3.ip;
387 ipc_entry.nat.port = manip->src.u.tcp.port;
388 ipc_entry.action |= CTF_ACTION_SNAT;
389 } else {
390 ipc_entry.nat.ip = manip->dst.u3.ip;
391 ipc_entry.nat.port = manip->dst.u.tcp.port;
392 ipc_entry.action |= CTF_ACTION_DNAT;
396 /* Do bridge cache lookup to determine outgoing interface
397 * and any vlan tagging actions if needed.
399 if(!CTFQOS_ULDL_DIFFIF(kcih)){
400 if (ctf_isbridge(kcih, ipc_entry.txif)) {
401 ctf_brc_t *brcp;
403 brcp = ctf_brc_lkup(kcih, ipc_entry.dhost.octet);
405 if (brcp == NULL)
406 return;
407 else {
408 ipc_entry.action |= brcp->action;
409 ipc_entry.txif = brcp->txifp;
410 ipc_entry.vid = brcp->vid;
414 else{
415 if (ctf_isbridge(kcih, ipc_entry.txif)) {
416 ctf_brc_t *brcp;
418 brcp = ctf_brc_lkup(kcih, ipc_entry.dhost.octet);
420 if (brcp == NULL)
421 return;
422 else {
423 ipc_entry.action |= brcp->action;
424 if(brcp->txvifp){
425 ipc_entry.txif = brcp->txvifp;
426 ipc_entry.action &= ~CTF_ACTION_TAG;
427 ipc_entry.action |= CTF_ACTION_UNTAG;
429 else
430 ipc_entry.txif = brcp->txifp;
431 ipc_entry.vid = brcp->vid;
437 #ifdef DEBUG
438 if (IPVERSION_IS_4(ipver))
439 printk("%s: Adding ipc entry for [%d]%u.%u.%u.%u:%u - %u.%u.%u.%u:%u\n", __FUNCTION__,
440 ipc_entry.tuple.proto,
441 NIPQUAD(ipc_entry.tuple.sip[0]), ntohs(ipc_entry.tuple.sp),
442 NIPQUAD(ipc_entry.tuple.dip[0]), ntohs(ipc_entry.tuple.dp));
443 #ifdef CONFIG_IPV6
444 else
445 printk("\n%s: Adding ipc entry for [%d]\n"
446 "%08x.%08x.%08x.%08x:%u => %08x.%08x.%08x.%08x:%u\n",
447 __FUNCTION__, ipc_entry.tuple.proto,
448 ntohl(ipc_entry.tuple.sip[0]), ntohl(ipc_entry.tuple.sip[1]),
449 ntohl(ipc_entry.tuple.sip[2]), ntohl(ipc_entry.tuple.sip[3]),
450 ntohs(ipc_entry.tuple.sp),
451 ntohl(ipc_entry.tuple.dip[0]), ntohl(ipc_entry.tuple.dip[1]),
452 ntohl(ipc_entry.tuple.dip[2]), ntohl(ipc_entry.tuple.dip[3]),
453 ntohs(ipc_entry.tuple.dp));
454 #endif /* CONFIG_IPV6 */
455 printk("sa %02x:%02x:%02x:%02x:%02x:%02x\n",
456 ipc_entry.shost.octet[0], ipc_entry.shost.octet[1],
457 ipc_entry.shost.octet[2], ipc_entry.shost.octet[3],
458 ipc_entry.shost.octet[4], ipc_entry.shost.octet[5]);
459 printk("da %02x:%02x:%02x:%02x:%02x:%02x\n",
460 ipc_entry.dhost.octet[0], ipc_entry.dhost.octet[1],
461 ipc_entry.dhost.octet[2], ipc_entry.dhost.octet[3],
462 ipc_entry.dhost.octet[4], ipc_entry.dhost.octet[5]);
463 printk("[%d] vid: %d action %x\n", hooknum, ipc_entry.vid, ipc_entry.action);
464 if (manip != NULL)
465 printk("manip_ip: %u.%u.%u.%u manip_port %u\n",
466 NIPQUAD(ipc_entry.nat.ip), ntohs(ipc_entry.nat.port));
467 printk("txif: %s\n", ((struct net_device *)ipc_entry.txif)->name);
468 if (ipc_entry.ppp_ifp) printk("pppif: %s\n", ((struct net_device *)ipc_entry.ppp_ifp)->name);
469 #endif
471 ctf_ipc_add(kcih, &ipc_entry, !IPVERSION_IS_4(ipver));
473 /* Update the attributes flag to indicate a CTF conn */
474 ct->ctf_flags |= (CTF_FLAGS_CACHED | (1 << dir));
478 ip_conntrack_ipct_delete(struct nf_conn *ct, int ct_timeout)
480 ctf_ipc_t *ipct;
481 struct nf_conntrack_tuple *orig, *repl;
482 ctf_ipc_t orig_ipct, repl_ipct;
483 int ipaddr_sz;
484 bool v6;
486 if (!CTF_ENAB(kcih))
487 return (0);
489 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
491 if ((orig->dst.protonum != IPPROTO_TCP) && (orig->dst.protonum != IPPROTO_UDP))
492 return (0);
494 repl = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
496 #ifdef CONFIG_IPV6
497 v6 = (orig->src.l3num == AF_INET6);
498 ipaddr_sz = (v6) ? sizeof(struct in6_addr) : sizeof(struct in_addr);
499 #else
500 v6 = FALSE;
501 ipaddr_sz = sizeof(struct in_addr);
502 #endif /* CONFIG_IPV6 */
504 memset(&orig_ipct, 0, sizeof(orig_ipct));
505 memcpy(orig_ipct.tuple.sip, &orig->src.u3.ip, ipaddr_sz);
506 memcpy(orig_ipct.tuple.dip, &orig->dst.u3.ip, ipaddr_sz);
507 orig_ipct.tuple.proto = orig->dst.protonum;
508 orig_ipct.tuple.sp = orig->src.u.tcp.port;
509 orig_ipct.tuple.dp = orig->dst.u.tcp.port;
511 memset(&repl_ipct, 0, sizeof(repl_ipct));
512 memcpy(repl_ipct.tuple.sip, &repl->src.u3.ip, ipaddr_sz);
513 memcpy(repl_ipct.tuple.dip, &repl->dst.u3.ip, ipaddr_sz);
514 repl_ipct.tuple.proto = repl->dst.protonum;
515 repl_ipct.tuple.sp = repl->src.u.tcp.port;
516 repl_ipct.tuple.dp = repl->dst.u.tcp.port;
518 /* If the refresh counter of ipc entry is non zero, it indicates
519 * that the packet transfer is active and we should not delete
520 * the conntrack entry.
522 if (ct_timeout) {
523 ipct = ctf_ipc_lkup(kcih, &orig_ipct, v6);
525 /* Postpone the deletion of ct entry if there are frames
526 * flowing in this direction.
528 if ((ipct != NULL) && (ipct->live > 0)) {
529 ipct->live = 0;
530 ct->timeout.expires = jiffies + ct->expire_jiffies;
531 add_timer(&ct->timeout);
532 return (-1);
535 ipct = ctf_ipc_lkup(kcih, &repl_ipct, v6);
537 if ((ipct != NULL) && (ipct->live > 0)) {
538 ipct->live = 0;
539 ct->timeout.expires = jiffies + ct->expire_jiffies;
540 add_timer(&ct->timeout);
541 return (-1);
545 /* If there are no packets over this connection for timeout period
546 * delete the entries.
548 ctf_ipc_delete(kcih, &orig_ipct, v6);
550 ctf_ipc_delete(kcih, &repl_ipct, v6);
552 #ifdef DEBUG
553 printk("%s: Deleting the tuple %x %x %d %d %d\n",
554 __FUNCTION__, orig->src.u3.ip, orig->dst.u3.ip, orig->dst.protonum,
555 orig->src.u.tcp.port, orig->dst.u.tcp.port);
556 printk("%s: Deleting the tuple %x %x %d %d %d\n",
557 __FUNCTION__, repl->dst.u3.ip, repl->src.u3.ip, repl->dst.protonum,
558 repl->dst.u.tcp.port, repl->src.u.tcp.port);
559 #endif
561 return (0);
563 #endif /* HNDCTF */
566 static int nf_conntrack_hash_rnd_initted;
567 static unsigned int nf_conntrack_hash_rnd;
569 static u_int32_t BCMFASTPATH_HOST __hash_conntrack(const struct nf_conntrack_tuple *tuple,
570 u16 zone, unsigned int size, unsigned int rnd)
572 unsigned int n;
573 u_int32_t h;
575 /* The direction must be ignored, so we hash everything up to the
576 * destination ports (which is a multiple of 4) and treat the last
577 * three bytes manually.
579 n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
580 h = jhash2((u32 *)tuple, n,
581 zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
582 tuple->dst.protonum));
584 return ((u64)h * size) >> 32;
587 static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
588 const struct nf_conntrack_tuple *tuple)
590 return __hash_conntrack(tuple, zone, net->ct.htable_size,
591 nf_conntrack_hash_rnd);
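/*
 * Illustrative note (not part of the original file): __hash_conntrack()
 * maps the 32-bit jhash2() result onto a bucket with a multiply-shift
 * rather than a modulo, since ((u64)h * size) >> 32 == floor(h * size / 2^32)
 * and therefore always falls in [0, size). A hypothetical stand-alone helper
 * showing just that scaling step:
 *
 *	static inline u32 scale_hash_to_bucket(u32 h, u32 size)
 *	{
 *		return ((u64)h * size) >> 32;
 *	}
 *
 * e.g. h = 0x80000000 with size = 4096 selects bucket 2048.
 */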
594 bool
595 nf_ct_get_tuple(const struct sk_buff *skb,
596 unsigned int nhoff,
597 unsigned int dataoff,
598 u_int16_t l3num,
599 u_int8_t protonum,
600 struct nf_conntrack_tuple *tuple,
601 const struct nf_conntrack_l3proto *l3proto,
602 const struct nf_conntrack_l4proto *l4proto)
604 memset(tuple, 0, sizeof(*tuple));
606 tuple->src.l3num = l3num;
607 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
608 return false;
610 tuple->dst.protonum = protonum;
611 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
613 return l4proto->pkt_to_tuple(skb, dataoff, tuple);
615 EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
617 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
618 u_int16_t l3num, struct nf_conntrack_tuple *tuple)
620 struct nf_conntrack_l3proto *l3proto;
621 struct nf_conntrack_l4proto *l4proto;
622 unsigned int protoff;
623 u_int8_t protonum;
624 int ret;
626 rcu_read_lock();
628 l3proto = __nf_ct_l3proto_find(l3num);
629 ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
630 if (ret != NF_ACCEPT) {
631 rcu_read_unlock();
632 return false;
635 l4proto = __nf_ct_l4proto_find(l3num, protonum);
637 ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
638 l3proto, l4proto);
640 rcu_read_unlock();
641 return ret;
643 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
645 bool
646 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
647 const struct nf_conntrack_tuple *orig,
648 const struct nf_conntrack_l3proto *l3proto,
649 const struct nf_conntrack_l4proto *l4proto)
651 memset(inverse, 0, sizeof(*inverse));
653 inverse->src.l3num = orig->src.l3num;
654 if (l3proto->invert_tuple(inverse, orig) == 0)
655 return false;
657 inverse->dst.dir = !orig->dst.dir;
659 inverse->dst.protonum = orig->dst.protonum;
660 return l4proto->invert_tuple(inverse, orig);
662 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
664 static void
665 clean_from_lists(struct nf_conn *ct)
667 pr_debug("clean_from_lists(%p)\n", ct);
668 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
669 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
671 /* Destroy all pending expectations */
672 nf_ct_remove_expectations(ct);
675 static void
676 destroy_conntrack(struct nf_conntrack *nfct)
678 struct nf_conn *ct = (struct nf_conn *)nfct;
679 struct net *net = nf_ct_net(ct);
680 struct nf_conntrack_l4proto *l4proto;
682 pr_debug("destroy_conntrack(%p)\n", ct);
683 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
684 NF_CT_ASSERT(!timer_pending(&ct->timeout));
686 #ifdef HNDCTF
687 ip_conntrack_ipct_delete(ct, 0);
688 #endif /* HNDCTF*/
689 /* To make sure we don't get any weird locking issues here:
690 * destroy_conntrack() MUST NOT be called with a write lock
691 * to nf_conntrack_lock!!! -HW */
692 rcu_read_lock();
693 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
694 if (l4proto && l4proto->destroy)
695 l4proto->destroy(ct);
697 rcu_read_unlock();
699 spin_lock_bh(&nf_conntrack_lock);
700 /* Expectations will have been removed in clean_from_lists,
701 * except TFTP can create an expectation on the first packet,
702 * before connection is in the list, so we need to clean here,
703 * too. */
704 nf_ct_remove_expectations(ct);
706 /* We overload first tuple to link into unconfirmed list. */
707 if (!nf_ct_is_confirmed(ct)) {
708 BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
709 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
712 NF_CT_STAT_INC(net, delete);
713 spin_unlock_bh(&nf_conntrack_lock);
715 if (ct->master)
716 nf_ct_put(ct->master);
718 pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
719 nf_conntrack_free(ct);
722 void nf_ct_delete_from_lists(struct nf_conn *ct)
724 struct net *net = nf_ct_net(ct);
726 nf_ct_helper_destroy(ct);
727 spin_lock_bh(&nf_conntrack_lock);
728 /* Inside lock so preempt is disabled on module removal path.
729 * Otherwise we can get spurious warnings. */
730 NF_CT_STAT_INC(net, delete_list);
731 clean_from_lists(ct);
732 spin_unlock_bh(&nf_conntrack_lock);
734 EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
736 static void death_by_event(unsigned long ul_conntrack)
738 struct nf_conn *ct = (void *)ul_conntrack;
739 struct net *net = nf_ct_net(ct);
741 if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
742 /* bad luck, let's retry again */
743 ct->timeout.expires = jiffies +
744 (random32() % net->ct.sysctl_events_retry_timeout);
745 add_timer(&ct->timeout);
746 return;
748 /* we've got the event delivered, now it's dying */
749 set_bit(IPS_DYING_BIT, &ct->status);
750 spin_lock(&nf_conntrack_lock);
751 hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
752 spin_unlock(&nf_conntrack_lock);
753 nf_ct_put(ct);
756 void nf_ct_insert_dying_list(struct nf_conn *ct)
758 struct net *net = nf_ct_net(ct);
760 /* add this conntrack to the dying list */
761 spin_lock_bh(&nf_conntrack_lock);
762 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
763 &net->ct.dying);
764 spin_unlock_bh(&nf_conntrack_lock);
765 /* set a new timer to retry event delivery */
766 setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
767 ct->timeout.expires = jiffies +
768 (random32() % net->ct.sysctl_events_retry_timeout);
769 add_timer(&ct->timeout);
771 EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
773 static void death_by_timeout(unsigned long ul_conntrack)
775 struct nf_conn *ct = (void *)ul_conntrack;
776 #ifdef HNDCTF
777 /* If negative error is returned it means the entry hasn't
778 * timed out yet.
780 if (ip_conntrack_ipct_delete(ct, jiffies >= ct->timeout.expires ? 1 : 0) != 0)
781 return;
782 #endif /* HNDCTF */
784 if (!test_bit(IPS_DYING_BIT, &ct->status) &&
785 unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
786 /* destroy event was not delivered */
787 nf_ct_delete_from_lists(ct);
788 nf_ct_insert_dying_list(ct);
789 return;
791 set_bit(IPS_DYING_BIT, &ct->status);
792 nf_ct_delete_from_lists(ct);
793 nf_ct_put(ct);
797 * Warning :
798 * - Caller must take a reference on returned object
799 * and recheck nf_ct_tuple_equal(tuple, &h->tuple)
800 * OR
801 * - Caller must lock nf_conntrack_lock before calling this function
803 struct nf_conntrack_tuple_hash * BCMFASTPATH_HOST
804 __nf_conntrack_find(struct net *net, u16 zone,
805 const struct nf_conntrack_tuple *tuple)
807 struct nf_conntrack_tuple_hash *h;
808 struct hlist_nulls_node *n;
809 unsigned int hash = hash_conntrack(net, zone, tuple);
811 /* Disable BHs the entire time since we normally need to disable them
812 * at least once for the stats anyway.
814 local_bh_disable();
815 begin:
816 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
817 if (nf_ct_tuple_equal(tuple, &h->tuple) &&
818 nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
819 NF_CT_STAT_INC(net, found);
820 local_bh_enable();
821 return h;
823 NF_CT_STAT_INC(net, searched);
826 * if the nulls value we got at the end of this lookup is
827 * not the expected one, we must restart lookup.
828 * We probably met an item that was moved to another chain.
830 if (get_nulls_value(n) != hash) {
831 NF_CT_STAT_INC(net, search_restart);
832 goto begin;
834 local_bh_enable();
836 return NULL;
838 EXPORT_SYMBOL_GPL(__nf_conntrack_find);
840 /* Find a connection corresponding to a tuple. */
841 struct nf_conntrack_tuple_hash * BCMFASTPATH_HOST
842 nf_conntrack_find_get(struct net *net, u16 zone,
843 const struct nf_conntrack_tuple *tuple)
845 struct nf_conntrack_tuple_hash *h;
846 struct nf_conn *ct;
848 rcu_read_lock();
849 begin:
850 h = __nf_conntrack_find(net, zone, tuple);
851 if (h) {
852 ct = nf_ct_tuplehash_to_ctrack(h);
853 if (unlikely(nf_ct_is_dying(ct) ||
854 !atomic_inc_not_zero(&ct->ct_general.use)))
855 h = NULL;
856 else {
857 if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
858 nf_ct_zone(ct) != zone)) {
859 nf_ct_put(ct);
860 goto begin;
864 rcu_read_unlock();
866 return h;
868 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
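/*
 * Usage sketch (hypothetical caller, not taken from this file):
 * nf_conntrack_find_get() returns the tuple hash with an elevated
 * reference count, so every successful lookup is paired with nf_ct_put():
 *
 *	struct nf_conntrack_tuple_hash *h;
 *	struct nf_conn *ct;
 *
 *	h = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, &tuple);
 *	if (h != NULL) {
 *		ct = nf_ct_tuplehash_to_ctrack(h);
 *		... inspect ct, then drop the reference ...
 *		nf_ct_put(ct);
 *	}
 */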
870 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
871 unsigned int hash,
872 unsigned int repl_hash)
874 struct net *net = nf_ct_net(ct);
876 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
877 &net->ct.hash[hash]);
878 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
879 &net->ct.hash[repl_hash]);
882 void nf_conntrack_hash_insert(struct nf_conn *ct)
884 struct net *net = nf_ct_net(ct);
885 unsigned int hash, repl_hash;
886 u16 zone;
888 zone = nf_ct_zone(ct);
889 hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
890 repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
892 __nf_conntrack_hash_insert(ct, hash, repl_hash);
894 EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
896 /* Confirm a connection given skb; places it in hash table */
898 __nf_conntrack_confirm(struct sk_buff *skb)
900 unsigned int hash, repl_hash;
901 struct nf_conntrack_tuple_hash *h;
902 struct nf_conn *ct;
903 struct nf_conn_help *help;
904 struct hlist_nulls_node *n;
905 enum ip_conntrack_info ctinfo;
906 struct net *net;
907 u16 zone;
909 ct = nf_ct_get(skb, &ctinfo);
910 net = nf_ct_net(ct);
912 /* ipt_REJECT uses nf_conntrack_attach to attach related
913 ICMP/TCP RST packets in other direction. Actual packet
914 which created connection will be IP_CT_NEW or for an
915 expected connection, IP_CT_RELATED. */
916 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
917 return NF_ACCEPT;
919 zone = nf_ct_zone(ct);
920 hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
921 repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
923 /* We're not in hash table, and we refuse to set up related
924 connections for unconfirmed conns. But packet copies and
925 REJECT will give spurious warnings here. */
926 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
928 /* No external references means noone else could have
929 confirmed us. */
930 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
931 pr_debug("Confirming conntrack %p\n", ct);
933 spin_lock_bh(&nf_conntrack_lock);
935 /* We have to check the DYING flag inside the lock to prevent
936 a race against nf_ct_get_next_corpse() possibly called from
937 user context, else we insert an already 'dead' hash, blocking
938 further use of that particular connection -JM */
940 if (unlikely(nf_ct_is_dying(ct))) {
941 spin_unlock_bh(&nf_conntrack_lock);
942 return NF_ACCEPT;
945 /* See if there's one in the list already, including reverse:
946 NAT could have grabbed it without realizing, since we're
947 not in the hash. If there is, we lost race. */
948 hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
949 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
950 &h->tuple) &&
951 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
952 goto out;
953 hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
954 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
955 &h->tuple) &&
956 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
957 goto out;
959 /* Remove from unconfirmed list */
960 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
962 /* Timer relative to confirmation time, not original
963 setting time, otherwise we'd get timer wrap in
964 weird delay cases. */
965 ct->timeout.expires += jiffies;
966 add_timer(&ct->timeout);
967 atomic_inc(&ct->ct_general.use);
968 set_bit(IPS_CONFIRMED_BIT, &ct->status);
970 /* Since the lookup is lockless, hash insertion must be done after
971 * starting the timer and setting the CONFIRMED bit. The RCU barriers
972 * guarantee that no other CPU can find the conntrack before the above
973 * stores are visible.
975 __nf_conntrack_hash_insert(ct, hash, repl_hash);
976 NF_CT_STAT_INC(net, insert);
977 spin_unlock_bh(&nf_conntrack_lock);
979 help = nfct_help(ct);
980 if (help && help->helper)
981 nf_conntrack_event_cache(IPCT_HELPER, ct);
983 nf_conntrack_event_cache(master_ct(ct) ?
984 IPCT_RELATED : IPCT_NEW, ct);
985 return NF_ACCEPT;
987 out:
988 NF_CT_STAT_INC(net, insert_failed);
989 spin_unlock_bh(&nf_conntrack_lock);
990 return NF_DROP;
992 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
994 /* Returns true if a connection correspondings to the tuple (required
995 for NAT). */
997 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
998 const struct nf_conn *ignored_conntrack)
1000 struct net *net = nf_ct_net(ignored_conntrack);
1001 struct nf_conntrack_tuple_hash *h;
1002 struct hlist_nulls_node *n;
1003 struct nf_conn *ct;
1004 u16 zone = nf_ct_zone(ignored_conntrack);
1005 unsigned int hash = hash_conntrack(net, zone, tuple);
1007 /* Disable BHs the entire time since we need to disable them at
1008 * least once for the stats anyway.
1010 rcu_read_lock_bh();
1011 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
1012 ct = nf_ct_tuplehash_to_ctrack(h);
1013 if (ct != ignored_conntrack &&
1014 nf_ct_tuple_equal(tuple, &h->tuple) &&
1015 nf_ct_zone(ct) == zone) {
1016 NF_CT_STAT_INC(net, found);
1017 rcu_read_unlock_bh();
1018 return 1;
1020 NF_CT_STAT_INC(net, searched);
1022 rcu_read_unlock_bh();
1024 return 0;
1026 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
1028 #define NF_CT_EVICTION_RANGE 8
1030 /* There's a small race here where we may free a just-assured
1031 connection. Too bad: we're in trouble anyway. */
1032 static noinline int early_drop(struct net *net, unsigned int hash)
1034 /* Use oldest entry, which is roughly LRU */
1035 struct nf_conntrack_tuple_hash *h;
1036 struct nf_conn *ct = NULL, *tmp;
1037 struct hlist_nulls_node *n;
1038 unsigned int i, cnt = 0;
1039 int dropped = 0;
1041 rcu_read_lock();
1042 for (i = 0; i < net->ct.htable_size; i++) {
1043 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
1044 hnnode) {
1045 tmp = nf_ct_tuplehash_to_ctrack(h);
1046 if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
1047 ct = tmp;
1048 cnt++;
1051 if (ct != NULL) {
1052 if (likely(!nf_ct_is_dying(ct) &&
1053 atomic_inc_not_zero(&ct->ct_general.use)))
1054 break;
1055 else
1056 ct = NULL;
1059 if (cnt >= NF_CT_EVICTION_RANGE)
1060 break;
1062 hash = (hash + 1) % net->ct.htable_size;
1064 rcu_read_unlock();
1066 if (!ct)
1067 return dropped;
1069 #ifdef HNDCTF
1070 ip_conntrack_ipct_delete(ct, 0);
1071 #endif /* HNDCTF */
1073 if (del_timer(&ct->timeout)) {
1074 death_by_timeout((unsigned long)ct);
1075 dropped = 1;
1076 NF_CT_STAT_INC_ATOMIC(net, early_drop);
1078 nf_ct_put(ct);
1079 return dropped;
1082 struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
1083 const struct nf_conntrack_tuple *orig,
1084 const struct nf_conntrack_tuple *repl,
1085 gfp_t gfp)
1087 struct nf_conn *ct;
1089 if (unlikely(!nf_conntrack_hash_rnd_initted)) {
1090 get_random_bytes(&nf_conntrack_hash_rnd,
1091 sizeof(nf_conntrack_hash_rnd));
1092 nf_conntrack_hash_rnd_initted = 1;
1095 /* We don't want any race condition at early drop stage */
1096 atomic_inc(&net->ct.count);
1098 if (nf_conntrack_max &&
1099 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
1100 unsigned int hash = hash_conntrack(net, zone, orig);
1101 if (!early_drop(net, hash)) {
1102 atomic_dec(&net->ct.count);
1103 if (net_ratelimit())
1104 printk(KERN_WARNING
1105 "nf_conntrack: table full, dropping"
1106 " packet.\n");
1107 return ERR_PTR(-ENOMEM);
1112 * Do not use kmem_cache_zalloc(), as this cache uses
1113 * SLAB_DESTROY_BY_RCU.
1115 ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
1116 if (ct == NULL) {
1117 pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
1118 atomic_dec(&net->ct.count);
1119 return ERR_PTR(-ENOMEM);
1122 * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next
1123 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
1125 memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
1126 sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
1127 spin_lock_init(&ct->lock);
1128 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1129 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1130 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1131 ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
1132 /* Don't set timer yet: wait for confirmation */
1133 setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
1134 write_pnet(&ct->ct_net, net);
1135 #ifdef CONFIG_NF_CONNTRACK_ZONES
1136 if (zone) {
1137 struct nf_conntrack_zone *nf_ct_zone;
1139 nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
1140 if (!nf_ct_zone)
1141 goto out_free;
1142 nf_ct_zone->id = zone;
1144 #endif
1146 * changes to lookup keys must be done before setting refcnt to 1
1148 smp_wmb();
1149 atomic_set(&ct->ct_general.use, 1);
1150 return ct;
1152 #ifdef CONFIG_NF_CONNTRACK_ZONES
1153 out_free:
1154 kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
1155 return ERR_PTR(-ENOMEM);
1156 #endif
1158 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1160 void nf_conntrack_free(struct nf_conn *ct)
1162 struct net *net = nf_ct_net(ct);
1164 nf_ct_ext_destroy(ct);
1165 atomic_dec(&net->ct.count);
1166 nf_ct_ext_free(ct);
1167 kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
1169 EXPORT_SYMBOL_GPL(nf_conntrack_free);
1171 /* Allocate a new conntrack: we return -ENOMEM if classification
1172 failed due to stress. Otherwise it really is unclassifiable. */
1173 static struct nf_conntrack_tuple_hash *
1174 init_conntrack(struct net *net, struct nf_conn *tmpl,
1175 const struct nf_conntrack_tuple *tuple,
1176 struct nf_conntrack_l3proto *l3proto,
1177 struct nf_conntrack_l4proto *l4proto,
1178 struct sk_buff *skb,
1179 unsigned int dataoff)
1181 struct nf_conn *ct;
1182 struct nf_conn_help *help;
1183 struct nf_conntrack_tuple repl_tuple;
1184 struct nf_conntrack_ecache *ecache;
1185 struct nf_conntrack_expect *exp;
1186 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
1188 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
1189 pr_debug("Can't invert tuple.\n");
1190 return NULL;
1193 ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
1194 if (IS_ERR(ct)) {
1195 pr_debug("Can't allocate conntrack.\n");
1196 return (struct nf_conntrack_tuple_hash *)ct;
1199 if (!l4proto->new(ct, skb, dataoff)) {
1200 nf_conntrack_free(ct);
1201 pr_debug("init conntrack: can't track with proto module\n");
1202 return NULL;
1205 nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1207 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1208 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1209 ecache ? ecache->expmask : 0,
1210 GFP_ATOMIC);
1212 spin_lock_bh(&nf_conntrack_lock);
1213 exp = nf_ct_find_expectation(net, zone, tuple);
1214 if (exp) {
1215 pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
1216 ct, exp);
1217 /* Welcome, Mr. Bond. We've been expecting you... */
1218 __set_bit(IPS_EXPECTED_BIT, &ct->status);
1219 ct->master = exp->master;
1220 if (exp->helper) {
1221 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1222 if (help)
1223 rcu_assign_pointer(help->helper, exp->helper);
1226 #ifdef CONFIG_NF_CONNTRACK_MARK
1227 ct->mark = exp->master->mark;
1228 #endif
1229 #ifdef CONFIG_NF_CONNTRACK_SECMARK
1230 ct->secmark = exp->master->secmark;
1231 #endif
1232 nf_conntrack_get(&ct->master->ct_general);
1233 NF_CT_STAT_INC(net, expect_new);
1234 } else {
1235 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1236 NF_CT_STAT_INC(net, new);
1239 /* Overload tuple linked list to put us in unconfirmed list. */
1240 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
1241 &net->ct.unconfirmed);
1243 spin_unlock_bh(&nf_conntrack_lock);
1245 if (exp) {
1246 if (exp->expectfn)
1247 exp->expectfn(ct, exp);
1248 nf_ct_expect_put(exp);
1251 return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1254 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1255 static inline struct nf_conn *
1256 resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1257 struct sk_buff *skb,
1258 unsigned int dataoff,
1259 u_int16_t l3num,
1260 u_int8_t protonum,
1261 struct nf_conntrack_l3proto *l3proto,
1262 struct nf_conntrack_l4proto *l4proto,
1263 int *set_reply,
1264 enum ip_conntrack_info *ctinfo)
1266 struct nf_conntrack_tuple tuple;
1267 struct nf_conntrack_tuple_hash *h;
1268 struct nf_conn *ct;
1269 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
1271 if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1272 dataoff, l3num, protonum, &tuple, l3proto,
1273 l4proto)) {
1274 pr_debug("resolve_normal_ct: Can't get tuple\n");
1275 return NULL;
1278 /* look for tuple match */
1279 h = nf_conntrack_find_get(net, zone, &tuple);
1280 if (!h) {
1281 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
1282 skb, dataoff);
1283 if (!h)
1284 return NULL;
1285 if (IS_ERR(h))
1286 return (void *)h;
1288 ct = nf_ct_tuplehash_to_ctrack(h);
1290 /* It exists; we have (non-exclusive) reference. */
1291 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1292 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1293 /* Please set reply bit if this packet OK */
1294 *set_reply = 1;
1295 } else {
1296 /* Once we've had two way comms, always ESTABLISHED. */
1297 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1298 pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
1299 *ctinfo = IP_CT_ESTABLISHED;
1300 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1301 pr_debug("nf_conntrack_in: related packet for %p\n",
1302 ct);
1303 *ctinfo = IP_CT_RELATED;
1304 } else {
1305 pr_debug("nf_conntrack_in: new packet for %p\n", ct);
1306 *ctinfo = IP_CT_NEW;
1308 *set_reply = 0;
1310 skb->nfct = &ct->ct_general;
1311 skb->nfctinfo = *ctinfo;
1312 return ct;
1315 unsigned int BCMFASTPATH_HOST
1316 nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1317 struct sk_buff *skb)
1319 struct nf_conn *ct, *tmpl = NULL;
1320 enum ip_conntrack_info ctinfo;
1321 struct nf_conntrack_l3proto *l3proto;
1322 struct nf_conntrack_l4proto *l4proto;
1323 unsigned int dataoff;
1324 u_int8_t protonum;
1325 int set_reply = 0;
1326 int ret;
1328 if (skb->nfct) {
1329 /* Previously seen (loopback or untracked)? Ignore. */
1330 tmpl = (struct nf_conn *)skb->nfct;
1331 if (!nf_ct_is_template(tmpl)) {
1332 NF_CT_STAT_INC_ATOMIC(net, ignore);
1333 return NF_ACCEPT;
1335 skb->nfct = NULL;
1338 /* rcu_read_lock()ed by nf_hook_slow */
1339 l3proto = __nf_ct_l3proto_find(pf);
1340 ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
1341 &dataoff, &protonum);
1342 if (ret <= 0) {
1343 pr_debug("not prepared to track yet or error occured\n");
1344 NF_CT_STAT_INC_ATOMIC(net, error);
1345 NF_CT_STAT_INC_ATOMIC(net, invalid);
1346 ret = -ret;
1347 goto out;
1350 l4proto = __nf_ct_l4proto_find(pf, protonum);
1352 /* It may be an special packet, error, unclean...
1353 * inverse of the return code tells to the netfilter
1354 * core what to do with the packet. */
1355 if (l4proto->error != NULL) {
1356 ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
1357 pf, hooknum);
1358 if (ret <= 0) {
1359 NF_CT_STAT_INC_ATOMIC(net, error);
1360 NF_CT_STAT_INC_ATOMIC(net, invalid);
1361 ret = -ret;
1362 goto out;
1366 ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
1367 l3proto, l4proto, &set_reply, &ctinfo);
1368 if (!ct) {
1369 /* Not valid part of a connection */
1370 NF_CT_STAT_INC_ATOMIC(net, invalid);
1371 ret = NF_ACCEPT;
1372 goto out;
1375 if (IS_ERR(ct)) {
1376 /* Too stressed to deal. */
1377 NF_CT_STAT_INC_ATOMIC(net, drop);
1378 ret = NF_DROP;
1379 goto out;
1382 NF_CT_ASSERT(skb->nfct);
1384 ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
1385 if (ret <= 0) {
1386 /* Invalid: inverse of the return code tells
1387 * the netfilter core what to do */
1388 pr_debug("nf_conntrack_in: Can't track with proto module\n");
1389 nf_conntrack_put(skb->nfct);
1390 skb->nfct = NULL;
1391 NF_CT_STAT_INC_ATOMIC(net, invalid);
1392 if (ret == -NF_DROP)
1393 NF_CT_STAT_INC_ATOMIC(net, drop);
1394 ret = -ret;
1395 goto out;
1398 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1399 nf_conntrack_event_cache(IPCT_REPLY, ct);
1400 out:
1401 if (tmpl)
1402 nf_ct_put(tmpl);
1404 return ret;
1406 EXPORT_SYMBOL_GPL(nf_conntrack_in);
1408 bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1409 const struct nf_conntrack_tuple *orig)
1411 bool ret;
1413 rcu_read_lock();
1414 ret = nf_ct_invert_tuple(inverse, orig,
1415 __nf_ct_l3proto_find(orig->src.l3num),
1416 __nf_ct_l4proto_find(orig->src.l3num,
1417 orig->dst.protonum));
1418 rcu_read_unlock();
1419 return ret;
1421 EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
1423 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1424 implicitly racy: see __nf_conntrack_confirm */
1425 void nf_conntrack_alter_reply(struct nf_conn *ct,
1426 const struct nf_conntrack_tuple *newreply)
1428 struct nf_conn_help *help = nfct_help(ct);
1430 /* Should be unconfirmed, so not in hash table yet */
1431 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
1433 pr_debug("Altering reply tuple of %p to ", ct);
1434 nf_ct_dump_tuple(newreply);
1436 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1437 if (ct->master || (help && !hlist_empty(&help->expectations)))
1438 return;
1440 rcu_read_lock();
1441 __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
1442 rcu_read_unlock();
1444 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
1446 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1447 void __nf_ct_refresh_acct(struct nf_conn *ct,
1448 enum ip_conntrack_info ctinfo,
1449 const struct sk_buff *skb,
1450 unsigned long extra_jiffies,
1451 int do_acct)
1453 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1454 NF_CT_ASSERT(skb);
1456 /* Only update if this is not a fixed timeout */
1457 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1458 goto acct;
1460 /* If not in hash table, timer will not be active yet */
1461 if (!nf_ct_is_confirmed(ct)) {
1462 #ifdef HNDCTF
1463 ct->expire_jiffies = extra_jiffies;
1464 #endif /* HNDCTF */
1465 ct->timeout.expires = extra_jiffies;
1466 } else {
1467 unsigned long newtime = jiffies + extra_jiffies;
1469 /* Only update the timeout if the new timeout is at least
1470 HZ jiffies from the old timeout. Need del_timer for race
1471 avoidance (may already be dying). */
1472 if (newtime - ct->timeout.expires >= HZ)
1473 #ifdef HNDCTF
1474 ct->expire_jiffies = extra_jiffies;
1475 #endif /* HNDCTF */
1476 mod_timer_pending(&ct->timeout, newtime);
1479 acct:
1480 if (do_acct) {
1481 struct nf_conn_counter *acct;
1483 acct = nf_conn_acct_find(ct);
1484 if (acct) {
1485 spin_lock_bh(&ct->lock);
1486 acct[CTINFO2DIR(ctinfo)].packets++;
1487 acct[CTINFO2DIR(ctinfo)].bytes += skb->len;
1488 spin_unlock_bh(&ct->lock);
1492 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
1494 bool __nf_ct_kill_acct(struct nf_conn *ct,
1495 enum ip_conntrack_info ctinfo,
1496 const struct sk_buff *skb,
1497 int do_acct)
1499 if (do_acct) {
1500 struct nf_conn_counter *acct;
1502 acct = nf_conn_acct_find(ct);
1503 if (acct) {
1504 spin_lock_bh(&ct->lock);
1505 acct[CTINFO2DIR(ctinfo)].packets++;
1506 acct[CTINFO2DIR(ctinfo)].bytes +=
1507 skb->len - skb_network_offset(skb);
1508 spin_unlock_bh(&ct->lock);
1512 if (del_timer(&ct->timeout)) {
1513 ct->timeout.function((unsigned long)ct);
1514 return true;
1516 return false;
1518 EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
1520 #ifdef CONFIG_NF_CONNTRACK_ZONES
1521 static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
1522 .len = sizeof(struct nf_conntrack_zone),
1523 .align = __alignof__(struct nf_conntrack_zone),
1524 .id = NF_CT_EXT_ZONE,
1526 #endif
1528 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
1530 #include <linux/netfilter/nfnetlink.h>
1531 #include <linux/netfilter/nfnetlink_conntrack.h>
1532 #include <linux/mutex.h>
1534 /* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1535 * in ip_conntrack_core, since we don't want the protocols to autoload
1536 * or depend on ctnetlink */
1537 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
1538 const struct nf_conntrack_tuple *tuple)
1540 NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
1541 NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
1542 return 0;
1544 nla_put_failure:
1545 return -1;
1547 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
1549 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1550 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 },
1551 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 },
1553 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
1555 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
1556 struct nf_conntrack_tuple *t)
1558 if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
1559 return -EINVAL;
1561 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1562 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
1564 return 0;
1566 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
1568 int nf_ct_port_nlattr_tuple_size(void)
1570 return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1572 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
1573 #endif
1575 /* Used by ipt_REJECT and ip6t_REJECT. */
1576 static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1578 struct nf_conn *ct;
1579 enum ip_conntrack_info ctinfo;
1581 /* This ICMP is in reverse direction to the packet which caused it */
1582 ct = nf_ct_get(skb, &ctinfo);
1583 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1584 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1585 else
1586 ctinfo = IP_CT_RELATED;
1588 /* Attach to new skbuff, and increment count */
1589 nskb->nfct = &ct->ct_general;
1590 nskb->nfctinfo = ctinfo;
1591 nf_conntrack_get(nskb->nfct);
1594 /* Bring out ya dead! */
1595 static struct nf_conn *
1596 get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
1597 void *data, unsigned int *bucket)
1599 struct nf_conntrack_tuple_hash *h;
1600 struct nf_conn *ct;
1601 struct hlist_nulls_node *n;
1603 spin_lock_bh(&nf_conntrack_lock);
1604 for (; *bucket < net->ct.htable_size; (*bucket)++) {
1605 hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
1606 ct = nf_ct_tuplehash_to_ctrack(h);
1607 if (iter(ct, data))
1608 goto found;
1611 hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
1612 ct = nf_ct_tuplehash_to_ctrack(h);
1613 if (iter(ct, data))
1614 set_bit(IPS_DYING_BIT, &ct->status);
1616 spin_unlock_bh(&nf_conntrack_lock);
1617 return NULL;
1618 found:
1619 atomic_inc(&ct->ct_general.use);
1620 spin_unlock_bh(&nf_conntrack_lock);
1621 return ct;
1624 void nf_ct_iterate_cleanup(struct net *net,
1625 int (*iter)(struct nf_conn *i, void *data),
1626 void *data)
1628 struct nf_conn *ct;
1629 unsigned int bucket = 0;
1631 while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
1632 #ifdef HNDCTF
1633 ip_conntrack_ipct_delete(ct, 0);
1634 #endif /* HNDCTF */
1635 /* Time to push up daises... */
1636 if (del_timer(&ct->timeout))
1637 death_by_timeout((unsigned long)ct);
1638 /* ... else the timer will get him soon. */
1640 nf_ct_put(ct);
1643 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
1645 struct __nf_ct_flush_report {
1646 u32 pid;
1647 int report;
1650 static int kill_report(struct nf_conn *i, void *data)
1652 struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
1654 /* If we fail to deliver the event, death_by_timeout() will retry */
1655 if (nf_conntrack_event_report(IPCT_DESTROY, i,
1656 fr->pid, fr->report) < 0)
1657 return 1;
1659 /* Avoid the delivery of the destroy event in death_by_timeout(). */
1660 set_bit(IPS_DYING_BIT, &i->status);
1661 return 1;
1664 static int kill_all(struct nf_conn *i, void *data)
1666 return 1;
1669 void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
1671 if (vmalloced)
1672 vfree(hash);
1673 else
1674 free_pages((unsigned long)hash,
1675 get_order(sizeof(struct hlist_head) * size));
1677 EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
1679 void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
1681 struct __nf_ct_flush_report fr = {
1682 .pid = pid,
1683 .report = report,
1685 nf_ct_iterate_cleanup(net, kill_report, &fr);
1687 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
1689 static void nf_ct_release_dying_list(struct net *net)
1691 struct nf_conntrack_tuple_hash *h;
1692 struct nf_conn *ct;
1693 struct hlist_nulls_node *n;
1695 spin_lock_bh(&nf_conntrack_lock);
1696 hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
1697 ct = nf_ct_tuplehash_to_ctrack(h);
1698 /* never fails to remove them, no listeners at this point */
1699 nf_ct_kill(ct);
1701 spin_unlock_bh(&nf_conntrack_lock);
1704 static int untrack_refs(void)
1706 int cnt = 0, cpu;
1708 for_each_possible_cpu(cpu) {
1709 struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1711 cnt += atomic_read(&ct->ct_general.use) - 1;
1713 return cnt;
1716 static void nf_conntrack_cleanup_init_net(void)
1718 while (untrack_refs() > 0)
1719 schedule();
1721 nf_conntrack_helper_fini();
1722 nf_conntrack_proto_fini();
1723 #ifdef CONFIG_NF_CONNTRACK_ZONES
1724 nf_ct_extend_unregister(&nf_ct_zone_extend);
1725 #endif
1728 static void nf_conntrack_cleanup_net(struct net *net)
1730 i_see_dead_people:
1731 nf_ct_iterate_cleanup(net, kill_all, NULL);
1732 nf_ct_release_dying_list(net);
1733 if (atomic_read(&net->ct.count) != 0) {
1734 schedule();
1735 goto i_see_dead_people;
1738 nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
1739 net->ct.htable_size);
1740 nf_conntrack_ecache_fini(net);
1741 nf_conntrack_acct_fini(net);
1742 nf_conntrack_expect_fini(net);
1743 kmem_cache_destroy(net->ct.nf_conntrack_cachep);
1744 kfree(net->ct.slabname);
1745 free_percpu(net->ct.stat);
1748 /* Mishearing the voices in his head, our hero wonders how he's
1749 supposed to kill the mall. */
1750 void nf_conntrack_cleanup(struct net *net)
1752 if (net_eq(net, &init_net))
1753 rcu_assign_pointer(ip_ct_attach, NULL);
1755 /* This makes sure all current packets have passed through
1756 netfilter framework. Roll on, two-stage module
1757 delete... */
1758 synchronize_net();
1760 nf_conntrack_cleanup_net(net);
1762 if (net_eq(net, &init_net)) {
1763 rcu_assign_pointer(nf_ct_destroy, NULL);
1764 nf_conntrack_cleanup_init_net();
1768 void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
1770 struct hlist_nulls_head *hash;
1771 unsigned int nr_slots, i;
1772 size_t sz;
1774 *vmalloced = 0;
1776 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
1777 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
1778 sz = nr_slots * sizeof(struct hlist_nulls_head);
1779 hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
1780 get_order(sz));
1781 if (!hash) {
1782 *vmalloced = 1;
1783 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1784 hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1785 PAGE_KERNEL);
1788 if (hash && nulls)
1789 for (i = 0; i < nr_slots; i++)
1790 INIT_HLIST_NULLS_HEAD(&hash[i], i);
1792 return hash;
1794 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
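/*
 * Worked example (illustrative, assuming 4 KiB pages and an 8-byte
 * struct hlist_nulls_head): nf_ct_alloc_hashtable() rounds the requested
 * bucket count up to a whole number of pages, so *sizep = 300 becomes
 * roundup(300, 4096 / 8) = 512 slots (one page) and *sizep = 5000 becomes
 * 5120 slots (ten pages). With nulls enabled, each empty bucket is then
 * seeded with its own index as the nulls marker.
 */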
1796 int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1798 int i, bucket, vmalloced, old_vmalloced;
1799 unsigned int hashsize, old_size;
1800 struct hlist_nulls_head *hash, *old_hash;
1801 struct nf_conntrack_tuple_hash *h;
1802 struct nf_conn *ct;
1804 if (current->nsproxy->net_ns != &init_net)
1805 return -EOPNOTSUPP;
1807 /* On boot, we can set this without any fancy locking. */
1808 if (!nf_conntrack_htable_size)
1809 return param_set_uint(val, kp);
1811 hashsize = simple_strtoul(val, NULL, 0);
1812 if (!hashsize)
1813 return -EINVAL;
1815 hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
1816 if (!hash)
1817 return -ENOMEM;
1819 /* Lookups in the old hash might happen in parallel, which means we
1820 * might get false negatives during connection lookup. New connections
1821 * created because of a false negative won't make it into the hash
1822 * though since that required taking the lock.
1824 spin_lock_bh(&nf_conntrack_lock);
1825 for (i = 0; i < init_net.ct.htable_size; i++) {
1826 while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
1827 h = hlist_nulls_entry(init_net.ct.hash[i].first,
1828 struct nf_conntrack_tuple_hash, hnnode);
1829 ct = nf_ct_tuplehash_to_ctrack(h);
1830 hlist_nulls_del_rcu(&h->hnnode);
1831 bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
1832 hashsize,
1833 nf_conntrack_hash_rnd);
1834 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
1837 old_size = init_net.ct.htable_size;
1838 old_vmalloced = init_net.ct.hash_vmalloc;
1839 old_hash = init_net.ct.hash;
1841 init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
1842 init_net.ct.hash_vmalloc = vmalloced;
1843 init_net.ct.hash = hash;
1844 spin_unlock_bh(&nf_conntrack_lock);
1846 nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
1847 return 0;
1849 EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
1851 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
1852 &nf_conntrack_htable_size, 0600);
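/*
 * Usage note (illustrative): registering the parameter with a custom set
 * handler and mode 0600 makes the bucket count adjustable at runtime, e.g.
 *
 *	echo 65536 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * or settable at load/boot time (hashsize=65536 as a module option, or
 * nf_conntrack.hashsize=65536 when built in). nf_conntrack_set_hashsize()
 * then rehashes every existing entry into the new table while holding
 * nf_conntrack_lock.
 */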
1854 void nf_ct_untracked_status_or(unsigned long bits)
1856 int cpu;
1858 for_each_possible_cpu(cpu)
1859 per_cpu(nf_conntrack_untracked, cpu).status |= bits;
1861 EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
1863 static int nf_conntrack_init_init_net(void)
1865 int max_factor = 8;
1866 int ret, cpu;
1868 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1869 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
1870 if (!nf_conntrack_htable_size) {
1871 nf_conntrack_htable_size
1872 = (((totalram_pages << PAGE_SHIFT) / 16384)
1873 / sizeof(struct hlist_head));
1874 if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
1875 nf_conntrack_htable_size = 16384;
1876 if (nf_conntrack_htable_size < 32)
1877 nf_conntrack_htable_size = 32;
1879 /* Use a max. factor of four by default to get the same max as
1880 * with the old struct list_heads. When a table size is given
1881 * we use the old value of 8 to avoid reducing the max.
1882 * entries. */
1883 max_factor = 4;
1885 nf_conntrack_max = max_factor * nf_conntrack_htable_size;
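/*
 * Worked example of the sizing above (illustrative, assuming 4 KiB pages
 * and a 4-byte struct hlist_head as on i386): a 256 MB machine gets
 * (268435456 / 16384) / 4 = 4096 buckets and, with max_factor = 4,
 * nf_conntrack_max = 16384 entries. The 32 MB / 512-bucket and
 * >= 1 GB / 16384-bucket figures quoted in the comment follow from the
 * same formula.
 */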
1887 printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
1888 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1889 nf_conntrack_max);
1891 ret = nf_conntrack_proto_init();
1892 if (ret < 0)
1893 goto err_proto;
1895 ret = nf_conntrack_helper_init();
1896 if (ret < 0)
1897 goto err_helper;
1899 #ifdef CONFIG_NF_CONNTRACK_ZONES
1900 ret = nf_ct_extend_register(&nf_ct_zone_extend);
1901 if (ret < 0)
1902 goto err_extend;
1903 #endif
1904 /* Set up fake conntrack: to never be deleted, not in any hashes */
1905 for_each_possible_cpu(cpu) {
1906 struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1907 write_pnet(&ct->ct_net, &init_net);
1908 atomic_set(&ct->ct_general.use, 1);
1910 /* - and look it like as a confirmed connection */
1911 nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
1912 return 0;
1914 #ifdef CONFIG_NF_CONNTRACK_ZONES
1915 err_extend:
1916 nf_conntrack_helper_fini();
1917 #endif
1918 err_helper:
1919 nf_conntrack_proto_fini();
1920 err_proto:
1921 return ret;
1925 * We need to use special "null" values, not used in hash table
1927 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
1928 #define DYING_NULLS_VAL ((1<<30)+1)
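/*
 * Note (illustrative): hash chains are seeded with their bucket index as
 * the nulls marker (see nf_ct_alloc_hashtable()), and bucket counts stay
 * far below 1 << 30, so these two values can never be mistaken for a hash
 * bucket. That is what allows a lockless traversal such as the one in
 * __nf_conntrack_find() to inspect get_nulls_value() at the end of a chain
 * and tell whether it finished on the expected list or must restart.
 */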
1930 static int nf_conntrack_init_net(struct net *net)
1932 int ret;
1934 atomic_set(&net->ct.count, 0);
1935 INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
1936 INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
1937 net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
1938 if (!net->ct.stat) {
1939 ret = -ENOMEM;
1940 goto err_stat;
1943 net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
1944 if (!net->ct.slabname) {
1945 ret = -ENOMEM;
1946 goto err_slabname;
1949 net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
1950 sizeof(struct nf_conn), 0,
1951 SLAB_DESTROY_BY_RCU, NULL);
1952 if (!net->ct.nf_conntrack_cachep) {
1953 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1954 ret = -ENOMEM;
1955 goto err_cache;
1958 net->ct.htable_size = nf_conntrack_htable_size;
1959 net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
1960 &net->ct.hash_vmalloc, 1);
1961 if (!net->ct.hash) {
1962 ret = -ENOMEM;
1963 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1964 goto err_hash;
1966 ret = nf_conntrack_expect_init(net);
1967 if (ret < 0)
1968 goto err_expect;
1969 ret = nf_conntrack_acct_init(net);
1970 if (ret < 0)
1971 goto err_acct;
1972 ret = nf_conntrack_ecache_init(net);
1973 if (ret < 0)
1974 goto err_ecache;
1976 return 0;
1978 err_ecache:
1979 nf_conntrack_acct_fini(net);
1980 err_acct:
1981 nf_conntrack_expect_fini(net);
1982 err_expect:
1983 nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
1984 net->ct.htable_size);
1985 err_hash:
1986 kmem_cache_destroy(net->ct.nf_conntrack_cachep);
1987 err_cache:
1988 kfree(net->ct.slabname);
1989 err_slabname:
1990 free_percpu(net->ct.stat);
1991 err_stat:
1992 return ret;
1995 s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
1996 enum ip_conntrack_dir dir,
1997 u32 seq);
1998 EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
2000 int nf_conntrack_init(struct net *net)
2002 int ret;
2004 if (net_eq(net, &init_net)) {
2005 ret = nf_conntrack_init_init_net();
2006 if (ret < 0)
2007 goto out_init_net;
2009 ret = nf_conntrack_init_net(net);
2010 if (ret < 0)
2011 goto out_net;
2013 if (net_eq(net, &init_net)) {
2014 /* For use by REJECT target */
2015 rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
2016 rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
2018 /* Howto get NAT offsets */
2019 rcu_assign_pointer(nf_ct_nat_offset, NULL);
2021 return 0;
2023 out_net:
2024 if (net_eq(net, &init_net))
2025 nf_conntrack_cleanup_init_net();
2026 out_init_net:
2027 return ret;