tomato.git: release/src-rt-6.x.4708/linux/linux-2.6.36/net/netfilter/nf_conntrack_core.c
1 /* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */
2 /* Connection state tracking for netfilter. This is separated from,
3 but required by, the NAT layer; it can also be used by an iptables
4 extension. */
6 /* (C) 1999-2001 Paul `Rusty' Russell
7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
15 #include <linux/types.h>
16 #include <linux/netfilter.h>
17 #include <linux/module.h>
18 #include <linux/sched.h>
19 #include <linux/skbuff.h>
20 #include <linux/proc_fs.h>
21 #include <linux/vmalloc.h>
22 #include <linux/stddef.h>
23 #include <linux/slab.h>
24 #include <linux/random.h>
25 #include <linux/jhash.h>
26 #include <linux/err.h>
27 #include <linux/percpu.h>
28 #include <linux/moduleparam.h>
29 #include <linux/notifier.h>
30 #include <linux/kernel.h>
31 #include <linux/netdevice.h>
32 #include <linux/socket.h>
33 #include <linux/mm.h>
34 #include <linux/nsproxy.h>
35 #include <linux/rculist_nulls.h>
37 #include <net/netfilter/nf_conntrack.h>
38 #include <net/netfilter/nf_conntrack_l3proto.h>
39 #include <net/netfilter/nf_conntrack_l4proto.h>
40 #include <net/netfilter/nf_conntrack_expect.h>
41 #include <net/netfilter/nf_conntrack_helper.h>
42 #include <net/netfilter/nf_conntrack_core.h>
43 #include <net/netfilter/nf_conntrack_extend.h>
44 #include <net/netfilter/nf_conntrack_acct.h>
45 #include <net/netfilter/nf_conntrack_ecache.h>
46 #include <net/netfilter/nf_conntrack_zones.h>
47 #include <net/netfilter/nf_nat.h>
48 #include <net/netfilter/nf_nat_core.h>
50 #define NF_CONNTRACK_VERSION "0.5.0"
52 #ifdef HNDCTF
53 #include <linux/if.h>
54 #include <linux/if_vlan.h>
55 #include <linux/if_pppox.h>
56 #include <linux/in.h>
57 #include <linux/ip.h>
58 #include <linux/tcp.h>
60 #ifdef CONFIG_IPV6
61 #include <linux/ipv6.h>
62 #include <net/ipv6.h>
63 #include <net/ip6_route.h>
64 #define IPVERSION_IS_4(ipver) ((ipver) == 4)
65 #else
66 #define IPVERSION_IS_4(ipver) 1
67 #endif /* CONFIG_IPV6 */
69 #include <net/ip.h>
70 #include <net/route.h>
71 #include <typedefs.h>
72 #include <osl.h>
73 #include <ctf/hndctf.h>
75 #define NFC_CTF_ENABLED (1 << 31)
76 #endif /* HNDCTF */
78 int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
79 enum nf_nat_manip_type manip,
80 const struct nlattr *attr) __read_mostly;
81 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
83 DEFINE_SPINLOCK(nf_conntrack_lock);
84 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
86 unsigned int nf_conntrack_htable_size __read_mostly;
87 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
89 unsigned int nf_conntrack_max __read_mostly;
90 EXPORT_SYMBOL_GPL(nf_conntrack_max);
92 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
93 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
95 #ifdef HNDCTF
97 * Display an IP address in readable format.
99 bool
100 ip_conntrack_is_ipc_allowed(struct sk_buff *skb, u_int32_t hooknum)
102 struct net_device *dev;
104 if (!CTF_ENAB(kcih))
105 return FALSE;
107 if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_POST_ROUTING) {
108 dev = skb->dev;
109 if (dev->priv_flags & IFF_802_1Q_VLAN)
110 dev = vlan_dev_real_dev(dev);
112 /* Add ipc entry if packet is received on ctf enabled interface
113 * and the packet is not a defrag'd one.
115 if (ctf_isenabled(kcih, dev) && (skb->len <= dev->mtu))
116 skb->nfcache |= NFC_CTF_ENABLED;
119 /* Add the cache entries only if the device has registered and
120 * enabled ctf.
122 if (skb->nfcache & NFC_CTF_ENABLED)
123 return TRUE;
125 return FALSE;
128 void
129 ip_conntrack_ipct_add(struct sk_buff *skb, u_int32_t hooknum,
130 struct nf_conn *ct, enum ip_conntrack_info ci,
131 struct nf_conntrack_tuple *manip)
133 ctf_ipc_t ipc_entry;
134 struct hh_cache *hh;
135 struct ethhdr *eth;
136 struct iphdr *iph;
137 struct tcphdr *tcph;
138 struct rtable *rt;
139 struct nf_conn_help *help;
140 enum ip_conntrack_dir dir;
141 uint8 ipver, protocol;
142 #ifdef CONFIG_IPV6
143 struct ipv6hdr *ip6h = NULL;
144 #endif /* CONFIG_IPV6 */
145 uint32 nud_flags;
147 if ((skb == NULL) || (ct == NULL))
148 return;
150 /* Check CTF enabled */
151 if (!ip_conntrack_is_ipc_allowed(skb, hooknum))
152 return;
153 /* We only add cache entires for non-helper connections and at
154 * pre or post routing hooks.
156 help = nfct_help(ct);
157 if ((help && help->helper) || (ct->ctf_flags & CTF_FLAGS_EXCLUDED) ||
158 ((hooknum != NF_INET_PRE_ROUTING) && (hooknum != NF_INET_POST_ROUTING)))
159 return;
161 iph = ip_hdr(skb);
162 ipver = iph->version;
164 /* Support both IPv4 and IPv6 */
165 if (ipver == 4) {
166 tcph = ((struct tcphdr *)(((__u8 *)iph) + (iph->ihl << 2)));
167 protocol = iph->protocol;
169 #ifdef CONFIG_IPV6
170 else if (ipver == 6) {
171 ip6h = (struct ipv6hdr *)iph;
172 tcph = (struct tcphdr *)ctf_ipc_lkup_l4proto(kcih, ip6h, &protocol);
173 if (tcph == NULL)
174 return;
176 #endif /* CONFIG_IPV6 */
177 else
178 return;
180 /* Only TCP and UDP are supported */
181 if (protocol == IPPROTO_TCP) {
182 /* Add ipc entries for connections in established state only */
183 if ((ci != IP_CT_ESTABLISHED) && (ci != (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)))
184 return;
186 if (ct->proto.tcp.state >= TCP_CONNTRACK_FIN_WAIT &&
187 ct->proto.tcp.state <= TCP_CONNTRACK_TIME_WAIT)
188 return;
190 else if (protocol != IPPROTO_UDP)
191 return;
193 dir = CTINFO2DIR(ci);
194 if (ct->ctf_flags & (1 << dir))
195 return;
197 /* Do route lookup for alias address if we are doing DNAT in this
198 * direction.
200 if (skb_dst(skb) == NULL) {
201 /* Find the destination interface */
202 if (IPVERSION_IS_4(ipver)) {
203 u_int32_t daddr;
205 if ((manip != NULL) && (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST))
206 daddr = manip->dst.u3.ip;
207 else
208 daddr = iph->daddr;
209 ip_route_input(skb, daddr, iph->saddr, iph->tos, skb->dev);
211 #ifdef CONFIG_IPV6
212 else
213 ip6_route_input(skb);
214 #endif /* CONFIG_IPV6 */
217 /* Ensure the packet belongs to a forwarding connection and it is
218 * destined to an unicast address.
220 rt = (struct rtable *)skb_dst(skb);
222 nud_flags = NUD_PERMANENT | NUD_REACHABLE | NUD_STALE | NUD_DELAY | NUD_PROBE;
223 #ifdef CTF_PPPOE
224 if ((skb_dst(skb) != NULL) && (skb_dst(skb)->dev != NULL) &&
225 (skb_dst(skb)->dev->flags & IFF_POINTOPOINT))
226 nud_flags |= NUD_NOARP;
227 #endif
229 if ((rt == NULL) || (
230 #ifdef CONFIG_IPV6
231 !IPVERSION_IS_4(ipver) ?
232 ((rt->dst.input != ip6_forward) ||
233 !(ipv6_addr_type(&ip6h->daddr) & IPV6_ADDR_UNICAST)) :
234 #endif /* CONFIG_IPV6 */
235 ((rt->dst.input != ip_forward) || (rt->rt_type != RTN_UNICAST))) ||
236 (rt->dst.neighbour == NULL) ||
237 ((rt->dst.neighbour->nud_state & nud_flags) == 0))
238 return;
240 memset(&ipc_entry, 0, sizeof(ipc_entry));
242 /* Init the neighboring sender address */
243 memcpy(ipc_entry.sa.octet, eth_hdr(skb)->h_source, ETH_ALEN);
245 /* If the packet is received on a bridge device then save
246 * the bridge cache entry pointer in the ip cache entry.
247 * This will be referenced in the data path to update the
248 * live counter of brc entry whenever a received packet
249 * matches corresponding ipc entry matches.
251 if ((skb->dev != NULL) && ctf_isbridge(kcih, skb->dev))
252 ipc_entry.brcp = ctf_brc_lkup(kcih, eth_hdr(skb)->h_source);
254 hh = skb_dst(skb)->hh;
255 if (hh != NULL) {
256 eth = (struct ethhdr *)(((unsigned char *)hh->hh_data) + 2);
257 memcpy(ipc_entry.dhost.octet, eth->h_dest, ETH_ALEN);
258 memcpy(ipc_entry.shost.octet, eth->h_source, ETH_ALEN);
259 } else {
260 memcpy(ipc_entry.dhost.octet, rt->dst.neighbour->ha, ETH_ALEN);
261 memcpy(ipc_entry.shost.octet, skb_dst(skb)->dev->dev_addr, ETH_ALEN);
264 /* Add ctf ipc entry for this direction */
265 if (IPVERSION_IS_4(ipver)) {
266 ipc_entry.tuple.sip[0] = iph->saddr;
267 ipc_entry.tuple.dip[0] = iph->daddr;
268 #ifdef CONFIG_IPV6
269 } else {
270 memcpy(ipc_entry.tuple.sip, &ip6h->saddr, sizeof(ipc_entry.tuple.sip));
271 memcpy(ipc_entry.tuple.dip, &ip6h->daddr, sizeof(ipc_entry.tuple.dip));
272 #endif /* CONFIG_IPV6 */
274 ipc_entry.tuple.proto = protocol;
275 ipc_entry.tuple.sp = tcph->source;
276 ipc_entry.tuple.dp = tcph->dest;
278 ipc_entry.next = NULL;
280 /* For vlan interfaces fill the vlan id and the tag/untag actions */
282 if(!CTFQOS_ULDL_DIFFIF(kcih)){
283 if (skb_dst(skb)->dev->priv_flags & IFF_802_1Q_VLAN) {
284 ipc_entry.txif = (void *)vlan_dev_real_dev(skb_dst(skb)->dev);
285 ipc_entry.vid = vlan_dev_vlan_id(skb_dst(skb)->dev);
286 ipc_entry.action = ((vlan_dev_vlan_flags(skb_dst(skb)->dev) & 1) ?
287 CTF_ACTION_TAG : CTF_ACTION_UNTAG);
288 } else {
289 ipc_entry.txif = skb_dst(skb)->dev;
290 ipc_entry.action = CTF_ACTION_UNTAG;
293 else{
294 ipc_entry.txif = skb_dst(skb)->dev;
295 ipc_entry.action = CTF_ACTION_UNTAG;
297 #ifdef CTF_PPPOE
298 const char *vars = NULL, *dev_name = NULL;
300 /* For pppoe interfaces fill the session id and header add/del actions */
301 if (skb_dst(skb)->dev->flags & IFF_POINTOPOINT) {
302 /* Transmit interface and sid will be populated by pppoe module */
303 ipc_entry.ppp_ifp = skb_dst(skb)->dev;
304 dev_name = skb_dst(skb)->dev->name;
305 } else if (skb->dev->flags & IFF_POINTOPOINT) {
306 ipc_entry.ppp_ifp = skb->dev;
307 dev_name = skb->dev->name;
308 } else{
309 ipc_entry.ppp_ifp = NULL;
310 ipc_entry.pppoe_sid = 0xffff;
313 if (ipc_entry.ppp_ifp){
314 struct net_device *pppox_tx_dev=NULL;
315 ctf_ppp_t ctfppp;
318 if (ppp_get_conn_pkt_info(ipc_entry.ppp_ifp,&ctfppp))
319 return;
320 else {
321 if(ctfppp.psk.pppox_protocol == PX_PROTO_OE){
322 if (skb_dst(skb)->dev->flags & IFF_POINTOPOINT) {
323 ipc_entry.action |= CTF_ACTION_PPPOE_ADD;
324 pppox_tx_dev = ctfppp.psk.po->pppoe_dev;
325 memcpy(ipc_entry.dhost.octet, ctfppp.psk.dhost.octet, ETH_ALEN);
326 memcpy(ipc_entry.shost.octet, ctfppp.psk.po->pppoe_dev->dev_addr, ETH_ALEN);
328 else{
329 ipc_entry.action |= CTF_ACTION_PPPOE_DEL;
331 ipc_entry.pppoe_sid = ctfppp.pppox_id;
333 else
334 return;
336 /* For vlan interfaces fill the vlan id and the tag/untag actions */
337 if(pppox_tx_dev){
338 if(!CTFQOS_ULDL_DIFFIF(kcih)){
339 if (pppox_tx_dev ->priv_flags & IFF_802_1Q_VLAN) {
340 ipc_entry.txif = (void *)vlan_dev_real_dev(pppox_tx_dev);
341 ipc_entry.vid = vlan_dev_vlan_id(pppox_tx_dev);
342 ipc_entry.action |= ((vlan_dev_vlan_flags(pppox_tx_dev) & 1) ?
343 CTF_ACTION_TAG : CTF_ACTION_UNTAG);
344 } else {
345 ipc_entry.txif = pppox_tx_dev;
346 ipc_entry.action |= CTF_ACTION_UNTAG;
349 else{
350 ipc_entry.txif = pppox_tx_dev;
351 ipc_entry.action |= CTF_ACTION_UNTAG;
357 #endif /* CTF_PPPOE */
359 if (kcih->ipc_suspend) {
360 /* The default action is suspend */
361 ipc_entry.action |= CTF_ACTION_SUSPEND;
364 /* Copy the DSCP value. ECN bits must be cleared. */
365 if (IPVERSION_IS_4(ipver))
366 ipc_entry.tos = IPV4_TOS(iph);
367 #ifdef CONFIG_IPV6
368 else
369 ipc_entry.tos = IPV6_TRAFFIC_CLASS(ip6h);
370 #endif /* CONFIG_IPV6 */
371 ipc_entry.tos &= IPV4_TOS_DSCP_MASK;
372 if (ipc_entry.tos)
373 ipc_entry.action |= CTF_ACTION_TOS;
375 #ifdef CONFIG_NF_CONNTRACK_MARK
376 /* Initialize the mark for this connection */
377 if (ct->mark != 0) {
378 ipc_entry.mark.value = ct->mark;
379 ipc_entry.action |= CTF_ACTION_MARK;
381 #endif /* CONFIG_NF_CONNTRACK_MARK */
383 /* Update the manip ip and port */
384 if (manip != NULL) {
385 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
386 ipc_entry.nat.ip = manip->src.u3.ip;
387 ipc_entry.nat.port = manip->src.u.tcp.port;
388 ipc_entry.action |= CTF_ACTION_SNAT;
389 } else {
390 ipc_entry.nat.ip = manip->dst.u3.ip;
391 ipc_entry.nat.port = manip->dst.u.tcp.port;
392 ipc_entry.action |= CTF_ACTION_DNAT;
396 /* Do bridge cache lookup to determine outgoing interface
397 * and any vlan tagging actions if needed.
399 if(!CTFQOS_ULDL_DIFFIF(kcih)){
400 if (ctf_isbridge(kcih, ipc_entry.txif)) {
401 ctf_brc_t *brcp;
403 brcp = ctf_brc_lkup(kcih, ipc_entry.dhost.octet);
405 if (brcp == NULL)
406 return;
407 else {
408 ipc_entry.action |= brcp->action;
409 ipc_entry.txif = brcp->txifp;
410 ipc_entry.vid = brcp->vid;
414 else{
415 if (ctf_isbridge(kcih, ipc_entry.txif)) {
416 ctf_brc_t *brcp;
418 brcp = ctf_brc_lkup(kcih, ipc_entry.dhost.octet);
420 if (brcp == NULL)
421 return;
422 else {
423 ipc_entry.action |= brcp->action;
424 if(brcp->txvifp){
425 ipc_entry.txif = brcp->txvifp;
426 ipc_entry.action &= ~CTF_ACTION_TAG;
427 ipc_entry.action |= CTF_ACTION_UNTAG;
429 else
430 ipc_entry.txif = brcp->txifp;
431 ipc_entry.vid = brcp->vid;
437 #ifdef DEBUG
438 if (IPVERSION_IS_4(ipver))
439 printk("%s: Adding ipc entry for [%d]%u.%u.%u.%u:%u - %u.%u.%u.%u:%u\n", __FUNCTION__,
440 ipc_entry.tuple.proto,
441 NIPQUAD(ipc_entry.tuple.sip[0]), ntohs(ipc_entry.tuple.sp),
442 NIPQUAD(ipc_entry.tuple.dip[0]), ntohs(ipc_entry.tuple.dp));
443 #ifdef CONFIG_IPV6
444 else
445 printk("\n%s: Adding ipc entry for [%d]\n"
446 "%08x.%08x.%08x.%08x:%u => %08x.%08x.%08x.%08x:%u\n",
447 __FUNCTION__, ipc_entry.tuple.proto,
448 ntohl(ipc_entry.tuple.sip[0]), ntohl(ipc_entry.tuple.sip[1]),
449 ntohl(ipc_entry.tuple.sip[2]), ntohl(ipc_entry.tuple.sip[3]),
450 ntohs(ipc_entry.tuple.sp),
451 ntohl(ipc_entry.tuple.dip[0]), ntohl(ipc_entry.tuple.dip[1]),
452 ntohl(ipc_entry.tuple.dip[2]), ntohl(ipc_entry.tuple.dip[3]),
453 ntohs(ipc_entry.tuple.dp));
454 #endif /* CONFIG_IPV6 */
455 printk("sa %02x:%02x:%02x:%02x:%02x:%02x\n",
456 ipc_entry.shost.octet[0], ipc_entry.shost.octet[1],
457 ipc_entry.shost.octet[2], ipc_entry.shost.octet[3],
458 ipc_entry.shost.octet[4], ipc_entry.shost.octet[5]);
459 printk("da %02x:%02x:%02x:%02x:%02x:%02x\n",
460 ipc_entry.dhost.octet[0], ipc_entry.dhost.octet[1],
461 ipc_entry.dhost.octet[2], ipc_entry.dhost.octet[3],
462 ipc_entry.dhost.octet[4], ipc_entry.dhost.octet[5]);
463 printk("[%d] vid: %d action %x\n", hooknum, ipc_entry.vid, ipc_entry.action);
464 if (manip != NULL)
465 printk("manip_ip: %u.%u.%u.%u manip_port %u\n",
466 NIPQUAD(ipc_entry.nat.ip), ntohs(ipc_entry.nat.port));
467 printk("txif: %s\n", ((struct net_device *)ipc_entry.txif)->name);
468 if (ipc_entry.ppp_ifp) printk("pppif: %s\n", ((struct net_device *)ipc_entry.ppp_ifp)->name);
469 #endif
471 ctf_ipc_add(kcih, &ipc_entry, !IPVERSION_IS_4(ipver));
473 /* Update the attributes flag to indicate a CTF conn */
474 ct->ctf_flags |= (CTF_FLAGS_CACHED | (1 << dir));
478 ip_conntrack_ipct_delete(struct nf_conn *ct, int ct_timeout)
480 ctf_ipc_t *ipct;
481 struct nf_conntrack_tuple *orig, *repl;
482 ctf_ipc_t orig_ipct, repl_ipct;
483 int ipaddr_sz;
484 bool v6;
486 if (!CTF_ENAB(kcih))
487 return (0);
489 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
491 if ((orig->dst.protonum != IPPROTO_TCP) && (orig->dst.protonum != IPPROTO_UDP))
492 return (0);
494 repl = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
496 #ifdef CONFIG_IPV6
497 v6 = (orig->src.l3num == AF_INET6);
498 ipaddr_sz = (v6) ? sizeof(struct in6_addr) : sizeof(struct in_addr);
499 #else
500 v6 = FALSE;
501 ipaddr_sz = sizeof(struct in_addr);
502 #endif /* CONFIG_IPV6 */
504 memset(&orig_ipct, 0, sizeof(orig_ipct));
505 memcpy(orig_ipct.tuple.sip, &orig->src.u3.ip, ipaddr_sz);
506 memcpy(orig_ipct.tuple.dip, &orig->dst.u3.ip, ipaddr_sz);
507 orig_ipct.tuple.proto = orig->dst.protonum;
508 orig_ipct.tuple.sp = orig->src.u.tcp.port;
509 orig_ipct.tuple.dp = orig->dst.u.tcp.port;
511 memset(&repl_ipct, 0, sizeof(repl_ipct));
512 memcpy(repl_ipct.tuple.sip, &repl->src.u3.ip, ipaddr_sz);
513 memcpy(repl_ipct.tuple.dip, &repl->dst.u3.ip, ipaddr_sz);
514 repl_ipct.tuple.proto = repl->dst.protonum;
515 repl_ipct.tuple.sp = repl->src.u.tcp.port;
516 repl_ipct.tuple.dp = repl->dst.u.tcp.port;
518 /* If the refresh counter of ipc entry is non zero, it indicates
519 * that the packet transfer is active and we should not delete
520 * the conntrack entry.
522 if (ct_timeout) {
523 ipct = ctf_ipc_lkup(kcih, &orig_ipct, v6);
525 /* Postpone the deletion of ct entry if there are frames
526 * flowing in this direction.
528 if ((ipct != NULL) && (ipct->live > 0)) {
529 ipct->live = 0;
530 ct->timeout.expires = jiffies + ct->expire_jiffies;
531 add_timer(&ct->timeout);
532 return (-1);
535 ipct = ctf_ipc_lkup(kcih, &repl_ipct, v6);
537 if ((ipct != NULL) && (ipct->live > 0)) {
538 ipct->live = 0;
539 ct->timeout.expires = jiffies + ct->expire_jiffies;
540 add_timer(&ct->timeout);
541 return (-1);
545 /* If there are no packets over this connection for timeout period
546 * delete the entries.
548 ctf_ipc_delete(kcih, &orig_ipct, v6);
550 ctf_ipc_delete(kcih, &repl_ipct, v6);
552 #ifdef DEBUG
553 printk("%s: Deleting the tuple %x %x %d %d %d\n",
554 __FUNCTION__, orig->src.u3.ip, orig->dst.u3.ip, orig->dst.protonum,
555 orig->src.u.tcp.port, orig->dst.u.tcp.port);
556 printk("%s: Deleting the tuple %x %x %d %d %d\n",
557 __FUNCTION__, repl->dst.u3.ip, repl->src.u3.ip, repl->dst.protonum,
558 repl->dst.u.tcp.port, repl->src.u.tcp.port);
559 #endif
561 return (0);
563 #endif /* HNDCTF */
566 static int nf_conntrack_hash_rnd_initted;
567 static unsigned int nf_conntrack_hash_rnd;
569 static u_int32_t BCMFASTPATH_HOST __hash_conntrack(const struct nf_conntrack_tuple *tuple,
570 u16 zone, unsigned int size, unsigned int rnd)
572 unsigned int n;
573 u_int32_t h;
575 /* The direction must be ignored, so we hash everything up to the
576 * destination ports (which is a multiple of 4) and treat the last
577 * three bytes manually.
579 n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
580 h = jhash2((u32 *)tuple, n,
581 zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
582 tuple->dst.protonum));
584 return ((u64)h * size) >> 32;
587 static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
588 const struct nf_conntrack_tuple *tuple)
590 return __hash_conntrack(tuple, zone, net->ct.htable_size,
591 nf_conntrack_hash_rnd);
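/*
 * Illustrative note (not part of the original file): __hash_conntrack()
 * maps the 32-bit jhash2() result onto a bucket with a multiply-shift
 * rather than a modulo, since ((u64)h * size) >> 32 == floor(h * size / 2^32)
 * and therefore always falls in [0, size). A hypothetical stand-alone helper
 * showing just that scaling step:
 *
 *	static inline u32 scale_hash_to_bucket(u32 h, u32 size)
 *	{
 *		return ((u64)h * size) >> 32;
 *	}
 *
 * e.g. h = 0x80000000 with size = 4096 selects bucket 2048.
 */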
594 bool
595 nf_ct_get_tuple(const struct sk_buff *skb,
596 unsigned int nhoff,
597 unsigned int dataoff,
598 u_int16_t l3num,
599 u_int8_t protonum,
600 struct nf_conntrack_tuple *tuple,
601 const struct nf_conntrack_l3proto *l3proto,
602 const struct nf_conntrack_l4proto *l4proto)
604 memset(tuple, 0, sizeof(*tuple));
606 tuple->src.l3num = l3num;
607 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
608 return false;
610 tuple->dst.protonum = protonum;
611 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
613 return l4proto->pkt_to_tuple(skb, dataoff, tuple);
615 EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
617 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
618 u_int16_t l3num, struct nf_conntrack_tuple *tuple)
620 struct nf_conntrack_l3proto *l3proto;
621 struct nf_conntrack_l4proto *l4proto;
622 unsigned int protoff;
623 u_int8_t protonum;
624 int ret;
626 rcu_read_lock();
628 l3proto = __nf_ct_l3proto_find(l3num);
629 ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
630 if (ret != NF_ACCEPT) {
631 rcu_read_unlock();
632 return false;
635 l4proto = __nf_ct_l4proto_find(l3num, protonum);
637 ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
638 l3proto, l4proto);
640 rcu_read_unlock();
641 return ret;
643 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
645 bool
646 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
647 const struct nf_conntrack_tuple *orig,
648 const struct nf_conntrack_l3proto *l3proto,
649 const struct nf_conntrack_l4proto *l4proto)
651 memset(inverse, 0, sizeof(*inverse));
653 inverse->src.l3num = orig->src.l3num;
654 if (l3proto->invert_tuple(inverse, orig) == 0)
655 return false;
657 inverse->dst.dir = !orig->dst.dir;
659 inverse->dst.protonum = orig->dst.protonum;
660 return l4proto->invert_tuple(inverse, orig);
662 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
664 static void
665 clean_from_lists(struct nf_conn *ct)
667 pr_debug("clean_from_lists(%p)\n", ct);
668 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
669 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
671 /* Destroy all pending expectations */
672 nf_ct_remove_expectations(ct);
675 static void
676 destroy_conntrack(struct nf_conntrack *nfct)
678 struct nf_conn *ct = (struct nf_conn *)nfct;
679 struct net *net = nf_ct_net(ct);
680 struct nf_conntrack_l4proto *l4proto;
682 pr_debug("destroy_conntrack(%p)\n", ct);
683 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
684 NF_CT_ASSERT(!timer_pending(&ct->timeout));
686 #ifdef HNDCTF
687 ip_conntrack_ipct_delete(ct, 0);
688 #endif /* HNDCTF*/
689 /* To make sure we don't get any weird locking issues here:
690 * destroy_conntrack() MUST NOT be called with a write lock
691 * to nf_conntrack_lock!!! -HW */
692 rcu_read_lock();
693 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
694 if (l4proto && l4proto->destroy)
695 l4proto->destroy(ct);
697 rcu_read_unlock();
699 spin_lock_bh(&nf_conntrack_lock);
700 /* Expectations will have been removed in clean_from_lists,
701 * except TFTP can create an expectation on the first packet,
702 * before connection is in the list, so we need to clean here,
703 * too. */
704 nf_ct_remove_expectations(ct);
706 /* We overload first tuple to link into unconfirmed list. */
707 if (!nf_ct_is_confirmed(ct)) {
708 BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
709 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
712 NF_CT_STAT_INC(net, delete);
713 spin_unlock_bh(&nf_conntrack_lock);
715 if (ct->master)
716 nf_ct_put(ct->master);
718 pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
719 nf_conntrack_free(ct);
722 void nf_ct_delete_from_lists(struct nf_conn *ct)
724 struct net *net = nf_ct_net(ct);
726 nf_ct_helper_destroy(ct);
727 spin_lock_bh(&nf_conntrack_lock);
728 /* Inside lock so preempt is disabled on module removal path.
729 * Otherwise we can get spurious warnings. */
730 NF_CT_STAT_INC(net, delete_list);
731 clean_from_lists(ct);
732 spin_unlock_bh(&nf_conntrack_lock);
734 EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
736 static void death_by_event(unsigned long ul_conntrack)
738 struct nf_conn *ct = (void *)ul_conntrack;
739 struct net *net = nf_ct_net(ct);
741 if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
742 /* bad luck, let's retry again */
743 ct->timeout.expires = jiffies +
744 (random32() % net->ct.sysctl_events_retry_timeout);
745 add_timer(&ct->timeout);
746 return;
748 /* we've got the event delivered, now it's dying */
749 set_bit(IPS_DYING_BIT, &ct->status);
750 spin_lock(&nf_conntrack_lock);
751 hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
752 spin_unlock(&nf_conntrack_lock);
753 nf_ct_put(ct);
756 void nf_ct_insert_dying_list(struct nf_conn *ct)
758 struct net *net = nf_ct_net(ct);
760 /* add this conntrack to the dying list */
761 spin_lock_bh(&nf_conntrack_lock);
762 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
763 &net->ct.dying);
764 spin_unlock_bh(&nf_conntrack_lock);
765 /* set a new timer to retry event delivery */
766 setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
767 ct->timeout.expires = jiffies +
768 (random32() % net->ct.sysctl_events_retry_timeout);
769 add_timer(&ct->timeout);
771 EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
773 static void death_by_timeout(unsigned long ul_conntrack)
775 struct nf_conn *ct = (void *)ul_conntrack;
776 #ifdef HNDCTF
777 /* If negative error is returned it means the entry hasn't
778 * timed out yet.
780 if (ip_conntrack_ipct_delete(ct, jiffies >= ct->timeout.expires ? 1 : 0) != 0)
781 return;
782 #endif /* HNDCTF */
784 if (!test_bit(IPS_DYING_BIT, &ct->status) &&
785 unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
786 /* destroy event was not delivered */
787 nf_ct_delete_from_lists(ct);
788 nf_ct_insert_dying_list(ct);
789 return;
791 set_bit(IPS_DYING_BIT, &ct->status);
792 nf_ct_delete_from_lists(ct);
793 nf_ct_put(ct);
797 * Warning :
798 * - Caller must take a reference on returned object
799 * and recheck nf_ct_tuple_equal(tuple, &h->tuple)
800 * OR
801 * - Caller must lock nf_conntrack_lock before calling this function
803 struct nf_conntrack_tuple_hash * BCMFASTPATH_HOST
804 __nf_conntrack_find(struct net *net, u16 zone,
805 const struct nf_conntrack_tuple *tuple)
807 struct nf_conntrack_tuple_hash *h;
808 struct hlist_nulls_node *n;
809 unsigned int hash = hash_conntrack(net, zone, tuple);
811 /* Disable BHs the entire time since we normally need to disable them
812 * at least once for the stats anyway.
814 local_bh_disable();
815 begin:
816 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
817 if (nf_ct_tuple_equal(tuple, &h->tuple) &&
818 nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
819 NF_CT_STAT_INC(net, found);
820 local_bh_enable();
821 return h;
823 NF_CT_STAT_INC(net, searched);
826 * if the nulls value we got at the end of this lookup is
827 * not the expected one, we must restart lookup.
828 * We probably met an item that was moved to another chain.
830 if (get_nulls_value(n) != hash) {
831 NF_CT_STAT_INC(net, search_restart);
832 goto begin;
834 local_bh_enable();
836 return NULL;
838 EXPORT_SYMBOL_GPL(__nf_conntrack_find);
840 /* Find a connection corresponding to a tuple. */
841 struct nf_conntrack_tuple_hash * BCMFASTPATH_HOST
842 nf_conntrack_find_get(struct net *net, u16 zone,
843 const struct nf_conntrack_tuple *tuple)
845 struct nf_conntrack_tuple_hash *h;
846 struct nf_conn *ct;
848 rcu_read_lock();
849 begin:
850 h = __nf_conntrack_find(net, zone, tuple);
851 if (h) {
852 ct = nf_ct_tuplehash_to_ctrack(h);
853 if (unlikely(nf_ct_is_dying(ct) ||
854 !atomic_inc_not_zero(&ct->ct_general.use)))
855 h = NULL;
856 else {
857 if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
858 nf_ct_zone(ct) != zone)) {
859 nf_ct_put(ct);
860 goto begin;
864 rcu_read_unlock();
866 return h;
868 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
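/*
 * Usage sketch (hypothetical caller, not taken from this file):
 * nf_conntrack_find_get() returns the tuple hash with an elevated
 * reference count, so every successful lookup is paired with nf_ct_put():
 *
 *	struct nf_conntrack_tuple_hash *h;
 *	struct nf_conn *ct;
 *
 *	h = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, &tuple);
 *	if (h != NULL) {
 *		ct = nf_ct_tuplehash_to_ctrack(h);
 *		... inspect ct, then drop the reference ...
 *		nf_ct_put(ct);
 *	}
 */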
870 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
871 unsigned int hash,
872 unsigned int repl_hash)
874 struct net *net = nf_ct_net(ct);
876 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
877 &net->ct.hash[hash]);
878 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
879 &net->ct.hash[repl_hash]);
882 void nf_conntrack_hash_insert(struct nf_conn *ct)
884 struct net *net = nf_ct_net(ct);
885 unsigned int hash, repl_hash;
886 u16 zone;
888 zone = nf_ct_zone(ct);
889 hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
890 repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
892 __nf_conntrack_hash_insert(ct, hash, repl_hash);
894 EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
896 /* Confirm a connection given skb; places it in hash table */
898 __nf_conntrack_confirm(struct sk_buff *skb)
900 unsigned int hash, repl_hash;
901 struct nf_conntrack_tuple_hash *h;
902 struct nf_conn *ct;
903 struct nf_conn_help *help;
904 struct hlist_nulls_node *n;
905 enum ip_conntrack_info ctinfo;
906 struct net *net;
907 u16 zone;
909 ct = nf_ct_get(skb, &ctinfo);
910 net = nf_ct_net(ct);
912 /* ipt_REJECT uses nf_conntrack_attach to attach related
913 ICMP/TCP RST packets in other direction. Actual packet
914 which created connection will be IP_CT_NEW or for an
915 expected connection, IP_CT_RELATED. */
916 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
917 return NF_ACCEPT;
919 zone = nf_ct_zone(ct);
920 hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
921 repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
923 /* We're not in hash table, and we refuse to set up related
924 connections for unconfirmed conns. But packet copies and
925 REJECT will give spurious warnings here. */
926 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
928 /* No external references means noone else could have
929 confirmed us. */
930 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
931 pr_debug("Confirming conntrack %p\n", ct);
933 spin_lock_bh(&nf_conntrack_lock);
935 /* We have to check the DYING flag inside the lock to prevent
936 a race against nf_ct_get_next_corpse() possibly called from
937 user context, else we insert an already 'dead' hash, blocking
938 further use of that particular connection -JM */
940 if (unlikely(nf_ct_is_dying(ct))) {
941 spin_unlock_bh(&nf_conntrack_lock);
942 return NF_ACCEPT;
945 /* See if there's one in the list already, including reverse:
946 NAT could have grabbed it without realizing, since we're
947 not in the hash. If there is, we lost race. */
948 hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
949 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
950 &h->tuple) &&
951 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
952 goto out;
953 hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
954 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
955 &h->tuple) &&
956 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
957 goto out;
959 /* Remove from unconfirmed list */
960 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
962 /* Timer relative to confirmation time, not original
963 setting time, otherwise we'd get timer wrap in
964 weird delay cases. */
965 ct->timeout.expires += jiffies;
966 add_timer(&ct->timeout);
967 atomic_inc(&ct->ct_general.use);
968 set_bit(IPS_CONFIRMED_BIT, &ct->status);
970 /* Since the lookup is lockless, hash insertion must be done after
971 * starting the timer and setting the CONFIRMED bit. The RCU barriers
972 * guarantee that no other CPU can find the conntrack before the above
973 * stores are visible.
975 __nf_conntrack_hash_insert(ct, hash, repl_hash);
976 NF_CT_STAT_INC(net, insert);
977 spin_unlock_bh(&nf_conntrack_lock);
979 help = nfct_help(ct);
980 if (help && help->helper)
981 nf_conntrack_event_cache(IPCT_HELPER, ct);
983 nf_conntrack_event_cache(master_ct(ct) ?
984 IPCT_RELATED : IPCT_NEW, ct);
985 return NF_ACCEPT;
987 out:
988 NF_CT_STAT_INC(net, insert_failed);
989 spin_unlock_bh(&nf_conntrack_lock);
990 return NF_DROP;
992 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
994 /* Returns true if a connection correspondings to the tuple (required
995 for NAT). */
997 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
998 const struct nf_conn *ignored_conntrack)
1000 struct net *net = nf_ct_net(ignored_conntrack);
1001 struct nf_conntrack_tuple_hash *h;
1002 struct hlist_nulls_node *n;
1003 struct nf_conn *ct;
1004 u16 zone = nf_ct_zone(ignored_conntrack);
1005 unsigned int hash = hash_conntrack(net, zone, tuple);
1007 /* Disable BHs the entire time since we need to disable them at
1008 * least once for the stats anyway.
1010 rcu_read_lock_bh();
1011 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
1012 ct = nf_ct_tuplehash_to_ctrack(h);
1013 if (ct != ignored_conntrack &&
1014 nf_ct_tuple_equal(tuple, &h->tuple) &&
1015 nf_ct_zone(ct) == zone) {
1016 NF_CT_STAT_INC(net, found);
1017 rcu_read_unlock_bh();
1018 return 1;
1020 NF_CT_STAT_INC(net, searched);
1022 rcu_read_unlock_bh();
1024 return 0;
1026 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
1028 #define NF_CT_EVICTION_RANGE 8
1030 /* There's a small race here where we may free a just-assured
1031 connection. Too bad: we're in trouble anyway. */
1032 static noinline int early_drop(struct net *net, unsigned int hash)
1034 /* Use oldest entry, which is roughly LRU */
1035 struct nf_conntrack_tuple_hash *h;
1036 struct nf_conn *ct = NULL, *tmp;
1037 struct hlist_nulls_node *n;
1038 unsigned int i, cnt = 0;
1039 int dropped = 0;
1041 rcu_read_lock();
1042 for (i = 0; i < net->ct.htable_size; i++) {
1043 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
1044 hnnode) {
1045 tmp = nf_ct_tuplehash_to_ctrack(h);
1046 if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
1047 ct = tmp;
1048 cnt++;
1051 if (ct != NULL) {
1052 if (likely(!nf_ct_is_dying(ct) &&
1053 atomic_inc_not_zero(&ct->ct_general.use)))
1054 break;
1055 else
1056 ct = NULL;
1059 if (cnt >= NF_CT_EVICTION_RANGE)
1060 break;
1062 hash = (hash + 1) % net->ct.htable_size;
1064 rcu_read_unlock();
1066 if (!ct)
1067 return dropped;
1069 #ifdef HNDCTF
1070 ip_conntrack_ipct_delete(ct, 0);
1071 #endif /* HNDCTF */
1073 if (del_timer(&ct->timeout)) {
1074 death_by_timeout((unsigned long)ct);
1075 dropped = 1;
1076 NF_CT_STAT_INC_ATOMIC(net, early_drop);
1078 nf_ct_put(ct);
1079 return dropped;
1082 struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
1083 const struct nf_conntrack_tuple *orig,
1084 const struct nf_conntrack_tuple *repl,
1085 gfp_t gfp)
1087 struct nf_conn *ct;
1089 if (unlikely(!nf_conntrack_hash_rnd_initted)) {
1090 get_random_bytes(&nf_conntrack_hash_rnd,
1091 sizeof(nf_conntrack_hash_rnd));
1092 nf_conntrack_hash_rnd_initted = 1;
1095 /* We don't want any race condition at early drop stage */
1096 atomic_inc(&net->ct.count);
1098 if (nf_conntrack_max &&
1099 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
1100 unsigned int hash = hash_conntrack(net, zone, orig);
1101 if (!early_drop(net, hash)) {
1102 atomic_dec(&net->ct.count);
1103 if (net_ratelimit())
1104 printk(KERN_WARNING
1105 "nf_conntrack: table full, dropping"
1106 " packet.\n");
1107 return ERR_PTR(-ENOMEM);
1112 * Do not use kmem_cache_zalloc(), as this cache uses
1113 * SLAB_DESTROY_BY_RCU.
1115 ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
1116 if (ct == NULL) {
1117 pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
1118 atomic_dec(&net->ct.count);
1119 return ERR_PTR(-ENOMEM);
1122 * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next
1123 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
1125 memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
1126 sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
1127 spin_lock_init(&ct->lock);
1128 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1129 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1130 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1131 ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
1132 /* Don't set timer yet: wait for confirmation */
1133 setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
1134 write_pnet(&ct->ct_net, net);
1135 #ifdef CONFIG_NF_CONNTRACK_ZONES
1136 if (zone) {
1137 struct nf_conntrack_zone *nf_ct_zone;
1139 nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
1140 if (!nf_ct_zone)
1141 goto out_free;
1142 nf_ct_zone->id = zone;
1144 #endif
1146 * changes to lookup keys must be done before setting refcnt to 1
1148 smp_wmb();
1149 atomic_set(&ct->ct_general.use, 1);
1150 return ct;
1152 #ifdef CONFIG_NF_CONNTRACK_ZONES
1153 out_free:
1154 kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
1155 return ERR_PTR(-ENOMEM);
1156 #endif
1158 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1160 void nf_conntrack_free(struct nf_conn *ct)
1162 struct net *net = nf_ct_net(ct);
1164 nf_ct_ext_destroy(ct);
1165 atomic_dec(&net->ct.count);
1166 nf_ct_ext_free(ct);
1167 kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
1169 EXPORT_SYMBOL_GPL(nf_conntrack_free);
1171 /* Allocate a new conntrack: we return -ENOMEM if classification
1172 failed due to stress. Otherwise it really is unclassifiable. */
1173 static struct nf_conntrack_tuple_hash *
1174 init_conntrack(struct net *net, struct nf_conn *tmpl,
1175 const struct nf_conntrack_tuple *tuple,
1176 struct nf_conntrack_l3proto *l3proto,
1177 struct nf_conntrack_l4proto *l4proto,
1178 struct sk_buff *skb,
1179 unsigned int dataoff)
1181 struct nf_conn *ct;
1182 struct nf_conn_help *help;
1183 struct nf_conntrack_tuple repl_tuple;
1184 struct nf_conntrack_ecache *ecache;
1185 struct nf_conntrack_expect *exp;
1186 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
1188 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
1189 pr_debug("Can't invert tuple.\n");
1190 return NULL;
1193 ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
1194 if (IS_ERR(ct)) {
1195 pr_debug("Can't allocate conntrack.\n");
1196 return (struct nf_conntrack_tuple_hash *)ct;
1199 if (!l4proto->new(ct, skb, dataoff)) {
1200 nf_conntrack_free(ct);
1201 pr_debug("init conntrack: can't track with proto module\n");
1202 return NULL;
1205 nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1207 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1208 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1209 ecache ? ecache->expmask : 0,
1210 GFP_ATOMIC);
1212 spin_lock_bh(&nf_conntrack_lock);
1213 exp = nf_ct_find_expectation(net, zone, tuple);
1214 if (exp) {
1215 pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
1216 ct, exp);
1217 /* Welcome, Mr. Bond. We've been expecting you... */
1218 __set_bit(IPS_EXPECTED_BIT, &ct->status);
1219 ct->master = exp->master;
1220 if (exp->helper) {
1221 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1222 if (help)
1223 rcu_assign_pointer(help->helper, exp->helper);
1226 #ifdef CONFIG_NF_CONNTRACK_MARK
1227 ct->mark = exp->master->mark;
1228 #endif
1229 #ifdef CONFIG_NF_CONNTRACK_SECMARK
1230 ct->secmark = exp->master->secmark;
1231 #endif
1232 nf_conntrack_get(&ct->master->ct_general);
1233 NF_CT_STAT_INC(net, expect_new);
1234 } else {
1235 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1236 NF_CT_STAT_INC(net, new);
1239 /* Overload tuple linked list to put us in unconfirmed list. */
1240 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
1241 &net->ct.unconfirmed);
1243 spin_unlock_bh(&nf_conntrack_lock);
1245 if (exp) {
1246 if (exp->expectfn)
1247 exp->expectfn(ct, exp);
1248 nf_ct_expect_put(exp);
1251 return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1254 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1255 static inline struct nf_conn *
1256 resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1257 struct sk_buff *skb,
1258 unsigned int dataoff,
1259 u_int16_t l3num,
1260 u_int8_t protonum,
1261 struct nf_conntrack_l3proto *l3proto,
1262 struct nf_conntrack_l4proto *l4proto,
1263 int *set_reply,
1264 enum ip_conntrack_info *ctinfo)
1266 struct nf_conntrack_tuple tuple;
1267 struct nf_conntrack_tuple_hash *h;
1268 struct nf_conn *ct;
1269 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
1271 if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1272 dataoff, l3num, protonum, &tuple, l3proto,
1273 l4proto)) {
1274 pr_debug("resolve_normal_ct: Can't get tuple\n");
1275 return NULL;
1278 /* look for tuple match */
1279 h = nf_conntrack_find_get(net, zone, &tuple);
1280 if (!h) {
1281 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
1282 skb, dataoff);
1283 if (!h)
1284 return NULL;
1285 if (IS_ERR(h))
1286 return (void *)h;
1288 ct = nf_ct_tuplehash_to_ctrack(h);
1290 /* It exists; we have (non-exclusive) reference. */
1291 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1292 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1293 /* Please set reply bit if this packet OK */
1294 *set_reply = 1;
1295 } else {
1296 /* Once we've had two way comms, always ESTABLISHED. */
1297 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1298 pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
1299 *ctinfo = IP_CT_ESTABLISHED;
1300 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1301 pr_debug("nf_conntrack_in: related packet for %p\n",
1302 ct);
1303 *ctinfo = IP_CT_RELATED;
1304 } else {
1305 pr_debug("nf_conntrack_in: new packet for %p\n", ct);
1306 *ctinfo = IP_CT_NEW;
1308 *set_reply = 0;
1310 skb->nfct = &ct->ct_general;
1311 skb->nfctinfo = *ctinfo;
1312 return ct;
1315 unsigned int BCMFASTPATH_HOST
1316 nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1317 struct sk_buff *skb)
1319 struct nf_conn *ct, *tmpl = NULL;
1320 enum ip_conntrack_info ctinfo;
1321 struct nf_conntrack_l3proto *l3proto;
1322 struct nf_conntrack_l4proto *l4proto;
1323 unsigned int dataoff;
1324 u_int8_t protonum;
1325 int set_reply = 0;
1326 int ret;
1328 if (skb->nfct) {
1329 /* Previously seen (loopback or untracked)? Ignore. */
1330 tmpl = (struct nf_conn *)skb->nfct;
1331 if (!nf_ct_is_template(tmpl)) {
1332 NF_CT_STAT_INC_ATOMIC(net, ignore);
1333 return NF_ACCEPT;
1335 skb->nfct = NULL;
1338 /* rcu_read_lock()ed by nf_hook_slow */
1339 l3proto = __nf_ct_l3proto_find(pf);
1340 ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
1341 &dataoff, &protonum);
1342 if (ret <= 0) {
1343 pr_debug("not prepared to track yet or error occured\n");
1344 NF_CT_STAT_INC_ATOMIC(net, error);
1345 NF_CT_STAT_INC_ATOMIC(net, invalid);
1346 ret = -ret;
1347 goto out;
1350 l4proto = __nf_ct_l4proto_find(pf, protonum);
1352 /* It may be an special packet, error, unclean...
1353 * inverse of the return code tells to the netfilter
1354 * core what to do with the packet. */
1355 if (l4proto->error != NULL) {
1356 ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
1357 pf, hooknum);
1358 if (ret <= 0) {
1359 NF_CT_STAT_INC_ATOMIC(net, error);
1360 NF_CT_STAT_INC_ATOMIC(net, invalid);
1361 ret = -ret;
1362 goto out;
1366 ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
1367 l3proto, l4proto, &set_reply, &ctinfo);
1368 if (!ct) {
1369 /* Not valid part of a connection */
1370 NF_CT_STAT_INC_ATOMIC(net, invalid);
1371 ret = NF_ACCEPT;
1372 goto out;
1375 if (IS_ERR(ct)) {
1376 /* Too stressed to deal. */
1377 NF_CT_STAT_INC_ATOMIC(net, drop);
1378 ret = NF_DROP;
1379 goto out;
1382 NF_CT_ASSERT(skb->nfct);
1384 ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
1385 if (ret <= 0) {
1386 /* Invalid: inverse of the return code tells
1387 * the netfilter core what to do */
1388 pr_debug("nf_conntrack_in: Can't track with proto module\n");
1389 nf_conntrack_put(skb->nfct);
1390 skb->nfct = NULL;
1391 NF_CT_STAT_INC_ATOMIC(net, invalid);
1392 if (ret == -NF_DROP)
1393 NF_CT_STAT_INC_ATOMIC(net, drop);
1394 ret = -ret;
1395 goto out;
1398 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1399 nf_conntrack_event_cache(IPCT_REPLY, ct);
1400 out:
1401 if (tmpl)
1402 nf_ct_put(tmpl);
1404 return ret;
1406 EXPORT_SYMBOL_GPL(nf_conntrack_in);
1408 bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1409 const struct nf_conntrack_tuple *orig)
1411 bool ret;
1413 rcu_read_lock();
1414 ret = nf_ct_invert_tuple(inverse, orig,
1415 __nf_ct_l3proto_find(orig->src.l3num),
1416 __nf_ct_l4proto_find(orig->src.l3num,
1417 orig->dst.protonum));
1418 rcu_read_unlock();
1419 return ret;
1421 EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
1423 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1424 implicitly racy: see __nf_conntrack_confirm */
1425 void nf_conntrack_alter_reply(struct nf_conn *ct,
1426 const struct nf_conntrack_tuple *newreply)
1428 struct nf_conn_help *help = nfct_help(ct);
1430 /* Should be unconfirmed, so not in hash table yet */
1431 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
1433 pr_debug("Altering reply tuple of %p to ", ct);
1434 nf_ct_dump_tuple(newreply);
1436 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1437 if (ct->master || (help && !hlist_empty(&help->expectations)))
1438 return;
1440 rcu_read_lock();
1441 __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
1442 rcu_read_unlock();
1444 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
1446 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1447 void __nf_ct_refresh_acct(struct nf_conn *ct,
1448 enum ip_conntrack_info ctinfo,
1449 const struct sk_buff *skb,
1450 unsigned long extra_jiffies,
1451 int do_acct)
1453 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1454 NF_CT_ASSERT(skb);
1456 /* Only update if this is not a fixed timeout */
1457 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1458 goto acct;
1460 /* If not in hash table, timer will not be active yet */
1461 if (!nf_ct_is_confirmed(ct)) {
1462 #ifdef HNDCTF
1463 ct->expire_jiffies = extra_jiffies;
1464 #endif /* HNDCTF */
1465 ct->timeout.expires = extra_jiffies;
1466 } else {
1467 unsigned long newtime = jiffies + extra_jiffies;
1469 /* Only update the timeout if the new timeout is at least
1470 HZ jiffies from the old timeout. Need del_timer for race
1471 avoidance (may already be dying). */
1472 if (newtime - ct->timeout.expires >= HZ)
1473 #ifdef HNDCTF
1474 ct->expire_jiffies = extra_jiffies;
1475 #endif /* HNDCTF */
1476 mod_timer_pending(&ct->timeout, newtime);
1479 acct:
1480 if (do_acct) {
1481 struct nf_conn_counter *acct;
1483 acct = nf_conn_acct_find(ct);
1484 if (acct) {
1485 spin_lock_bh(&ct->lock);
1486 acct[CTINFO2DIR(ctinfo)].packets++;
1487 acct[CTINFO2DIR(ctinfo)].bytes += skb->len;
1488 spin_unlock_bh(&ct->lock);
1492 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
1494 bool __nf_ct_kill_acct(struct nf_conn *ct,
1495 enum ip_conntrack_info ctinfo,
1496 const struct sk_buff *skb,
1497 int do_acct)
1499 if (do_acct) {
1500 struct nf_conn_counter *acct;
1502 acct = nf_conn_acct_find(ct);
1503 if (acct) {
1504 spin_lock_bh(&ct->lock);
1505 acct[CTINFO2DIR(ctinfo)].packets++;
1506 acct[CTINFO2DIR(ctinfo)].bytes +=
1507 skb->len - skb_network_offset(skb);
1508 spin_unlock_bh(&ct->lock);
1512 if (del_timer(&ct->timeout)) {
1513 ct->timeout.function((unsigned long)ct);
1514 return true;
1516 return false;
1518 EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
1520 #ifdef CONFIG_NF_CONNTRACK_ZONES
1521 static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
1522 .len = sizeof(struct nf_conntrack_zone),
1523 .align = __alignof__(struct nf_conntrack_zone),
1524 .id = NF_CT_EXT_ZONE,
1526 #endif
1528 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
1530 #include <linux/netfilter/nfnetlink.h>
1531 #include <linux/netfilter/nfnetlink_conntrack.h>
1532 #include <linux/mutex.h>
1534 /* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1535 * in ip_conntrack_core, since we don't want the protocols to autoload
1536 * or depend on ctnetlink */
1537 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
1538 const struct nf_conntrack_tuple *tuple)
1540 NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
1541 NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
1542 return 0;
1544 nla_put_failure:
1545 return -1;
1547 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
1549 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1550 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 },
1551 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 },
1553 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
1555 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
1556 struct nf_conntrack_tuple *t)
1558 if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
1559 return -EINVAL;
1561 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1562 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
1564 return 0;
1566 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
1568 int nf_ct_port_nlattr_tuple_size(void)
1570 return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1572 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
1573 #endif
1575 /* Used by ipt_REJECT and ip6t_REJECT. */
1576 static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1578 struct nf_conn *ct;
1579 enum ip_conntrack_info ctinfo;
1581 /* This ICMP is in reverse direction to the packet which caused it */
1582 ct = nf_ct_get(skb, &ctinfo);
1583 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1584 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1585 else
1586 ctinfo = IP_CT_RELATED;
1588 /* Attach to new skbuff, and increment count */
1589 nskb->nfct = &ct->ct_general;
1590 nskb->nfctinfo = ctinfo;
1591 nf_conntrack_get(nskb->nfct);
1594 /* Bring out ya dead! */
1595 static struct nf_conn *
1596 get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
1597 void *data, unsigned int *bucket)
1599 struct nf_conntrack_tuple_hash *h;
1600 struct nf_conn *ct;
1601 struct hlist_nulls_node *n;
1603 spin_lock_bh(&nf_conntrack_lock);
1604 for (; *bucket < net->ct.htable_size; (*bucket)++) {
1605 hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
1606 ct = nf_ct_tuplehash_to_ctrack(h);
1607 if (iter(ct, data))
1608 goto found;
1611 hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
1612 ct = nf_ct_tuplehash_to_ctrack(h);
1613 if (iter(ct, data))
1614 set_bit(IPS_DYING_BIT, &ct->status);
1616 spin_unlock_bh(&nf_conntrack_lock);
1617 return NULL;
1618 found:
1619 atomic_inc(&ct->ct_general.use);
1620 spin_unlock_bh(&nf_conntrack_lock);
1621 return ct;
1624 void nf_ct_iterate_cleanup(struct net *net,
1625 int (*iter)(struct nf_conn *i, void *data),
1626 void *data)
1628 struct nf_conn *ct;
1629 unsigned int bucket = 0;
1631 while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
1632 #ifdef HNDCTF
1633 ip_conntrack_ipct_delete(ct, 0);
1634 #endif /* HNDCTF */
1635 /* Time to push up daises... */
1636 if (del_timer(&ct->timeout))
1637 death_by_timeout((unsigned long)ct);
1638 /* ... else the timer will get him soon. */
1640 nf_ct_put(ct);
1643 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
1645 struct __nf_ct_flush_report {
1646 u32 pid;
1647 int report;
1650 static int kill_report(struct nf_conn *i, void *data)
1652 struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
1654 /* If we fail to deliver the event, death_by_timeout() will retry */
1655 if (nf_conntrack_event_report(IPCT_DESTROY, i,
1656 fr->pid, fr->report) < 0)
1657 return 1;
1659 /* Avoid the delivery of the destroy event in death_by_timeout(). */
1660 set_bit(IPS_DYING_BIT, &i->status);
1661 return 1;
1664 static int kill_all(struct nf_conn *i, void *data)
1666 return 1;
1669 void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
1671 if (vmalloced)
1672 vfree(hash);
1673 else
1674 free_pages((unsigned long)hash,
1675 get_order(sizeof(struct hlist_head) * size));
1677 EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
1679 void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
1681 struct __nf_ct_flush_report fr = {
1682 .pid = pid,
1683 .report = report,
1685 nf_ct_iterate_cleanup(net, kill_report, &fr);
1687 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
1689 static void nf_ct_release_dying_list(struct net *net)
1691 struct nf_conntrack_tuple_hash *h;
1692 struct nf_conn *ct;
1693 struct hlist_nulls_node *n;
1695 spin_lock_bh(&nf_conntrack_lock);
1696 hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
1697 ct = nf_ct_tuplehash_to_ctrack(h);
1698 /* never fails to remove them, no listeners at this point */
1699 nf_ct_kill(ct);
1701 spin_unlock_bh(&nf_conntrack_lock);
1704 static int untrack_refs(void)
1706 int cnt = 0, cpu;
1708 for_each_possible_cpu(cpu) {
1709 struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1711 cnt += atomic_read(&ct->ct_general.use) - 1;
1713 return cnt;
1716 static void nf_conntrack_cleanup_init_net(void)
1718 while (untrack_refs() > 0)
1719 schedule();
1721 nf_conntrack_helper_fini();
1722 nf_conntrack_proto_fini();
1723 #ifdef CONFIG_NF_CONNTRACK_ZONES
1724 nf_ct_extend_unregister(&nf_ct_zone_extend);
1725 #endif
1728 static void nf_conntrack_cleanup_net(struct net *net)
1730 i_see_dead_people:
1731 nf_ct_iterate_cleanup(net, kill_all, NULL);
1732 nf_ct_release_dying_list(net);
1733 if (atomic_read(&net->ct.count) != 0) {
1734 schedule();
1735 goto i_see_dead_people;
1738 nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
1739 net->ct.htable_size);
1740 nf_conntrack_ecache_fini(net);
1741 nf_conntrack_acct_fini(net);
1742 nf_conntrack_expect_fini(net);
1743 kmem_cache_destroy(net->ct.nf_conntrack_cachep);
1744 kfree(net->ct.slabname);
1745 free_percpu(net->ct.stat);
1748 /* Mishearing the voices in his head, our hero wonders how he's
1749 supposed to kill the mall. */
1750 void nf_conntrack_cleanup(struct net *net)
1752 if (net_eq(net, &init_net))
1753 rcu_assign_pointer(ip_ct_attach, NULL);
1755 /* This makes sure all current packets have passed through
1756 netfilter framework. Roll on, two-stage module
1757 delete... */
1758 synchronize_net();
1760 nf_conntrack_cleanup_net(net);
1762 if (net_eq(net, &init_net)) {
1763 rcu_assign_pointer(nf_ct_destroy, NULL);
1764 nf_conntrack_cleanup_init_net();
1768 void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
1770 struct hlist_nulls_head *hash;
1771 unsigned int nr_slots, i;
1772 size_t sz;
1774 *vmalloced = 0;
1776 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
1777 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
1778 sz = nr_slots * sizeof(struct hlist_nulls_head);
1779 hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
1780 get_order(sz));
1781 if (!hash) {
1782 *vmalloced = 1;
1783 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1784 hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1785 PAGE_KERNEL);
1788 if (hash && nulls)
1789 for (i = 0; i < nr_slots; i++)
1790 INIT_HLIST_NULLS_HEAD(&hash[i], i);
1792 return hash;
1794 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
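/*
 * Worked example (illustrative, assuming 4 KiB pages and an 8-byte
 * struct hlist_nulls_head): nf_ct_alloc_hashtable() rounds the requested
 * bucket count up to a whole number of pages, so *sizep = 300 becomes
 * roundup(300, 4096 / 8) = 512 slots (one page) and *sizep = 5000 becomes
 * 5120 slots (ten pages). With nulls enabled, each empty bucket is then
 * seeded with its own index as the nulls marker.
 */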
1796 int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1798 int i, bucket, vmalloced, old_vmalloced;
1799 unsigned int hashsize, old_size;
1800 struct hlist_nulls_head *hash, *old_hash;
1801 struct nf_conntrack_tuple_hash *h;
1802 struct nf_conn *ct;
1804 if (current->nsproxy->net_ns != &init_net)
1805 return -EOPNOTSUPP;
1807 /* On boot, we can set this without any fancy locking. */
1808 if (!nf_conntrack_htable_size)
1809 return param_set_uint(val, kp);
1811 hashsize = simple_strtoul(val, NULL, 0);
1812 if (!hashsize)
1813 return -EINVAL;
1815 hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
1816 if (!hash)
1817 return -ENOMEM;
1819 /* Lookups in the old hash might happen in parallel, which means we
1820 * might get false negatives during connection lookup. New connections
1821 * created because of a false negative won't make it into the hash
1822 * though since that required taking the lock.
1824 spin_lock_bh(&nf_conntrack_lock);
1825 for (i = 0; i < init_net.ct.htable_size; i++) {
1826 while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
1827 h = hlist_nulls_entry(init_net.ct.hash[i].first,
1828 struct nf_conntrack_tuple_hash, hnnode);
1829 ct = nf_ct_tuplehash_to_ctrack(h);
1830 hlist_nulls_del_rcu(&h->hnnode);
1831 bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
1832 hashsize,
1833 nf_conntrack_hash_rnd);
1834 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
1837 old_size = init_net.ct.htable_size;
1838 old_vmalloced = init_net.ct.hash_vmalloc;
1839 old_hash = init_net.ct.hash;
1841 init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
1842 init_net.ct.hash_vmalloc = vmalloced;
1843 init_net.ct.hash = hash;
1844 spin_unlock_bh(&nf_conntrack_lock);
1846 nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
1847 return 0;
1849 EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
1851 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
1852 &nf_conntrack_htable_size, 0600);
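/*
 * Usage note (illustrative): registering the parameter with a custom set
 * handler and mode 0600 makes the bucket count adjustable at runtime, e.g.
 *
 *	echo 65536 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * or settable at load/boot time (hashsize=65536 as a module option, or
 * nf_conntrack.hashsize=65536 when built in). nf_conntrack_set_hashsize()
 * then rehashes every existing entry into the new table while holding
 * nf_conntrack_lock.
 */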
1854 void nf_ct_untracked_status_or(unsigned long bits)
1856 int cpu;
1858 for_each_possible_cpu(cpu)
1859 per_cpu(nf_conntrack_untracked, cpu).status |= bits;
1861 EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
1863 static int nf_conntrack_init_init_net(void)
1865 int max_factor = 8;
1866 int ret, cpu;
1868 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1869 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
1870 if (!nf_conntrack_htable_size) {
1871 nf_conntrack_htable_size
1872 = (((totalram_pages << PAGE_SHIFT) / 16384)
1873 / sizeof(struct hlist_head));
1874 if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
1875 nf_conntrack_htable_size = 16384;
1876 if (nf_conntrack_htable_size < 32)
1877 nf_conntrack_htable_size = 32;
1879 /* Use a max. factor of four by default to get the same max as
1880 * with the old struct list_heads. When a table size is given
1881 * we use the old value of 8 to avoid reducing the max.
1882 * entries. */
1883 max_factor = 4;
1885 nf_conntrack_max = max_factor * nf_conntrack_htable_size;
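/*
 * Worked example of the sizing above (illustrative, assuming 4 KiB pages
 * and a 4-byte struct hlist_head as on i386): a 256 MB machine gets
 * (268435456 / 16384) / 4 = 4096 buckets and, with max_factor = 4,
 * nf_conntrack_max = 16384 entries. The 32 MB / 512-bucket and
 * >= 1 GB / 16384-bucket figures quoted in the comment follow from the
 * same formula.
 */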
1887 printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
1888 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1889 nf_conntrack_max);
1891 ret = nf_conntrack_proto_init();
1892 if (ret < 0)
1893 goto err_proto;
1895 ret = nf_conntrack_helper_init();
1896 if (ret < 0)
1897 goto err_helper;
1899 #ifdef CONFIG_NF_CONNTRACK_ZONES
1900 ret = nf_ct_extend_register(&nf_ct_zone_extend);
1901 if (ret < 0)
1902 goto err_extend;
1903 #endif
1904 /* Set up fake conntrack: to never be deleted, not in any hashes */
1905 for_each_possible_cpu(cpu) {
1906 struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1907 write_pnet(&ct->ct_net, &init_net);
1908 atomic_set(&ct->ct_general.use, 1);
1910 /* - and look it like as a confirmed connection */
1911 nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
1912 return 0;
1914 #ifdef CONFIG_NF_CONNTRACK_ZONES
1915 err_extend:
1916 nf_conntrack_helper_fini();
1917 #endif
1918 err_helper:
1919 nf_conntrack_proto_fini();
1920 err_proto:
1921 return ret;
1925 * We need to use special "null" values, not used in hash table
1927 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
1928 #define DYING_NULLS_VAL ((1<<30)+1)
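/*
 * Note (illustrative): hash chains are seeded with their bucket index as
 * the nulls marker (see nf_ct_alloc_hashtable()), and bucket counts stay
 * far below 1 << 30, so these two values can never be mistaken for a hash
 * bucket. That is what allows a lockless traversal such as the one in
 * __nf_conntrack_find() to inspect get_nulls_value() at the end of a chain
 * and tell whether it finished on the expected list or must restart.
 */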
1930 static int nf_conntrack_init_net(struct net *net)
1932 int ret;
1934 atomic_set(&net->ct.count, 0);
1935 INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
1936 INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
1937 net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
1938 if (!net->ct.stat) {
1939 ret = -ENOMEM;
1940 goto err_stat;
1943 net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
1944 if (!net->ct.slabname) {
1945 ret = -ENOMEM;
1946 goto err_slabname;
1949 net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
1950 sizeof(struct nf_conn), 0,
1951 SLAB_DESTROY_BY_RCU, NULL);
1952 if (!net->ct.nf_conntrack_cachep) {
1953 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1954 ret = -ENOMEM;
1955 goto err_cache;
1958 net->ct.htable_size = nf_conntrack_htable_size;
1959 net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
1960 &net->ct.hash_vmalloc, 1);
1961 if (!net->ct.hash) {
1962 ret = -ENOMEM;
1963 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1964 goto err_hash;
1966 ret = nf_conntrack_expect_init(net);
1967 if (ret < 0)
1968 goto err_expect;
1969 ret = nf_conntrack_acct_init(net);
1970 if (ret < 0)
1971 goto err_acct;
1972 ret = nf_conntrack_ecache_init(net);
1973 if (ret < 0)
1974 goto err_ecache;
1976 return 0;
1978 err_ecache:
1979 nf_conntrack_acct_fini(net);
1980 err_acct:
1981 nf_conntrack_expect_fini(net);
1982 err_expect:
1983 nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
1984 net->ct.htable_size);
1985 err_hash:
1986 kmem_cache_destroy(net->ct.nf_conntrack_cachep);
1987 err_cache:
1988 kfree(net->ct.slabname);
1989 err_slabname:
1990 free_percpu(net->ct.stat);
1991 err_stat:
1992 return ret;
1995 s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
1996 enum ip_conntrack_dir dir,
1997 u32 seq);
1998 EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
2000 int nf_conntrack_init(struct net *net)
2002 int ret;
2004 if (net_eq(net, &init_net)) {
2005 ret = nf_conntrack_init_init_net();
2006 if (ret < 0)
2007 goto out_init_net;
2009 ret = nf_conntrack_init_net(net);
2010 if (ret < 0)
2011 goto out_net;
2013 if (net_eq(net, &init_net)) {
2014 /* For use by REJECT target */
2015 rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
2016 rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
2018 /* Howto get NAT offsets */
2019 rcu_assign_pointer(nf_ct_nat_offset, NULL);
2021 return 0;
2023 out_net:
2024 if (net_eq(net, &init_net))
2025 nf_conntrack_cleanup_init_net();
2026 out_init_net:
2027 return ret;