/*
 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
 */

/*
 * Implement IP packet firewall (new version)
 */

#include "opt_ipfw.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/ucred.h>
#include <sys/in_cksum.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/tree.h>

#include <net/if.h>
#include <net/route.h>
#include <net/pfil.h>
#include <net/dummynet/ip_dummynet.h>

#include <sys/thread2.h>
#include <net/netmsg2.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/ip_divert.h>
#include <netinet/if_ether.h>	/* XXX for ETHERTYPE_IP */

#include <net/ipfw/ip_fw2.h>

#ifdef IPFIREWALL_DEBUG
#define DPRINTF(fmt, ...)			\
do {						\
	if (fw_debug > 0)			\
		kprintf(fmt, __VA_ARGS__);	\
} while (0)
#else
#define DPRINTF(fmt, ...)	((void)0)
#endif

/*
 * Description of per-CPU rule duplication:
 *
 * Module loading/unloading and all ioctl operations are serialized
 * by netisr0, so we don't have any ordering or locking problems.
 *
 * The following diagram shows how an operation on the per-CPU rule
 * lists is performed [2 CPU case]:
 *
 *    CPU0                 CPU1
 *
 *    netisr0 <------------------------------------+
 *     domsg                                       |
 *      :                                          |
 *      :(delete/add...)                           |
 *      :                                          |
 *      :         netmsg                           | netmsg
 *    forwardmsg---------->netisr1                 |
 *                          :                      |
 *                          :(delete/add...)       |
 *                          :                      |
 *                          :                      |
 *                        replymsg-----------------+
 *
 *
 * Rule structure [2 CPU case]
 *
 *    CPU0                       CPU1
 *
 *    layer3_chain               layer3_chain
 *    |                          |
 *    V                          V
 *    +-------+ sibling          +-------+ sibling
 *    | rule1 |----------------->| rule1 |--------->NULL
 *    +-------+                  +-------+
 *    |                          |
 *    |next                      |next
 *    V                          V
 *    +-------+ sibling          +-------+ sibling
 *    | rule2 |----------------->| rule2 |--------->NULL
 *    +-------+                  +-------+
 *
 * ip_fw.sibling:
 * 1) Ease statistics calculation during IP_FW_GET. We only need to
 *    iterate layer3_chain in netisr0; the duplicates of the current
 *    rule on the other CPUs can then be safely accessed read-only
 *    through ip_fw.sibling.
 * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
 *    a) In netisr0 rule3 is determined to be inserted between rule1
 *       and rule2. To make this decision we need to iterate the
 *       layer3_chain in netisr0. The netmsg, which is used to insert
 *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
 *       in netisr0 as next_rule.
 *    b) After the insertion in netisr0 is done, we will move on to
 *       netisr1. But instead of relocating rule3's position in
 *       netisr1 by iterating the layer3_chain in netisr1, we set the
 *       netmsg's prev_rule to rule1->sibling and next_rule to
 *       rule2->sibling before the netmsg is forwarded to netisr1 from
 *       netisr0.
 */

/*
 * Description of states and tracks.
 *
 * Both states and tracks are stored in per-cpu RB trees instead of
 * per-cpu hash tables to avoid the worst case hash degeneration.
 *
 * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
 * measured in seconds and depending on the flags.
 *
 * When a packet is received, its address fields are first masked with
 * the mask defined for the rule, then matched against the entries in
 * the per-cpu state RB tree. States are generated by the 'keep-state'
 * and 'limit' options.
 *
 * The max number of states is ipfw_state_max. When we reach the
 * maximum number of states, no more are created. This is done to
 * avoid consuming too much memory, and also too much time searching
 * the tree on each packet.
 *
 * Each state holds a pointer to the parent ipfw rule of the current
 * CPU so we know what action to perform. States are removed when the
 * parent rule is deleted. XXX we should make them survive.
 *
 * There are some limitations with states -- we do not obey the
 * 'randomized match', and we do not do multiple passes through the
 * firewall. XXX check the latter!!!
 *
 * States grow independently on each CPU, e.g. 2 CPU case:
 *
 *        CPU0                     CPU1
 * ...................      ...................
 * :  state RB tree  :      :  state RB tree  :
 * :                 :      :                 :
 * : state1   state2 :      :     state3      :
 * :     |    |      :      :        |        :
 * :.....|....|......:      :........|........:
 *       |    |                      |
 *       |    |                      |st_rule
 *       |    |                      |
 *       V    V                      V
 *     +-------+                 +-------+
 *     | rule1 |                 | rule1 |
 *     +-------+                 +-------+
 *
 * Tracks are used to enforce limits on the number of sessions. Tracks
 * are generated by the 'limit' option.
 *
 * The max number of tracks is ipfw_track_max. When we reach the
 * maximum number of tracks, no more are created. This is done to
 * avoid consuming too much memory.
 *
 * Tracks are organized into two layers: the track counter RB tree is
 * shared between CPUs, while the track RB tree is per-cpu. States
 * generated by the 'limit' option are linked to the track in addition
 * to the per-cpu state RB tree, mainly to ease expiration. e.g. 2 CPU
 * case:
 *
 *             ..............................
 *             :    track counter RB tree   :
 *             :                            :
 *             :        +-----------+       :
 *             :        |  trkcnt1  |       :
 *             :        |           |       :
 *             :      +--->counter<----+    :
 *             :      | |           |  |    :
 *             :      | +-----------+  |    :
 *             :......|................|....:
 *                    |                |
 *        CPU0        |                |        CPU1
 * .................  |t_count         |  .................
 * : track RB tree :  |                |  : track RB tree :
 * :               :  |                |  :               :
 * : +-->track1-------+                +--------track2    :
 * : |     A       :                      :               :
 * : |     |       :                      :               :
 * :.|.....|.......:                      :...............:
 *   |     +----------------+
 *   |                      |
 *   |   ...................|....
 *   |   :  state RB tree   |   :  |st_track
 *   |   :                  |   :  |
 *   +---state1          state2----+
 *       :  |               |   :
 *       :..|...............|...:
 *          |               |
 *          |               |st_rule
 *          V               V
 *          +----------+
 *          |  rule1   |
 *          +----------+
 */

#define IPFW_AUTOINC_STEP_MIN	1
#define IPFW_AUTOINC_STEP_MAX	1000
#define IPFW_AUTOINC_STEP_DEF	100

#define IPFW_TABLE_MAX_DEF	64

#define IPFW_DEFAULT_RULE	65535	/* rulenum for the default rule */
#define IPFW_DEFAULT_SET	31	/* set number for the default rule */

#define MATCH_REVERSE		0
#define MATCH_FORWARD		1
#define MATCH_NONE		2
#define MATCH_UNKNOWN		3

#define TIME_LEQ(a, b)		((a) - (b) <= 0)
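
/*
 * Illustration (hypothetical values, not part of the original source):
 * TIME_LEQ() compares timestamps via signed subtraction so that it
 * keeps working across counter wrap.  With 32-bit values a = 0x7fffffff
 * and b = 0x80000000 (b is "one tick later", just past the wrap point),
 * a - b evaluates to -1, so TIME_LEQ(a, b) is still true even though
 * a > b as plain unsigned integers.
 */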

#define IPFW_STATE_TCPFLAGS	(TH_SYN | TH_FIN | TH_RST)
#define IPFW_STATE_TCPSTATES	(IPFW_STATE_TCPFLAGS |	\
				 (IPFW_STATE_TCPFLAGS << 8))

#define BOTH_SYN		(TH_SYN | (TH_SYN << 8))
#define BOTH_FIN		(TH_FIN | (TH_FIN << 8))
#define BOTH_RST		(TH_RST | (TH_RST << 8))
/* TH_ACK here means FIN was ACKed. */
#define BOTH_FINACK		(TH_ACK | (TH_ACK << 8))

#define IPFW_STATE_TCPCLOSED(s)	((s)->st_proto == IPPROTO_TCP &&	 \
				 (((s)->st_state & BOTH_RST) ||		 \
				  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
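
/*
 * Illustration (hypothetical packet sequence, added for clarity): the
 * low byte of st_state accumulates TCP flags seen in the forward
 * direction and the high byte flags seen in the reverse direction.
 * After a SYN has been seen both ways, st_state contains
 * TH_SYN | (TH_SYN << 8), i.e. BOTH_SYN.  IPFW_STATE_TCPCLOSED() then
 * reports a TCP state as closed once an RST has been seen in either
 * direction, or once the FIN has been ACKed in both directions
 * (BOTH_FINACK fully set).
 */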

#define O_ANCHOR		O_NOP

#define IPFW_ISXLAT(type)	((type) == O_REDIRECT)
#define IPFW_XLAT_INVALID(s)	(IPFW_ISXLAT((s)->st_type) &&	\
				 ((struct ipfw_xlat *)(s))->xlat_invalid)

#define IPFW_MBUF_XLATINS	FW_MBUF_PRIVATE1
#define IPFW_MBUF_XLATFWD	FW_MBUF_PRIVATE2

#define IPFW_XLATE_INSERT	0x0001
#define IPFW_XLATE_FORWARD	0x0002
#define IPFW_XLATE_OUTPUT	0x0004

struct netmsg_ipfw {
	struct netmsg_base	base;
	const struct ipfw_ioc_rule *ioc_rule;
	struct ip_fw		*next_rule;
	struct ip_fw		*prev_rule;
	struct ip_fw		*sibling;
	uint32_t		rule_flags;
	struct ip_fw		**cross_rules;
};

struct netmsg_del {
	struct netmsg_base	base;
	struct ip_fw		*start_rule;
	struct ip_fw		*prev_rule;
	uint16_t		rulenum;
	uint8_t			from_set;
	uint8_t			to_set;
};

struct netmsg_zent {
	struct netmsg_base	base;
	struct ip_fw		*start_rule;
	uint16_t		rulenum;
	uint16_t		log_only;
};

struct netmsg_cpstate {
	struct netmsg_base	base;
	struct ipfw_ioc_state	*ioc_state;
	int			state_cntmax;
	int			state_cnt;
};

struct netmsg_tblent {
	struct netmsg_base	base;
	struct sockaddr		*key;
	struct sockaddr		*netmask;
	struct ipfw_tblent	*sibling;
	int			tableid;
};

struct netmsg_tblflush {
	struct netmsg_base	base;
	int			tableid;
	int			destroy;
};

struct netmsg_tblexp {
	struct netmsg_base	base;
	time_t			expire;
	int			tableid;
	int			cnt;
	int			expcnt;
	struct radix_node_head	*rnh;
};

struct ipfw_table_cp {
	struct ipfw_ioc_tblent	*te;
	int			te_idx;
	int			te_cnt;
};

struct ip_fw_local {
	/*
	 * offset	The offset of a fragment. offset != 0 means that
	 *	we have a fragment at this offset of an IPv4 packet.
	 *	offset == 0 means that (if this is an IPv4 packet)
	 *	this is the first or only fragment.
	 */
	u_short		offset;

	/*
	 * Local copies of addresses. They are only valid if we have
	 * an IP packet.
	 *
	 * proto	The protocol. Set to 0 for non-ip packets,
	 *	or to the protocol read from the packet otherwise.
	 *	proto != 0 means that we have an IPv4 packet.
	 *
	 * src_port, dst_port	port numbers, in HOST format. Only
	 *	valid for TCP and UDP packets.
	 *
	 * src_ip, dst_ip	ip addresses, in NETWORK format.
	 *	Only valid for IPv4 packets.
	 */
	uint8_t		proto;
	uint16_t	src_port;	/* NOTE: host format */
	uint16_t	dst_port;	/* NOTE: host format */
	struct in_addr	src_ip;		/* NOTE: network format */
	struct in_addr	dst_ip;		/* NOTE: network format */
	uint16_t	ip_len;		/* NOTE: host format */
	struct tcphdr	*tcp;
};

struct ipfw_addrs {
	uint32_t	addr1;	/* host byte order */
	uint32_t	addr2;	/* host byte order */
};

struct ipfw_ports {
	uint16_t	port1;	/* host byte order */
	uint16_t	port2;	/* host byte order */
};

struct ipfw_key {
	union {
		struct ipfw_addrs addrs;
		uint64_t	value;
	} addr_u;
	union {
		struct ipfw_ports ports;
		uint32_t	value;
	} port_u;
	uint8_t		proto;
	uint8_t		swap;	/* IPFW_KEY_SWAP_ */
	uint16_t	rsvd2;
};

#define IPFW_KEY_SWAP_ADDRS	0x1
#define IPFW_KEY_SWAP_PORTS	0x2
#define IPFW_KEY_SWAP_ALL	(IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)

struct ipfw_trkcnt {
	RB_ENTRY(ipfw_trkcnt)	tc_rblink;
	struct ipfw_key		tc_key;
	uintptr_t		tc_ruleid;
	int			tc_refs;
	int			tc_count;
	time_t			tc_expire;	/* userland get-only */
	uint16_t		tc_rulenum;	/* userland get-only */
} __cachealign;

#define tc_addrs	tc_key.addr_u.value
#define tc_ports	tc_key.port_u.value
#define tc_proto	tc_key.proto
#define tc_saddr	tc_key.addr_u.addrs.addr1
#define tc_daddr	tc_key.addr_u.addrs.addr2
#define tc_sport	tc_key.port_u.ports.port1
#define tc_dport	tc_key.port_u.ports.port2

RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);

struct ipfw_state;

struct ipfw_track {
	RB_ENTRY(ipfw_track)	t_rblink;
	struct ipfw_key		t_key;
	struct ip_fw		*t_rule;
	time_t			t_lastexp;
	LIST_HEAD(, ipfw_state)	t_state_list;
	time_t			t_expire;
	volatile int		*t_count;
	struct ipfw_trkcnt	*t_trkcnt;
	TAILQ_ENTRY(ipfw_track)	t_link;
};

#define t_addrs		t_key.addr_u.value
#define t_ports		t_key.port_u.value
#define t_proto		t_key.proto
#define t_saddr		t_key.addr_u.addrs.addr1
#define t_daddr		t_key.addr_u.addrs.addr2
#define t_sport		t_key.port_u.ports.port1
#define t_dport		t_key.port_u.ports.port2

RB_HEAD(ipfw_track_tree, ipfw_track);
TAILQ_HEAD(ipfw_track_list, ipfw_track);

struct ipfw_state {
	RB_ENTRY(ipfw_state)	st_rblink;
	struct ipfw_key		st_key;

	time_t			st_expire;	/* expire time */
	struct ip_fw		*st_rule;

	uint64_t		st_pcnt;	/* packets */
	uint64_t		st_bcnt;	/* bytes */

	/*
	 * st_state:
	 * State of this rule, typically a combination of TCP flags.
	 *
	 * st_ack_fwd/st_ack_rev:
	 * Most recent ACKs in forward and reverse direction. They
	 * are used to generate keepalives.
	 */
	uint32_t		st_state;
	uint32_t		st_ack_fwd;	/* host byte order */
	uint32_t		st_seq_fwd;	/* host byte order */
	uint32_t		st_ack_rev;	/* host byte order */
	uint32_t		st_seq_rev;	/* host byte order */

	uint16_t		st_flags;	/* IPFW_STATE_F_ */
	uint16_t		st_type;	/* KEEP_STATE/LIMIT/RDR */
	struct ipfw_track	*st_track;

	LIST_ENTRY(ipfw_state)	st_trklink;
	TAILQ_ENTRY(ipfw_state)	st_link;
};

#define st_addrs	st_key.addr_u.value
#define st_ports	st_key.port_u.value
#define st_proto	st_key.proto
#define st_swap		st_key.swap

#define IPFW_STATE_F_ACKFWD	0x0001
#define IPFW_STATE_F_SEQFWD	0x0002
#define IPFW_STATE_F_ACKREV	0x0004
#define IPFW_STATE_F_SEQREV	0x0008
#define IPFW_STATE_F_XLATSRC	0x0010
#define IPFW_STATE_F_XLATSLAVE	0x0020
#define IPFW_STATE_F_LINKED	0x0040

#define IPFW_STATE_SCANSKIP(s)	((s)->st_type == O_ANCHOR ||	\
				 ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))

/* Expired or being deleted. */
#define IPFW_STATE_ISDEAD(s)	(TIME_LEQ((s)->st_expire, time_uptime) ||  \
				 IPFW_XLAT_INVALID((s)))

TAILQ_HEAD(ipfw_state_list, ipfw_state);
RB_HEAD(ipfw_state_tree, ipfw_state);

struct ipfw_xlat {
	struct ipfw_state	xlat_st;	/* MUST be the first field */
	uint32_t		xlat_addr;	/* network byte order */
	uint16_t		xlat_port;	/* network byte order */
	uint16_t		xlat_dir;	/* MATCH_ */
	struct ifnet		*xlat_ifp;	/* matching ifnet */
	struct ipfw_xlat	*xlat_pair;	/* paired state */
	int			xlat_pcpu;	/* paired cpu */
	volatile int		xlat_invalid;	/* invalid, but not dtor yet */
	volatile uint64_t	xlat_crefs;	/* cross references */
	struct netmsg_base	xlat_freenm;	/* for remote free */
};

#define xlat_type	xlat_st.st_type
#define xlat_flags	xlat_st.st_flags
#define xlat_rule	xlat_st.st_rule
#define xlat_bcnt	xlat_st.st_bcnt
#define xlat_pcnt	xlat_st.st_pcnt

struct ipfw_tblent {
	struct radix_node	te_nodes[2];
	struct sockaddr_in	te_key;
	u_long			te_use;
	time_t			te_lastuse;
	struct ipfw_tblent	*te_sibling;
	volatile int		te_expired;
};

struct ipfw_context {
	struct ip_fw		*ipfw_layer3_chain;	/* rules for layer3 */
	struct ip_fw		*ipfw_default_rule;	/* default rule */
	uint64_t		ipfw_norule_counter;	/* ipfw_log(NULL) stat*/

	/*
	 * ipfw_set_disable contains one bit per set value (0..31).
	 * If the bit is set, all rules with the corresponding set
	 * are disabled. Set IPFW_DEFAULT_SET is reserved for the
	 * default rule and CANNOT be disabled.
	 */
	uint32_t		ipfw_set_disable;

	uint8_t			ipfw_flags;	/* IPFW_FLAG_ */

	struct ip_fw		*ipfw_cont_rule;
	struct ipfw_xlat	*ipfw_cont_xlat;

	struct ipfw_state_tree	ipfw_state_tree;
	struct ipfw_state_list	ipfw_state_list;
	int			ipfw_state_loosecnt;
	int			ipfw_state_cnt;

	union {
		struct ipfw_state	state;
		struct ipfw_track	track;
		struct ipfw_trkcnt	trkcnt;
	} ipfw_tmpkey;

	struct ipfw_track_tree	ipfw_track_tree;
	struct ipfw_track_list	ipfw_track_list;
	struct ipfw_trkcnt	*ipfw_trkcnt_spare;

	struct callout		ipfw_stateto_ch;
	time_t			ipfw_state_lastexp;
	struct netmsg_base	ipfw_stateexp_nm;
	struct netmsg_base	ipfw_stateexp_more;
	struct ipfw_state	ipfw_stateexp_anch;

	struct callout		ipfw_trackto_ch;
	time_t			ipfw_track_lastexp;
	struct netmsg_base	ipfw_trackexp_nm;
	struct netmsg_base	ipfw_trackexp_more;
	struct ipfw_track	ipfw_trackexp_anch;

	struct callout		ipfw_keepalive_ch;
	struct netmsg_base	ipfw_keepalive_nm;
	struct netmsg_base	ipfw_keepalive_more;
	struct ipfw_state	ipfw_keepalive_anch;

	struct callout		ipfw_xlatreap_ch;
	struct netmsg_base	ipfw_xlatreap_nm;
	struct ipfw_state_list	ipfw_xlatreap;

	/*
	 * Statistics
	 */
	u_long			ipfw_sts_reap;
	u_long			ipfw_sts_reapfailed;
	u_long			ipfw_sts_overflow;
	u_long			ipfw_sts_nomem;
	u_long			ipfw_sts_tcprecycled;

	u_long			ipfw_tks_nomem;
	u_long			ipfw_tks_reap;
	u_long			ipfw_tks_reapfailed;
	u_long			ipfw_tks_overflow;
	u_long			ipfw_tks_cntnomem;

	u_long			ipfw_frags;
	u_long			ipfw_defraged;
	u_long			ipfw_defrag_remote;

	u_long			ipfw_xlated;
	u_long			ipfw_xlate_split;
	u_long			ipfw_xlate_conflicts;
	u_long			ipfw_xlate_cresolved;

	/* Last field */
	struct radix_node_head	*ipfw_tables[];
};

#define IPFW_FLAG_KEEPALIVE	0x01
#define IPFW_FLAG_STATEEXP	0x02
#define IPFW_FLAG_TRACKEXP	0x04
#define IPFW_FLAG_STATEREAP	0x08
#define IPFW_FLAG_TRACKREAP	0x10

#define ipfw_state_tmpkey	ipfw_tmpkey.state
#define ipfw_track_tmpkey	ipfw_tmpkey.track
#define ipfw_trkcnt_tmpkey	ipfw_tmpkey.trkcnt

struct ipfw_global {
	int			ipfw_state_loosecnt;	/* cache aligned */
	time_t			ipfw_state_globexp __cachealign;

	struct lwkt_token	ipfw_trkcnt_token __cachealign;
	struct ipfw_trkcnt_tree	ipfw_trkcnt_tree;
	int			ipfw_trkcnt_cnt;
	time_t			ipfw_track_globexp;

	/* Accessed in netisr0. */
	struct ip_fw		*ipfw_crossref_free __cachealign;
	struct callout		ipfw_crossref_ch;
	struct netmsg_base	ipfw_crossref_nm;

#ifdef KLD_MODULE
	/*
	 * The module can not be unloaded if there are references to
	 * certain rules of ipfw(4), e.g. from dummynet(4).
	 */
	int			ipfw_refcnt __cachealign;
#endif
} __cachealign;

static struct ipfw_context	*ipfw_ctx[MAXCPU];

MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");

/*
 * The following two global variables are accessed and updated only
 * in netisr0.
 */
static uint32_t static_count;	/* # of static rules */
static uint32_t static_ioc_len;	/* bytes of static rules */

/*
 * If 1, then ipfw static rules are being flushed,
 * ipfw_chk() will skip to the default rule.
 */
static int ipfw_flushing;

static int fw_verbose;
static int verbose_limit;

static int fw_debug;
static int autoinc_step = IPFW_AUTOINC_STEP_DEF;

static int ipfw_table_max = IPFW_TABLE_MAX_DEF;

static int ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
static int ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);

TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);

SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
    "Firewall statistics");

SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
    &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
    &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
    "Rule number autoincrement step");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
    &fw_one_pass, 0,
    "Only do a single pass through ipfw when using dummynet(4)");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
    &fw_debug, 0, "Enable printing of debug ip_fw statements");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
    &fw_verbose, 0, "Log matches to ipfw rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
    &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
    &ipfw_table_max, 0, "Max # of tables");

static int ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
static int ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
static int ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
static int ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
static int ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
static int ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);

/*
 * Timeouts for various events in handling states.
 *
 * NOTE:
 * 1 == 0~1 second.
 * 2 == 1~2 second(s).
 *
 * We use 2 seconds for the FIN lifetime, so that the states will not
 * be reaped prematurely.
 */
static uint32_t dyn_ack_lifetime = 300;
static uint32_t dyn_syn_lifetime = 20;
static uint32_t dyn_finwait_lifetime = 20;
static uint32_t dyn_fin_lifetime = 2;
static uint32_t dyn_rst_lifetime = 2;
static uint32_t dyn_udp_lifetime = 10;
static uint32_t dyn_short_lifetime = 5;	/* used by tracks too */

/*
 * Keepalives are sent if dyn_keepalive is set. They are sent every
 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
 * seconds of lifetime of a rule.
 */
static uint32_t dyn_keepalive_interval = 20;
static uint32_t dyn_keepalive_period = 5;
static uint32_t dyn_keepalive = 1;	/* do send keepalives */

static struct ipfw_global ipfw_gd;
static int ipfw_state_loosecnt_updthr;
static int ipfw_state_max = 4096;	/* max # of states */
static int ipfw_track_max = 4096;	/* max # of tracks */

static int ipfw_state_headroom;		/* setup at module load time */
static int ipfw_state_reap_min = 8;
static int ipfw_state_expire_max = 32;
static int ipfw_state_scan_max = 256;
static int ipfw_keepalive_max = 8;
static int ipfw_track_reap_max = 4;
static int ipfw_track_expire_max = 16;
static int ipfw_track_scan_max = 128;

static eventhandler_tag ipfw_ifaddr_event;

/* Compat */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
    "Number of states and tracks");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
    "Max number of states and tracks");

SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
    "Number of states");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
    "Max number of states");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
    &ipfw_state_headroom, 0, "headroom for state reap");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
    &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
    &ipfw_track_max, 0, "Max number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
    &static_count, 0, "Number of static rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
    &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
    "I", "# of keepalives to send for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
    "I", "# of states to reap for state shortage");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to reap for track shortage");

SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
    "LU", "# of state reaps due to states shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
    "LU", "# of state reap failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
    "LU", "# of state overflows");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
    "LU", "# of state allocation failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
    "LU", "# of states deleted due to fast TCP port recycling");

SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
    "LU", "# of track allocation failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
    "LU", "# of track reaps due to tracks shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
    "LU", "# of track reap failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
    "LU", "# of track overflows");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
    "LU", "# of track counter allocation failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
    "LU", "# of IP fragments defragmented");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag dispatched to remote cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
    "LU", "# of address/port translations");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
    "LU", "# of address/port translations split between different cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
    "LU", "# of address/port translation conflicts on remote cpu");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
    "LU", "# of address/port translation conflicts resolved on remote cpu");

static int	ipfw_state_cmp(struct ipfw_state *,
		    struct ipfw_state *);
static int	ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
		    struct ipfw_trkcnt *);
static int	ipfw_track_cmp(struct ipfw_track *,
		    struct ipfw_track *);

RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);

RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);

RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);

static int	ipfw_chk(struct ip_fw_args *);
static void	ipfw_track_expire_ipifunc(void *);
static void	ipfw_state_expire_ipifunc(void *);
static void	ipfw_keepalive(void *);
static int	ipfw_state_expire_start(struct ipfw_context *,
		    int, int);
static void	ipfw_crossref_timeo(void *);
static void	ipfw_state_remove(struct ipfw_context *,
		    struct ipfw_state *);
static void	ipfw_xlat_reap_timeo(void *);
static void	ipfw_defrag_redispatch(struct mbuf *, int,
		    struct ip_fw *);

#define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKINIT	\
	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");

static void
sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
    const struct sockaddr *netmask)
{
	const u_char *cp1 = (const u_char *)src;
	u_char *cp2 = (u_char *)dst;
	const u_char *cp3 = (const u_char *)netmask;
	u_char *cplim = cp2 + *cp3;
	u_char *cplim2 = cp2 + *cp1;

	*cp2++ = *cp1++; *cp2++ = *cp1++;  /* copies sa_len & sa_family */
	cp3 += 2;
	if (cplim > cplim2)
		cplim = cplim2;
	while (cp2 < cplim)
		*cp2++ = *cp1++ & *cp3++;
	if (cp2 < cplim2)
		bzero(cp2, cplim2 - cp2);
}

static __inline uint16_t
pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
{
	uint32_t l;

	if (udp && !cksum)
		return (0x0000);
	l = cksum + old - new;
	l = (l >> 16) + (l & 65535);
	l = l & 65535;
	if (udp && !l)
		return (0xFFFF);
	return (l);
}
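
/*
 * Illustration (hypothetical values, added for clarity): replacing the
 * 16-bit word 0x5678 with 0x1234 in a packet whose current checksum
 * field holds 0x1000 gives
 *
 *	l = 0x1000 + 0x5678 - 0x1234 = 0x5444
 *
 * which needs no carry folding, so 0x5444 is the fixed-up checksum.
 * The udp special cases mirror RFC 768: a UDP checksum of 0 means
 * "no checksum computed" and is passed through unchanged, and a
 * computed result of 0 must be transmitted as 0xFFFF.
 */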

static __inline void
ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
    in_addr_t daddr, uint16_t dport, uint8_t proto)
{
	key->proto = proto;
	key->swap = 0;

	if (saddr < daddr) {
		key->addr_u.addrs.addr1 = daddr;
		key->addr_u.addrs.addr2 = saddr;
		key->swap |= IPFW_KEY_SWAP_ADDRS;
	} else {
		key->addr_u.addrs.addr1 = saddr;
		key->addr_u.addrs.addr2 = daddr;
	}

	if (sport < dport) {
		key->port_u.ports.port1 = dport;
		key->port_u.ports.port2 = sport;
		key->swap |= IPFW_KEY_SWAP_PORTS;
	} else {
		key->port_u.ports.port1 = sport;
		key->port_u.ports.port2 = dport;
	}

	if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
		key->swap |= IPFW_KEY_SWAP_PORTS;
	if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
		key->swap |= IPFW_KEY_SWAP_ADDRS;
}
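
/*
 * Illustration (hypothetical flow, added for clarity): for a TCP flow
 * 10.0.0.1:1024 -> 10.0.0.2:80 and its reply 10.0.0.2:80 ->
 * 10.0.0.1:1024, both directions canonicalize to the same addr/port
 * fields; only the swap bits differ, and they differ by exactly
 * IPFW_KEY_SWAP_ALL.  The final two tie-breaker tests preserve this
 * property when the two ports (or the two addresses) are equal, so a
 * single state entry can be found from either direction of the flow.
 */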

static __inline void
ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
    in_addr_t *daddr, uint16_t *dport)
{
	if (key->swap & IPFW_KEY_SWAP_ADDRS) {
		*saddr = key->addr_u.addrs.addr2;
		*daddr = key->addr_u.addrs.addr1;
	} else {
		*saddr = key->addr_u.addrs.addr1;
		*daddr = key->addr_u.addrs.addr2;
	}

	if (key->swap & IPFW_KEY_SWAP_PORTS) {
		*sport = key->port_u.ports.port2;
		*dport = key->port_u.ports.port1;
	} else {
		*sport = key->port_u.ports.port1;
		*dport = key->port_u.ports.port2;
	}
}

static int
ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
{
	if (s1->st_proto > s2->st_proto)
		return (1);
	if (s1->st_proto < s2->st_proto)
		return (-1);

	if (s1->st_addrs > s2->st_addrs)
		return (1);
	if (s1->st_addrs < s2->st_addrs)
		return (-1);

	if (s1->st_ports > s2->st_ports)
		return (1);
	if (s1->st_ports < s2->st_ports)
		return (-1);

	if (s1->st_swap == s2->st_swap ||
	    (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
		return (0);

	if (s1->st_swap > s2->st_swap)
		return (1);
	else
		return (-1);
}
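
/*
 * Note on the swap test above (added for clarity): two keys whose swap
 * fields are equal, or differ by exactly IPFW_KEY_SWAP_ALL, describe
 * the same bidirectional flow (see ipfw_key_build()), so they must
 * compare equal here; any other swap difference keeps the keys
 * distinct.
 */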

static int
ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
{
	if (t1->tc_proto > t2->tc_proto)
		return (1);
	if (t1->tc_proto < t2->tc_proto)
		return (-1);

	if (t1->tc_addrs > t2->tc_addrs)
		return (1);
	if (t1->tc_addrs < t2->tc_addrs)
		return (-1);

	if (t1->tc_ports > t2->tc_ports)
		return (1);
	if (t1->tc_ports < t2->tc_ports)
		return (-1);

	if (t1->tc_ruleid > t2->tc_ruleid)
		return (1);
	if (t1->tc_ruleid < t2->tc_ruleid)
		return (-1);

	return (0);
}

static int
ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
{
	if (t1->t_proto > t2->t_proto)
		return (1);
	if (t1->t_proto < t2->t_proto)
		return (-1);

	if (t1->t_addrs > t2->t_addrs)
		return (1);
	if (t1->t_addrs < t2->t_addrs)
		return (-1);

	if (t1->t_ports > t2->t_ports)
		return (1);
	if (t1->t_ports < t2->t_ports)
		return (-1);

	if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
		return (1);
	if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
		return (-1);

	return (0);
}

static __inline struct ipfw_state *
ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
{
	struct ipfw_state *dup;

	KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
	    ("state %p was linked", s));
	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	if (dup == NULL) {
		TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
		s->st_flags |= IPFW_STATE_F_LINKED;
	}
	return (dup);
}

static __inline void
ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
{
	KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
	    ("state %p was not linked", s));
	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
	s->st_flags &= ~IPFW_STATE_F_LINKED;
}

static void
ipfw_state_max_set(int state_max)
{
	ipfw_state_max = state_max;
	/* Allow 5% states over-allocation. */
	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
}

static __inline int
ipfw_state_cntcoll(void)
{
	int cpu, state_cnt = 0;

	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
	return (state_cnt);
}

static __inline int
ipfw_state_cntsync(void)
{
	int state_cnt;

	state_cnt = ipfw_state_cntcoll();
	ipfw_gd.ipfw_state_loosecnt = state_cnt;
	return (state_cnt);
}

static __inline int
ipfw_free_rule(struct ip_fw *rule)
{
	KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
	KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
	rule->refcnt--;
	if (rule->refcnt == 0) {
		if (rule->cross_rules != NULL)
			kfree(rule->cross_rules, M_IPFW);
		kfree(rule, M_IPFW);
		return 1;
	}
	return 0;
}

static void
ipfw_unref_rule(void *priv)
{
	ipfw_free_rule(priv);
#ifdef KLD_MODULE
	KASSERT(ipfw_gd.ipfw_refcnt > 0,
	    ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
}

static __inline void
ipfw_ref_rule(struct ip_fw *rule)
{
	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
#ifdef KLD_MODULE
	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
	rule->refcnt++;
}

/*
 * This macro maps an ip pointer into a layer3 header pointer of type T
 */
#define L3HDR(T, ip)	((T *)((uint32_t *)(ip) + (ip)->ip_hl))

static __inline int
icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
{
	int type = L3HDR(struct icmp, ip)->icmp_type;
	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
	int idx = type / 32;

	if (idx >= idx_max)
		return (0);
	return (cmd->d[idx] & (1 << (type % 32)));
}
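
/*
 * Illustration (hypothetical rule, added for clarity): cmd->d[] is a
 * bitmap indexed by ICMP type.  A rule matching echo reply (type 0)
 * and echo request (type 8) would carry d[0] = (1 << 0) | (1 << 8);
 * types 32 and up would live in d[1] and so on.  The idx_max test
 * rejects types beyond the bitmap actually stored in the instruction.
 */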

static __inline int
icmpcode_match(struct ip *ip, ipfw_insn_u32 *cmd)
{
	int code = L3HDR(struct icmp, ip)->icmp_code;
	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
	int idx = code / 32;

	if (idx >= idx_max)
		return (0);
	return (cmd->d[idx] & (1 << (code % 32)));
}

#define TT	((1 << ICMP_ECHO) |		\
		 (1 << ICMP_ROUTERSOLICIT) |	\
		 (1 << ICMP_TSTAMP) |		\
		 (1 << ICMP_IREQ) |		\
		 (1 << ICMP_MASKREQ))

static int
is_icmp_query(struct ip *ip)
{
	int type = L3HDR(struct icmp, ip)->icmp_type;

	return (type < 32 && (TT & (1 << type)));
}

#undef TT

/*
 * The following checks use two arrays of 8 or 16 bits to store the
 * bits that we want set or clear, respectively. They are in the
 * low and high half of cmd->arg1 or cmd->d[0].
 *
 * We scan options and store the bits we find set. We succeed if
 *
 *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
 *
 * The code is sometimes optimized not to store additional variables.
 */
static int
flags_match(ipfw_insn *cmd, uint8_t bits)
{
	u_char want_clear;

	bits = ~bits;

	if (((cmd->arg1 & 0xff) & bits) != 0)
		return 0;	/* some bits we want set were clear */

	want_clear = (cmd->arg1 >> 8) & 0xff;
	if ((want_clear & bits) != want_clear)
		return 0;	/* some bits we want clear were set */
	return 1;
}
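
/*
 * Illustration (hypothetical instruction, added for clarity): with
 * arg1 = TH_SYN | (TH_ACK << 8) ("SYN must be set, ACK must be
 * clear"), a pure SYN packet (bits = TH_SYN) matches, while a SYN|ACK
 * packet fails the want_clear test because TH_ACK is set in the
 * packet.
 */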

static int
ipopts_match(struct ip *ip, ipfw_insn *cmd)
{
	int optlen, bits = 0;
	u_char *cp = (u_char *)(ip + 1);
	int x = (ip->ip_hl << 2) - sizeof(struct ip);

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[IPOPT_OPTVAL];

		if (opt == IPOPT_EOL)
			break;

		if (opt == IPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = cp[IPOPT_OLEN];
			if (optlen <= 0 || optlen > x)
				return 0;	/* invalid or truncated */
		}

		switch (opt) {
		case IPOPT_LSRR:
			bits |= IP_FW_IPOPT_LSRR;
			break;

		case IPOPT_SSRR:
			bits |= IP_FW_IPOPT_SSRR;
			break;

		case IPOPT_RR:
			bits |= IP_FW_IPOPT_RR;
			break;

		case IPOPT_TS:
			bits |= IP_FW_IPOPT_TS;
			break;

		default:
			break;
		}
	}
	return (flags_match(cmd, bits));
}

static int
tcpopts_match(struct ip *ip, ipfw_insn *cmd)
{
	int optlen, bits = 0;
	struct tcphdr *tcp = L3HDR(struct tcphdr, ip);
	u_char *cp = (u_char *)(tcp + 1);
	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[0];

		if (opt == TCPOPT_EOL)
			break;

		if (opt == TCPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = cp[1];
			if (optlen <= 0)
				break;
		}

		switch (opt) {
		case TCPOPT_MAXSEG:
			bits |= IP_FW_TCPOPT_MSS;
			break;

		case TCPOPT_WINDOW:
			bits |= IP_FW_TCPOPT_WINDOW;
			break;

		case TCPOPT_SACK_PERMITTED:
		case TCPOPT_SACK:
			bits |= IP_FW_TCPOPT_SACK;
			break;

		case TCPOPT_TIMESTAMP:
			bits |= IP_FW_TCPOPT_TS;
			break;

		case TCPOPT_CC:
		case TCPOPT_CCNEW:
		case TCPOPT_CCECHO:
			bits |= IP_FW_TCPOPT_CC;
			break;

		default:
			break;
		}
	}
	return (flags_match(cmd, bits));
}

static int
iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
{
	if (ifp == NULL)	/* no iface with this packet, match fails */
		return 0;

	/* Check by name or by IP address */
	if (cmd->name[0] != '\0') {	/* match by name */
		/* Check name */
		if (cmd->p.glob) {
			if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
				return(1);
		} else {
			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
				return(1);
		}
	} else {
		struct ifaddr_container *ifac;

		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ia = ifac->ifa;

			if (ia->ifa_addr == NULL)
				continue;
			if (ia->ifa_addr->sa_family != AF_INET)
				continue;
			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
			    (ia->ifa_addr))->sin_addr.s_addr)
				return(1);	/* match */
		}
	}
	return(0);	/* no match, fail ... */
}

#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
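
/*
 * SNPARGS(buf, len) expands to the (pointer, space-left) argument pair
 * for ksnprintf(), clamped to zero once the buffer is full, so
 * successive calls such as
 *
 *	len = ksnprintf(SNPARGS(proto, 0), "TCP %s", ...);
 *	ksnprintf(SNPARGS(proto, len), ":%d", ...);
 *
 * safely append to the same fixed-size buffer.
 */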

/*
 * We enter here when we have a rule with O_LOG.
 * XXX this function alone takes about 2Kbytes of code!
 */
static void
ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
    struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
{
	char *action;
	int limit_reached = 0;
	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];

	fragment[0] = '\0';
	proto[0] = '\0';

	if (f == NULL) {	/* bogus pkt */
		if (verbose_limit != 0 &&
		    ctx->ipfw_norule_counter >= verbose_limit)
			return;
		ctx->ipfw_norule_counter++;
		if (ctx->ipfw_norule_counter == verbose_limit)
			limit_reached = verbose_limit;
		action = "Refuse";
	} else {	/* O_LOG is the first action, find the real one */
		ipfw_insn *cmd = ACTION_PTR(f);
		ipfw_insn_log *l = (ipfw_insn_log *)cmd;

		if (l->max_log != 0 && l->log_left == 0)
			return;
		l->log_left--;
		if (l->log_left == 0)
			limit_reached = l->max_log;
		cmd += F_LEN(cmd);	/* point to first action */
		if (cmd->opcode == O_PROB)
			cmd += F_LEN(cmd);

		action = action2;
		switch (cmd->opcode) {
		case O_DENY:
			action = "Deny";
			break;

		case O_REJECT:
			if (cmd->arg1 == ICMP_REJECT_RST) {
				action = "Reset";
			} else if (cmd->arg1 == ICMP_UNREACH_HOST) {
				action = "Reject";
			} else {
				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
					  cmd->arg1);
			}
			break;

		case O_ACCEPT:
			action = "Accept";
			break;

		case O_COUNT:
			action = "Count";
			break;

		case O_DIVERT:
			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
			break;

		case O_TEE:
			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
			break;

		case O_SKIPTO:
			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
			break;

		case O_PIPE:
			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
			break;

		case O_QUEUE:
			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
			break;

		case O_FORWARD_IP:
			{
			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
			int len;

			len = ksnprintf(SNPARGS(action2, 0),
					"Forward to %s",
					kinet_ntoa(sa->sa.sin_addr, abuf));
			if (sa->sa.sin_port) {
				ksnprintf(SNPARGS(action2, len), ":%d",
					  sa->sa.sin_port);
			}
			}
			break;

		default:
			action = "UNKNOWN";
			break;
		}
	}

	if (hlen == 0) {	/* non-ip */
		ksnprintf(SNPARGS(proto, 0), "MAC");
	} else {
		struct ip *ip = mtod(m, struct ip *);
		/* these three are all aliases to the same thing */
		struct icmp *const icmp = L3HDR(struct icmp, ip);
		struct tcphdr *const tcp = (struct tcphdr *)icmp;
		struct udphdr *const udp = (struct udphdr *)icmp;

		int ip_off, offset, ip_len;
		int len;

		ip_off = ntohs(ip->ip_off);
		ip_len = ntohs(ip->ip_len);
		offset = ip_off & IP_OFFMASK;

		switch (ip->ip_p) {
		case IPPROTO_TCP:
			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(tcp->th_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(tcp->th_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_UDP:
			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(udp->uh_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(udp->uh_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_ICMP:
			if (offset == 0) {
				len = ksnprintf(SNPARGS(proto, 0),
						"ICMP:%u.%u ",
						icmp->icmp_type,
						icmp->icmp_code);
			} else {
				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
			}
			len += ksnprintf(SNPARGS(proto, len), "%s",
					 kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;

		default:
			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
					kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;
		}

		if (ip_off & (IP_MF | IP_OFFMASK)) {
			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
				  offset << 3, (ip_off & IP_MF) ? "+" : "");
		}
	}

	if (oif || m->m_pkthdr.rcvif) {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s %s via %s%s\n",
		    f ? f->rulenum : -1,
		    action, proto, oif ? "out" : "in",
		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
		    fragment);
	} else {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s [no if info]%s\n",
		    f ? f->rulenum : -1,
		    action, proto, fragment);
	}

	if (limit_reached) {
		log(LOG_SECURITY | LOG_NOTICE,
		    "ipfw: limit %d reached on entry %d\n",
		    limit_reached, f ? f->rulenum : -1);
	}
}

#undef SNPARGS

static void
ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
{
	struct ip_fw *rule = slave_x->xlat_rule;

	KKASSERT(rule->cpuid == mycpuid);

	/* No more cross references; free this pair now. */
	kfree(x, M_IPFW);
	kfree(slave_x, M_IPFW);

	/* See the comment in ipfw_ip_xlate_dispatch(). */
	rule->cross_refs--;
}

static void
ipfw_xlat_reap_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *s, *ns;

	ASSERT_NETISR_NCPUS(mycpuid);

	crit_enter();
	/* Reply ASAP. */
	netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
	crit_exit();

	/* TODO: limit scanning depth */
	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
		struct ipfw_xlat *slave_x = x->xlat_pair;
		uint64_t crefs;

		crefs = slave_x->xlat_crefs + x->xlat_crefs;
		if (crefs == 0) {
			TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
			ipfw_xlat_reap(x, slave_x);
		}
	}
	if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
		    &ctx->ipfw_xlatreap_nm);
	}
}

static void
ipfw_xlat_reap_timeo(void *xnm)
{
	struct netmsg_base *nm = xnm;

	KKASSERT(mycpuid < netisr_ncpus);

	crit_enter();
	if (nm->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(nm);
	crit_exit();
}

static void
ipfw_xlat_free_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
	struct ipfw_xlat *slave_x = x->xlat_pair;
	uint64_t crefs;

	ASSERT_NETISR_NCPUS(mycpuid);

	KKASSERT(slave_x != NULL);
	KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);

	KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
	    ("master xlat is still linked"));
	if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
		ipfw_state_unlink(ctx, &slave_x->xlat_st);

	/* See the comment in ipfw_ip_xlate_dispatch(). */
	slave_x->xlat_crefs--;

	crefs = slave_x->xlat_crefs + x->xlat_crefs;
	if (crefs == 0) {
		ipfw_xlat_reap(x, slave_x);
		return;
	}

	if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
		    &ctx->ipfw_xlatreap_nm);
	}

	/*
	 * This pair is still referenced; defer its destruction.
	 * YYY reuse st_link.
	 */
	TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
}

static __inline void
ipfw_xlat_invalidate(struct ipfw_xlat *x)
{
	x->xlat_invalid = 1;
	x->xlat_pair->xlat_invalid = 1;
}

static void
ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
{
	struct ipfw_xlat *x, *slave_x;
	struct netmsg_base *nm;

	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
	    IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
	KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
	    ("delete slave xlat"));

	KASSERT(ctx->ipfw_state_cnt > 0,
	    ("invalid state count %d", ctx->ipfw_state_cnt));
	ctx->ipfw_state_cnt--;
	if (ctx->ipfw_state_loosecnt > 0)
		ctx->ipfw_state_loosecnt--;

	/*
	 * Unhook this state.
	 */
	if (s->st_track != NULL) {
		struct ipfw_track *t = s->st_track;

		KASSERT(!LIST_EMPTY(&t->t_state_list),
		    ("track state list is empty"));
		LIST_REMOVE(s, st_trklink);

		KASSERT(*t->t_count > 0,
		    ("invalid track count %d", *t->t_count));
		atomic_subtract_int(t->t_count, 1);
	}
	ipfw_state_unlink(ctx, s);

	/*
	 * Free this state. Xlats require special processing, since
	 * xlats are paired states and the two halves of a pair may
	 * live on different cpus.
	 */

	if (!IPFW_ISXLAT(s->st_type)) {
		/* Not xlat; free now. */
		kfree(s, M_IPFW);
		/* Done! */
		return;
	}
	x = (struct ipfw_xlat *)s;

	if (x->xlat_pair == NULL) {
		/* Not setup yet; free now. */
		kfree(x, M_IPFW);
		/* Done! */
		return;
	}
	slave_x = x->xlat_pair;
	KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);

	if (x->xlat_pcpu == mycpuid) {
		/*
		 * Paired states are on the same cpu; delete this
		 * pair now.
		 */
		KKASSERT(x->xlat_crefs == 0);
		KKASSERT(slave_x->xlat_crefs == 0);
		if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
			ipfw_state_unlink(ctx, &slave_x->xlat_st);
		kfree(x, M_IPFW);
		kfree(slave_x, M_IPFW);
		return;
	}

	/*
	 * Free the paired states on the cpu owning the slave xlat.
	 */

	/*
	 * Mark the state pair invalid; completely deleting them
	 * may take some time.
	 */
	ipfw_xlat_invalidate(x);

	nm = &x->xlat_freenm;
	netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
	    ipfw_xlat_free_dispatch);
	nm->lmsg.u.ms_resultp = x;

	/* See the comment in ipfw_xlate_redispatch(). */
	x->xlat_rule->cross_refs++;
	x->xlat_crefs++;

	netisr_sendmsg(nm, x->xlat_pcpu);
}

static void
ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
{
	if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
		KKASSERT(IPFW_ISXLAT(s->st_type));
		ipfw_xlat_invalidate((struct ipfw_xlat *)s);
		ipfw_state_unlink(ctx, s);
		return;
	}
	ipfw_state_del(ctx, s);
}

static int
ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
{
	struct ipfw_state *s, *anchor;
	int expired;

	if (reap_max < ipfw_state_reap_min)
		reap_max = ipfw_state_reap_min;

	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
		/*
		 * Kick start state expiring. Ignore scan limit,
		 * we are short of states.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
		return (expired);
	}

	/*
	 * States are being expired.
	 */

	if (ctx->ipfw_state_cnt == 0)
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_stateexp_anch;
	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of states.
		 */

		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
			ipfw_state_del(ctx, s);
			if (++expired >= reap_max)
				break;
			if ((expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max)
				break;
		}
	}

	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached. ipfw_state_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}

static void
ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
{
	struct ipfw_state *s, *sn;

	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
		if (IPFW_STATE_SCANSKIP(s))
			continue;
		if (rule != NULL && s->st_rule != rule)
			continue;
		ipfw_state_del(ctx, s);
	}
}

static void
ipfw_state_expire_done(struct ipfw_context *ctx)
{
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
	callout_reset(&ctx->ipfw_stateto_ch, hz,
	    ipfw_state_expire_ipifunc, NULL);
}

static void
ipfw_state_expire_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_stateexp_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("stateexp more did not finish"));
	netisr_sendmsg_oncpu(nm);
}

static int
ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		if (scanned++ >= scan_max) {
			ipfw_state_expire_more(ctx);
			return (expired);
		}

		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		if (IPFW_STATE_ISDEAD(s) ||
		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
		     IPFW_STATE_TCPCLOSED(s))) {
			ipfw_state_del(ctx, s);
			if (++expired >= expire_max) {
				ipfw_state_expire_more(ctx);
				return (expired);
			}
			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
			    (expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max) {
				ipfw_state_expire_more(ctx);
				return (expired);
			}
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_state_expire_done(ctx);
	return (expired);
}

static void
ipfw_state_expire_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_stateexp_anch;
	if (ctx->ipfw_state_cnt == 0) {
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		ipfw_state_expire_done(ctx);
		return;
	}
	ipfw_state_expire_loop(ctx, anchor,
	    ipfw_state_scan_max, ipfw_state_expire_max);
}
1958 static int
1959 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1961 struct ipfw_state *anchor;
1963 KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1964 ("stateexp is in progress"));
1965 ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1967 if (ctx->ipfw_state_cnt == 0) {
1968 ipfw_state_expire_done(ctx);
1969 return (0);
1973 * Do not expire more than once per second; it is useless.
1975 if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1976 ctx->ipfw_state_lastexp == time_uptime) {
1977 ipfw_state_expire_done(ctx);
1978 return (0);
1980 ctx->ipfw_state_lastexp = time_uptime;
1982 anchor = &ctx->ipfw_stateexp_anch;
1983 TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1984 return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1987 static void
1988 ipfw_state_expire_dispatch(netmsg_t nm)
1990 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1992 ASSERT_NETISR_NCPUS(mycpuid);
1994 /* Reply ASAP */
1995 crit_enter();
1996 netisr_replymsg(&nm->base, 0);
1997 crit_exit();
1999 if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
2000 /* Running; done. */
2001 return;
2003 ipfw_state_expire_start(ctx,
2004 ipfw_state_scan_max, ipfw_state_expire_max);
2007 static void
2008 ipfw_state_expire_ipifunc(void *dummy __unused)
2010 struct netmsg_base *msg;
2012 KKASSERT(mycpuid < netisr_ncpus);
2013 msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2015 crit_enter();
2016 if (msg->lmsg.ms_flags & MSGF_DONE)
2017 netisr_sendmsg_oncpu(msg);
2018 crit_exit();
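/*
 * Track TCP progress without a full window check: per direction we
 * remember the highest sequence and ack numbers seen and treat a
 * segment that would move either value backwards as out-of-sequence
 * (FALSE).  An RST is always accepted.  The FIN/ACK bookkeeping marks
 * a FIN as acknowledged once the peer's ack covers its sequence
 * number plus one; ipfw_state_update() uses that to choose
 * dyn_fin_lifetime over dyn_finwait_lifetime.
 */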
2021 static boolean_t
2022 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
2024 uint32_t seq = ntohl(tcp->th_seq);
2025 uint32_t ack = ntohl(tcp->th_ack);
2027 if (tcp->th_flags & TH_RST)
2028 return (TRUE);
2030 if (dir == MATCH_FORWARD) {
2031 if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
2032 s->st_flags |= IPFW_STATE_F_SEQFWD;
2033 s->st_seq_fwd = seq;
2034 } else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
2035 s->st_seq_fwd = seq;
2036 } else {
2037 /* Out-of-sequence; done. */
2038 return (FALSE);
2040 if (tcp->th_flags & TH_ACK) {
2041 if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
2042 s->st_flags |= IPFW_STATE_F_ACKFWD;
2043 s->st_ack_fwd = ack;
2044 } else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
2045 s->st_ack_fwd = ack;
2046 } else {
2047 /* Out-of-sequence; done. */
2048 return (FALSE);
2051 if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
2052 (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
2053 s->st_state |= (TH_ACK << 8);
2055 } else {
2056 if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
2057 s->st_flags |= IPFW_STATE_F_SEQREV;
2058 s->st_seq_rev = seq;
2059 } else if (SEQ_GEQ(seq, s->st_seq_rev)) {
2060 s->st_seq_rev = seq;
2061 } else {
2062 /* Out-of-sequence; done. */
2063 return (FALSE);
2065 if (tcp->th_flags & TH_ACK) {
2066 if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
2067 s->st_flags |= IPFW_STATE_F_ACKREV;
2068 s->st_ack_rev = ack;
2069 } else if (SEQ_GEQ(ack, s->st_ack_rev)) {
2070 s->st_ack_rev = ack;
2071 } else {
2072 /* Out-of-sequence; done. */
2073 return (FALSE);
2076 if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
2077 s->st_ack_rev == s->st_seq_fwd + 1)
2078 s->st_state |= TH_ACK;
2081 return (TRUE);
2084 static void
2085 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
2086 const struct tcphdr *tcp, struct ipfw_state *s)
2089 if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
2090 u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
2092 if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
2093 return;
2095 s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
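/*
 * st_state accumulates the TCP flags seen so far: the low byte
 * collects the forward direction, the high byte (<< 8) the reverse
 * one, so a test like BOTH_SYN checks that a SYN was seen in both
 * directions.
 */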
2096 switch (s->st_state & IPFW_STATE_TCPSTATES) {
2097 case TH_SYN: /* opening */
2098 s->st_expire = time_uptime + dyn_syn_lifetime;
2099 break;
2101 case BOTH_SYN: /* move to established */
2102 case BOTH_SYN | TH_FIN: /* one side tries to close */
2103 case BOTH_SYN | (TH_FIN << 8):
2104 s->st_expire = time_uptime + dyn_ack_lifetime;
2105 break;
2107 case BOTH_SYN | BOTH_FIN: /* both sides closed */
2108 if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
2109 /* And both FINs were ACKed. */
2110 s->st_expire = time_uptime + dyn_fin_lifetime;
2111 } else {
2112 s->st_expire = time_uptime +
2113 dyn_finwait_lifetime;
2115 break;
2117 default:
2118 #if 0
2120 * reset or some invalid combination, but can also
2121 * occur if we use keep-state the wrong way.
2123 if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
2124 kprintf("invalid state: 0x%x\n", s->st_state);
2125 #endif
2126 s->st_expire = time_uptime + dyn_rst_lifetime;
2127 break;
2129 } else if (pkt->proto == IPPROTO_UDP) {
2130 s->st_expire = time_uptime + dyn_udp_lifetime;
2131 } else {
2132 /* other protocols */
2133 s->st_expire = time_uptime + dyn_short_lifetime;
2138 * Lookup a state.
2140 static struct ipfw_state *
2141 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2142 int *match_direction, const struct tcphdr *tcp)
2144 struct ipfw_state *key, *s;
2145 int dir = MATCH_NONE;
2147 key = &ctx->ipfw_state_tmpkey;
2148 ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2149 pkt->dst_ip, pkt->dst_port, pkt->proto);
2150 s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2151 if (s == NULL)
2152 goto done; /* not found. */
2153 if (IPFW_STATE_ISDEAD(s)) {
2154 ipfw_state_remove(ctx, s);
2155 s = NULL;
2156 goto done;
2158 if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2159 /* TCP port recycling is too fast. */
2160 ctx->ipfw_sts_tcprecycled++;
2161 ipfw_state_remove(ctx, s);
2162 s = NULL;
2163 goto done;
2166 if (s->st_swap == key->st_swap) {
2167 dir = MATCH_FORWARD;
2168 } else {
2169 KASSERT((s->st_swap & key->st_swap) == 0,
2170 ("found mismatch state"));
2171 dir = MATCH_REVERSE;
2174 /* Update this state. */
2175 ipfw_state_update(pkt, dir, tcp, s);
2177 if (s->st_track != NULL) {
2178 /* This track has been used. */
2179 s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2181 done:
2182 if (match_direction)
2183 *match_direction = dir;
2184 return (s);
2187 static struct ipfw_state *
2188 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2189 uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2191 struct ipfw_state *s;
2192 size_t sz;
2194 KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2195 ("invalid state type %u", type));
2197 sz = sizeof(struct ipfw_state);
2198 if (IPFW_ISXLAT(type))
2199 sz = sizeof(struct ipfw_xlat);
2201 s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2202 if (s == NULL) {
2203 ctx->ipfw_sts_nomem++;
2204 return (NULL);
2207 ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2208 id->dst_ip, id->dst_port, id->proto);
2210 s->st_rule = rule;
2211 s->st_type = type;
2212 if (IPFW_ISXLAT(type)) {
2213 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2215 x->xlat_dir = MATCH_NONE;
2216 x->xlat_pcpu = -1;
2220 * Update this state:
2221 * Set st_expire and st_state.
2223 ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2225 return (s);
2228 static struct ipfw_state *
2229 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2230 uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
2231 const struct tcphdr *tcp)
2233 struct ipfw_state *s, *dup;
2235 s = ipfw_state_alloc(ctx, id, type, rule, tcp);
2236 if (s == NULL)
2237 return (NULL);
2239 ctx->ipfw_state_cnt++;
2240 ctx->ipfw_state_loosecnt++;
2241 if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
2242 ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
2243 ctx->ipfw_state_loosecnt = 0;
2246 dup = ipfw_state_link(ctx, s);
2247 if (dup != NULL)
2248 panic("ipfw: %u state exists %p", type, dup);
2250 if (t != NULL) {
2251 /* Keep the track referenced. */
2252 LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
2253 s->st_track = t;
2255 return (s);
2258 static boolean_t
2259 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
2261 struct ipfw_trkcnt *trk;
2262 boolean_t trk_freed = FALSE;
2264 KASSERT(t->t_count != NULL, ("track anchor"));
2265 KASSERT(LIST_EMPTY(&t->t_state_list),
2266 ("invalid track is still referenced"));
2268 trk = t->t_trkcnt;
2269 KASSERT(trk != NULL, ("track has no trkcnt"));
2271 RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2272 TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
2273 kfree(t, M_IPFW);
2276 * fdrop() style reference counting.
2277 * See kern/kern_descrip.c fdrop().
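 * Only the final drop (1 -> 0) needs the global trkcnt token, since
 * it removes the counter from the shared tree; an ordinary drop is a
 * lock-free compare-and-swap that simply retries on contention.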
2279 for (;;) {
2280 int refs = trk->tc_refs;
2282 cpu_ccfence();
2283 KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
2284 if (refs == 1) {
2285 IPFW_TRKCNT_TOKGET;
2286 if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
2287 KASSERT(trk->tc_count == 0,
2288 ("%d states reference this trkcnt",
2289 trk->tc_count));
2290 RB_REMOVE(ipfw_trkcnt_tree,
2291 &ipfw_gd.ipfw_trkcnt_tree, trk);
2293 KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
2294 ("invalid trkcnt cnt %d",
2295 ipfw_gd.ipfw_trkcnt_cnt));
2296 ipfw_gd.ipfw_trkcnt_cnt--;
2297 IPFW_TRKCNT_TOKREL;
2299 if (ctx->ipfw_trkcnt_spare == NULL)
2300 ctx->ipfw_trkcnt_spare = trk;
2301 else
2302 kfree(trk, M_IPFW);
2303 trk_freed = TRUE;
2304 break; /* done! */
2306 IPFW_TRKCNT_TOKREL;
2307 /* retry */
2308 } else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2309 break; /* done! */
2311 /* retry */
2313 return (trk_freed);
2316 static void
2317 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2319 struct ipfw_track *t, *tn;
2321 TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2322 if (t->t_count == NULL) /* anchor */
2323 continue;
2324 if (rule != NULL && t->t_rule != rule)
2325 continue;
2326 ipfw_track_free(ctx, t);
2330 static boolean_t
2331 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2332 boolean_t reap)
2334 struct ipfw_state *s, *sn;
2335 boolean_t ret = FALSE;
2337 KASSERT(t->t_count != NULL, ("track anchor"));
2339 if (LIST_EMPTY(&t->t_state_list))
2340 return (FALSE);
2343 * Do not expire more than once per second; it is useless.
2345 if (t->t_lastexp == time_uptime)
2346 return (FALSE);
2347 t->t_lastexp = time_uptime;
2349 LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2350 if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2351 KASSERT(s->st_track == t,
2352 ("state track %p does not match %p",
2353 s->st_track, t));
2354 ipfw_state_del(ctx, s);
2355 ret = TRUE;
2358 return (ret);
2361 static __inline struct ipfw_trkcnt *
2362 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2364 struct ipfw_trkcnt *trk;
2366 if (ctx->ipfw_trkcnt_spare != NULL) {
2367 trk = ctx->ipfw_trkcnt_spare;
2368 ctx->ipfw_trkcnt_spare = NULL;
2369 } else {
2370 trk = kmalloc(sizeof(*trk), M_IPFW,
2371 M_INTWAIT | M_NULLOK | M_CACHEALIGN);
2373 return (trk);
2376 static void
2377 ipfw_track_expire_done(struct ipfw_context *ctx)
2380 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2381 ("trackexp is not in progress"));
2382 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2383 callout_reset(&ctx->ipfw_trackto_ch, hz,
2384 ipfw_track_expire_ipifunc, NULL);
2387 static void
2388 ipfw_track_expire_more(struct ipfw_context *ctx)
2390 struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2392 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2393 ("trackexp is not in progress"));
2394 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2395 ("trackexp more did not finish"));
2396 netisr_sendmsg_oncpu(nm);
2399 static int
2400 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2401 int scan_max, int expire_max)
2403 struct ipfw_track *t;
2404 int scanned = 0, expired = 0;
2405 boolean_t reap = FALSE;
2407 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2408 ("trackexp is not in progress"));
2410 if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2411 reap = TRUE;
2413 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2414 if (scanned++ >= scan_max) {
2415 ipfw_track_expire_more(ctx);
2416 return (expired);
2419 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2420 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2422 if (t->t_count == NULL) /* anchor */
2423 continue;
2425 ipfw_track_state_expire(ctx, t, reap);
2426 if (!LIST_EMPTY(&t->t_state_list)) {
2427 /* There are states referencing this track. */
2428 continue;
2431 if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2432 /* Expired. */
2433 if (ipfw_track_free(ctx, t)) {
2434 if (++expired >= expire_max) {
2435 ipfw_track_expire_more(ctx);
2436 return (expired);
2441 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2442 ipfw_track_expire_done(ctx);
2443 return (expired);
2446 static int
2447 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2449 struct ipfw_track *anchor;
2451 KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2452 ("trackexp is in progress"));
2453 ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2455 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2456 ipfw_track_expire_done(ctx);
2457 return (0);
2461 * Do not expire more than once per second; it is useless.
2463 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2464 ctx->ipfw_track_lastexp == time_uptime) {
2465 ipfw_track_expire_done(ctx);
2466 return (0);
2468 ctx->ipfw_track_lastexp = time_uptime;
2470 anchor = &ctx->ipfw_trackexp_anch;
2471 TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2472 return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2475 static void
2476 ipfw_track_expire_more_dispatch(netmsg_t nm)
2478 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2479 struct ipfw_track *anchor;
2481 ASSERT_NETISR_NCPUS(mycpuid);
2482 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2483 ("trackexp is not in progress"));
2485 /* Reply ASAP */
2486 netisr_replymsg(&nm->base, 0);
2488 anchor = &ctx->ipfw_trackexp_anch;
2489 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2490 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2491 ipfw_track_expire_done(ctx);
2492 return;
2494 ipfw_track_expire_loop(ctx, anchor,
2495 ipfw_track_scan_max, ipfw_track_expire_max);
2498 static void
2499 ipfw_track_expire_dispatch(netmsg_t nm)
2501 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2503 ASSERT_NETISR_NCPUS(mycpuid);
2505 /* Reply ASAP */
2506 crit_enter();
2507 netisr_replymsg(&nm->base, 0);
2508 crit_exit();
2510 if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2511 /* Running; done. */
2512 return;
2514 ipfw_track_expire_start(ctx,
2515 ipfw_track_scan_max, ipfw_track_expire_max);
2518 static void
2519 ipfw_track_expire_ipifunc(void *dummy __unused)
2521 struct netmsg_base *msg;
2523 KKASSERT(mycpuid < netisr_ncpus);
2524 msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2526 crit_enter();
2527 if (msg->lmsg.ms_flags & MSGF_DONE)
2528 netisr_sendmsg_oncpu(msg);
2529 crit_exit();
2532 static int
2533 ipfw_track_reap(struct ipfw_context *ctx)
2535 struct ipfw_track *t, *anchor;
2536 int expired;
2538 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2540 * Kick start track expiring. Ignore scan limit,
2541 * we are short of tracks.
2543 ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2544 expired = ipfw_track_expire_start(ctx, INT_MAX,
2545 ipfw_track_reap_max);
2546 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2547 return (expired);
2551 * Tracks are being expired.
2554 if (RB_EMPTY(&ctx->ipfw_track_tree))
2555 return (0);
2557 expired = 0;
2558 anchor = &ctx->ipfw_trackexp_anch;
2559 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2561 * Ignore scan limit; we are short of tracks.
2564 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2565 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2567 if (t->t_count == NULL) /* anchor */
2568 continue;
2570 ipfw_track_state_expire(ctx, t, TRUE);
2571 if (!LIST_EMPTY(&t->t_state_list)) {
2572 /* There are states referencing this track. */
2573 continue;
2576 if (ipfw_track_free(ctx, t)) {
2577 if (++expired >= ipfw_track_reap_max) {
2578 ipfw_track_expire_more(ctx);
2579 break;
2584 * NOTE:
2585 * Leave the anchor on the list, even if the end of the list has
2586 * been reached. ipfw_track_expire_more_dispatch() will handle
2587 * the removal.
2589 return (expired);
2592 static struct ipfw_track *
2593 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2594 uint16_t limit_mask, struct ip_fw *rule)
2596 struct ipfw_track *key, *t, *dup;
2597 struct ipfw_trkcnt *trk, *ret;
2598 boolean_t do_expire = FALSE;
2600 KASSERT(rule->track_ruleid != 0,
2601 ("rule %u has no track ruleid", rule->rulenum));
2603 key = &ctx->ipfw_track_tmpkey;
2604 key->t_proto = id->proto;
2605 key->t_addrs = 0;
2606 key->t_ports = 0;
2607 key->t_rule = rule;
2608 if (limit_mask & DYN_SRC_ADDR)
2609 key->t_saddr = id->src_ip;
2610 if (limit_mask & DYN_DST_ADDR)
2611 key->t_daddr = id->dst_ip;
2612 if (limit_mask & DYN_SRC_PORT)
2613 key->t_sport = id->src_port;
2614 if (limit_mask & DYN_DST_PORT)
2615 key->t_dport = id->dst_port;
2617 t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2618 if (t != NULL)
2619 goto done;
2621 t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2622 if (t == NULL) {
2623 ctx->ipfw_tks_nomem++;
2624 return (NULL);
2627 t->t_key = key->t_key;
2628 t->t_rule = rule;
2629 t->t_lastexp = 0;
2630 LIST_INIT(&t->t_state_list);
2632 if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2633 time_t globexp, uptime;
2635 trk = NULL;
2636 do_expire = TRUE;
2639 * Do not expire globally more than once per second;
2640 * it is useless.
2642 uptime = time_uptime;
2643 globexp = ipfw_gd.ipfw_track_globexp;
2644 if (globexp != uptime &&
2645 atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2646 globexp, uptime)) {
2647 int cpu;
2649 /* Expire tracks on other CPUs. */
2650 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2651 if (cpu == mycpuid)
2652 continue;
2653 lwkt_send_ipiq(globaldata_find(cpu),
2654 ipfw_track_expire_ipifunc, NULL);
2657 } else {
2658 trk = ipfw_trkcnt_alloc(ctx);
2660 if (trk == NULL) {
2661 struct ipfw_trkcnt *tkey;
2663 tkey = &ctx->ipfw_trkcnt_tmpkey;
2664 key = NULL; /* tkey overlaps key */
2666 tkey->tc_key = t->t_key;
2667 tkey->tc_ruleid = rule->track_ruleid;
2669 IPFW_TRKCNT_TOKGET;
2670 trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2671 tkey);
2672 if (trk == NULL) {
2673 IPFW_TRKCNT_TOKREL;
2674 if (do_expire) {
2675 ctx->ipfw_tks_reap++;
2676 if (ipfw_track_reap(ctx) > 0) {
2677 if (ipfw_gd.ipfw_trkcnt_cnt <
2678 ipfw_track_max) {
2679 trk = ipfw_trkcnt_alloc(ctx);
2680 if (trk != NULL)
2681 goto install;
2682 ctx->ipfw_tks_cntnomem++;
2683 } else {
2684 ctx->ipfw_tks_overflow++;
2686 } else {
2687 ctx->ipfw_tks_reapfailed++;
2688 ctx->ipfw_tks_overflow++;
2690 } else {
2691 ctx->ipfw_tks_cntnomem++;
2693 kfree(t, M_IPFW);
2694 return (NULL);
2696 KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2697 ("invalid trkcnt refs %d", trk->tc_refs));
2698 atomic_add_int(&trk->tc_refs, 1);
2699 IPFW_TRKCNT_TOKREL;
2700 } else {
2701 install:
2702 trk->tc_key = t->t_key;
2703 trk->tc_ruleid = rule->track_ruleid;
2704 trk->tc_refs = 0;
2705 trk->tc_count = 0;
2706 trk->tc_expire = 0;
2707 trk->tc_rulenum = rule->rulenum;
2709 IPFW_TRKCNT_TOKGET;
2710 ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2711 trk);
2712 if (ret != NULL) {
2713 KASSERT(ret->tc_refs > 0 &&
2714 ret->tc_refs < netisr_ncpus,
2715 ("invalid trkcnt refs %d", ret->tc_refs));
2716 KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2717 ("trkcnt spare was installed"));
2718 ctx->ipfw_trkcnt_spare = trk;
2719 trk = ret;
2720 } else {
2721 ipfw_gd.ipfw_trkcnt_cnt++;
2723 atomic_add_int(&trk->tc_refs, 1);
2724 IPFW_TRKCNT_TOKREL;
2726 t->t_count = &trk->tc_count;
2727 t->t_trkcnt = trk;
2729 dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2730 if (dup != NULL)
2731 panic("ipfw: track exists");
2732 TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2733 done:
2734 t->t_expire = time_uptime + dyn_short_lifetime;
2735 return (t);
2739 * Install state for rule type cmd->o.opcode
2741 * Returns NULL if state is not installed because of errors or because
2742 * state limitations are enforced.
2744 static struct ipfw_state *
2745 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2746 ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2748 struct ipfw_state *s;
2749 struct ipfw_track *t;
2750 int count, diff;
2752 if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2753 (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2754 boolean_t overflow = TRUE;
2756 ctx->ipfw_sts_reap++;
2757 if (ipfw_state_reap(ctx, diff) == 0)
2758 ctx->ipfw_sts_reapfailed++;
2759 if (ipfw_state_cntsync() < ipfw_state_max)
2760 overflow = FALSE;
2762 if (overflow) {
2763 time_t globexp, uptime;
2764 int cpu;
2767 * Do not expire globally more than once per second;
2768 * it is useless.
2770 uptime = time_uptime;
2771 globexp = ipfw_gd.ipfw_state_globexp;
2772 if (globexp == uptime ||
2773 !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2774 globexp, uptime)) {
2775 ctx->ipfw_sts_overflow++;
2776 return (NULL);
2779 /* Expire states on other CPUs. */
2780 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2781 if (cpu == mycpuid)
2782 continue;
2783 lwkt_send_ipiq(globaldata_find(cpu),
2784 ipfw_state_expire_ipifunc, NULL);
2786 ctx->ipfw_sts_overflow++;
2787 return (NULL);
2791 switch (cmd->o.opcode) {
2792 case O_KEEP_STATE: /* bidir rule */
2793 case O_REDIRECT:
2794 s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
2795 tcp);
2796 if (s == NULL)
2797 return (NULL);
2798 break;
2800 case O_LIMIT: /* limit number of sessions */
2801 t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2802 if (t == NULL)
2803 return (NULL);
2805 if (*t->t_count >= cmd->conn_limit) {
2806 if (!ipfw_track_state_expire(ctx, t, TRUE))
2807 return (NULL);
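/*
 * Take a slot under the limit with a lock-free CAS loop: re-read the
 * shared counter, give up if the limit has been reached, and retry
 * if another CPU raced us on the increment.
 */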
2809 for (;;) {
2810 count = *t->t_count;
2811 if (count >= cmd->conn_limit)
2812 return (NULL);
2813 if (atomic_cmpset_int(t->t_count, count, count + 1))
2814 break;
2817 s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2818 if (s == NULL) {
2819 /* Undo damage. */
2820 atomic_subtract_int(t->t_count, 1);
2821 return (NULL);
2823 break;
2825 default:
2826 panic("unknown state type %u\n", cmd->o.opcode);
2829 if (s->st_type == O_REDIRECT) {
2830 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2831 ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;
2833 x->xlat_addr = r->addr.s_addr;
2834 x->xlat_port = r->port;
2835 x->xlat_ifp = args->m->m_pkthdr.rcvif;
2836 x->xlat_dir = MATCH_FORWARD;
2837 KKASSERT(x->xlat_ifp != NULL);
2839 return (s);
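/*
 * Table lookup is a longest-prefix match on the per-context radix
 * tree selected by tableid; a hit also bumps the entry's use counter
 * and its last-use timestamp.
 */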
2842 static int
2843 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2844 const struct in_addr *in)
2846 struct radix_node_head *rnh;
2847 struct sockaddr_in sin;
2848 struct ipfw_tblent *te;
2850 KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2851 rnh = ctx->ipfw_tables[tableid];
2852 if (rnh == NULL)
2853 return (0); /* no match */
2855 memset(&sin, 0, sizeof(sin));
2856 sin.sin_family = AF_INET;
2857 sin.sin_len = sizeof(sin);
2858 sin.sin_addr = *in;
2860 te = (struct ipfw_tblent *)rnh->rnh_matchaddr(&sin, rnh);
2861 if (te == NULL)
2862 return (0); /* no match */
2864 te->te_use++;
2865 te->te_lastuse = time_second;
2866 return (1); /* match */
2870 * Transmit a TCP packet, containing either a RST or a keepalive.
2871 * When flags & TH_RST, we are sending a RST packet because a
2872 * "reset" action matched the packet. Otherwise we are sending a
2873 * keepalive, and flags & TH_SYN determines its direction.
2875 * Only {src,dst}_{ip,port} of "id" are used.
2877 static void
2878 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2880 struct mbuf *m;
2881 struct ip *ip;
2882 struct tcphdr *tcp;
2883 struct route sro; /* fake route */
2885 MGETHDR(m, M_NOWAIT, MT_HEADER);
2886 if (m == NULL)
2887 return;
2888 m->m_pkthdr.rcvif = NULL;
2889 m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2890 m->m_data += max_linkhdr;
2892 ip = mtod(m, struct ip *);
2893 bzero(ip, m->m_len);
2894 tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2895 ip->ip_p = IPPROTO_TCP;
2896 tcp->th_off = 5;
2899 * Assume we are sending a RST (or a keepalive in the reverse
2900 * direction), swap src and destination addresses and ports.
2902 ip->ip_src.s_addr = htonl(id->dst_ip);
2903 ip->ip_dst.s_addr = htonl(id->src_ip);
2904 tcp->th_sport = htons(id->dst_port);
2905 tcp->th_dport = htons(id->src_port);
2906 if (flags & TH_RST) { /* we are sending a RST */
2907 if (flags & TH_ACK) {
2908 tcp->th_seq = htonl(ack);
2909 tcp->th_ack = htonl(0);
2910 tcp->th_flags = TH_RST;
2911 } else {
2912 if (flags & TH_SYN)
2913 seq++;
2914 tcp->th_seq = htonl(0);
2915 tcp->th_ack = htonl(seq);
2916 tcp->th_flags = TH_RST | TH_ACK;
2918 } else {
2920 * We are sending a keepalive. flags & TH_SYN determines
2921 * the direction, forward if set, reverse if clear.
2922 * NOTE: seq and ack are always assumed to be correct
2923 * as set by the caller. This may be confusing...
2925 if (flags & TH_SYN) {
2927 * we have to rewrite the correct addresses!
2929 ip->ip_dst.s_addr = htonl(id->dst_ip);
2930 ip->ip_src.s_addr = htonl(id->src_ip);
2931 tcp->th_dport = htons(id->dst_port);
2932 tcp->th_sport = htons(id->src_port);
2934 tcp->th_seq = htonl(seq);
2935 tcp->th_ack = htonl(ack);
2936 tcp->th_flags = TH_ACK;
2940 * set ip_len to the payload size so we can compute
2941 * the tcp checksum on the pseudoheader
2942 * XXX check this, could save a couple of words ?
2944 ip->ip_len = htons(sizeof(struct tcphdr));
2945 tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2948 * now fill fields left out earlier
2950 ip->ip_ttl = ip_defttl;
2951 ip->ip_len = htons(m->m_pkthdr.len);
2953 bzero(&sro, sizeof(sro));
2954 ip_rtaddr(ip->ip_dst, &sro);
2956 m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2957 ip_output(m, NULL, &sro, 0, NULL, NULL);
2958 if (sro.ro_rt)
2959 RTFREE(sro.ro_rt);
2963 * Send a reject message, consuming the mbuf passed as an argument.
2965 static void
2966 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2968 if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2969 /* IP header is always left in network order */
2970 icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2971 } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2972 struct tcphdr *const tcp =
2973 L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2975 if ((tcp->th_flags & TH_RST) == 0) {
2976 send_pkt(&args->f_id, ntohl(tcp->th_seq),
2977 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2979 m_freem(args->m);
2980 } else {
2981 m_freem(args->m);
2983 args->m = NULL;
2987 * Given an ip_fw *, lookup_next_rule will return a pointer
2988 * to the next rule, which can be either the jump
2989 * target (for skipto instructions) or the next one in the list (in
2990 * all other cases including a missing jump target).
2991 * The result is also written in the "next_rule" field of the rule.
2992 * Backward jumps are not allowed, so start looking from the next
2993 * rule...
2995 * This never returns NULL -- in case we do not have an exact match,
2996 * the next rule is returned. When the ruleset is changed,
2997 * pointers are flushed so we are always correct.
2999 static struct ip_fw *
3000 lookup_next_rule(struct ip_fw *me)
3002 struct ip_fw *rule = NULL;
3003 ipfw_insn *cmd;
3005 /* look for action, in case it is a skipto */
3006 cmd = ACTION_PTR(me);
3007 if (cmd->opcode == O_LOG)
3008 cmd += F_LEN(cmd);
3009 if (cmd->opcode == O_SKIPTO) {
3010 for (rule = me->next; rule; rule = rule->next) {
3011 if (rule->rulenum >= cmd->arg1)
3012 break;
3015 if (rule == NULL) /* failure or not a skipto */
3016 rule = me->next;
3017 me->next_rule = rule;
3018 return rule;
3021 static int
3022 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3023 enum ipfw_opcodes opcode, uid_t uid)
3025 struct in_addr src_ip, dst_ip;
3026 struct inpcbinfo *pi;
3027 boolean_t wildcard;
3028 struct inpcb *pcb;
3030 if (fid->proto == IPPROTO_TCP) {
3031 wildcard = FALSE;
3032 pi = &tcbinfo[mycpuid];
3033 } else if (fid->proto == IPPROTO_UDP) {
3034 wildcard = TRUE;
3035 pi = &udbinfo[mycpuid];
3036 } else {
3037 return 0;
3041 * Values in 'fid' are in host byte order
3043 dst_ip.s_addr = htonl(fid->dst_ip);
3044 src_ip.s_addr = htonl(fid->src_ip);
3045 if (oif) {
3046 pcb = in_pcblookup_hash(pi,
3047 dst_ip, htons(fid->dst_port),
3048 src_ip, htons(fid->src_port),
3049 wildcard, oif);
3050 } else {
3051 pcb = in_pcblookup_hash(pi,
3052 src_ip, htons(fid->src_port),
3053 dst_ip, htons(fid->dst_port),
3054 wildcard, NULL);
3056 if (pcb == NULL || pcb->inp_socket == NULL)
3057 return 0;
3059 if (opcode == O_UID) {
3060 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
3061 return !socheckuid(pcb->inp_socket, uid);
3062 #undef socheckuid
3063 } else {
3064 return groupmember(uid, pcb->inp_socket->so_cred);
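/*
 * Match an address against an interface's address or network.  The
 * first use resolves cmd->ifname to the interface's first IPv4
 * address/netmask and caches the result in the instruction itself
 * (IPFW_IFIP_VALID), so subsequent matches reduce to a simple
 * mask-and-compare.
 */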
3068 static int
3069 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
3072 if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
3073 struct ifaddr_container *ifac;
3074 struct ifnet *ifp;
3076 ifp = ifunit_netisr(cmd->ifname);
3077 if (ifp == NULL)
3078 return (0);
3080 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
3081 struct ifaddr *ia = ifac->ifa;
3083 if (ia->ifa_addr == NULL)
3084 continue;
3085 if (ia->ifa_addr->sa_family != AF_INET)
3086 continue;
3088 cmd->mask.s_addr = INADDR_ANY;
3089 if (cmd->o.arg1 & IPFW_IFIP_NET) {
3090 cmd->mask = ((struct sockaddr_in *)
3091 ia->ifa_netmask)->sin_addr;
3093 if (cmd->mask.s_addr == INADDR_ANY)
3094 cmd->mask.s_addr = INADDR_BROADCAST;
3096 cmd->addr =
3097 ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
3098 cmd->addr.s_addr &= cmd->mask.s_addr;
3100 cmd->o.arg1 |= IPFW_IFIP_VALID;
3101 break;
3103 if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
3104 return (0);
3106 return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
3109 static void
3110 ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
3111 struct in_addr *old_addr, uint16_t *old_port)
3113 struct ip *ip = mtod(m, struct ip *);
3114 struct in_addr *addr;
3115 uint16_t *port, *csum, dlen = 0;
3116 uint8_t udp = 0;
3117 boolean_t pseudo = FALSE;
3119 if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
3120 addr = &ip->ip_src;
3121 switch (ip->ip_p) {
3122 case IPPROTO_TCP:
3123 port = &L3HDR(struct tcphdr, ip)->th_sport;
3124 csum = &L3HDR(struct tcphdr, ip)->th_sum;
3125 break;
3126 case IPPROTO_UDP:
3127 port = &L3HDR(struct udphdr, ip)->uh_sport;
3128 csum = &L3HDR(struct udphdr, ip)->uh_sum;
3129 udp = 1;
3130 break;
3131 default:
3132 panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
3134 } else {
3135 addr = &ip->ip_dst;
3136 switch (ip->ip_p) {
3137 case IPPROTO_TCP:
3138 port = &L3HDR(struct tcphdr, ip)->th_dport;
3139 csum = &L3HDR(struct tcphdr, ip)->th_sum;
3140 break;
3141 case IPPROTO_UDP:
3142 port = &L3HDR(struct udphdr, ip)->uh_dport;
3143 csum = &L3HDR(struct udphdr, ip)->uh_sum;
3144 udp = 1;
3145 break;
3146 default:
3147 panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
3150 if (old_addr != NULL)
3151 *old_addr = *addr;
3152 if (old_port != NULL) {
3153 if (x->xlat_port != 0)
3154 *old_port = *port;
3155 else
3156 *old_port = 0;
3159 if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
3160 if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
3161 dlen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
3162 pseudo = TRUE;
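/*
 * If the L4 checksum is still to be computed (CSUM_TCP/UDP) or TSO
 * is pending, only the pseudo-header sum below needs refreshing;
 * otherwise patch both the IP and the TCP/UDP checksums
 * incrementally (RFC 1624 style) via pfil_cksum_fixup() instead of
 * recomputing them.
 */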
3165 if (!pseudo) {
3166 const uint16_t *oaddr, *naddr;
3168 oaddr = (const uint16_t *)&addr->s_addr;
3169 naddr = (const uint16_t *)&x->xlat_addr;
3171 ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
3172 oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
3173 *csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
3174 oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
3176 addr->s_addr = x->xlat_addr;
3178 if (x->xlat_port != 0) {
3179 if (!pseudo) {
3180 *csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
3181 udp);
3183 *port = x->xlat_port;
3186 if (pseudo) {
3187 *csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
3188 htons(dlen + ip->ip_p));
3192 static void
3193 ipfw_ip_xlate_dispatch(netmsg_t nmsg)
3195 struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
3196 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3197 struct mbuf *m = nm->m;
3198 struct ipfw_xlat *x = nm->arg1;
3199 struct ip_fw *rule = x->xlat_rule;
3201 ASSERT_NETISR_NCPUS(mycpuid);
3202 KASSERT(rule->cpuid == mycpuid,
3203 ("rule does not belong to cpu%d", mycpuid));
3204 KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
3205 ("mbuf does not have ipfw continue rule"));
3207 KASSERT(ctx->ipfw_cont_rule == NULL,
3208 ("pending ipfw continue rule"));
3209 KASSERT(ctx->ipfw_cont_xlat == NULL,
3210 ("pending ipfw continue xlat"));
3211 ctx->ipfw_cont_rule = rule;
3212 ctx->ipfw_cont_xlat = x;
3214 if (nm->arg2 == 0)
3215 ip_input(m);
3216 else
3217 ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
3219 /* May not be cleared if ipfw was unloaded/disabled. */
3220 ctx->ipfw_cont_rule = NULL;
3221 ctx->ipfw_cont_xlat = NULL;
3224 * This state is no longer used; decrement its xlat_crefs,
3225 * so this state can be deleted.
3227 x->xlat_crefs--;
3229 * This rule is no longer used; decrement its cross_refs,
3230 * so this rule can be deleted.
3232 * NOTE:
3233 * Decrement cross_refs in the last step of this function,
3234 * so that the module could be unloaded safely.
3236 rule->cross_refs--;
3239 static void
3240 ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
3241 uint32_t flags)
3243 struct netmsg_genpkt *nm;
3245 KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
3246 x->xlat_pcpu, cpuid));
3249 * Bump cross_refs to prevent this rule and its siblings
3250 * from being deleted, while this mbuf is inflight. The
3251 * cross_refs of the sibling rule on the target cpu will
3252 * be decremented, once this mbuf is going to be filtered
3253 * on the target cpu.
3255 x->xlat_rule->cross_refs++;
3257 * Bump xlat_crefs to prevent this state and its paired
3258 * state from being deleted, while this mbuf is inflight.
3259 * The xlat_crefs of the paired state on the target cpu
3260 * will be decremented, once this mbuf is going to be
3261 * filtered on the target cpu.
3263 x->xlat_crefs++;
3265 m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
3266 if (flags & IPFW_XLATE_INSERT)
3267 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
3268 if (flags & IPFW_XLATE_FORWARD)
3269 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;
3272 * NOTE: We always leave ip_len and ip_off in network
3273 * order across all network layers.
3275 nm = &m->m_hdr.mh_genmsg;
3276 netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
3277 ipfw_ip_xlate_dispatch);
3278 nm->m = m;
3279 nm->arg1 = x->xlat_pair;
3280 nm->arg2 = 0;
3281 if (flags & IPFW_XLATE_OUTPUT)
3282 nm->arg2 = 1;
3283 netisr_sendmsg(&nm->base, cpuid);
3286 static struct mbuf *
3287 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3288 struct ip_fw_local *local, struct ip **ip0)
3290 struct ip *ip = mtod(m, struct ip *);
3291 struct tcphdr *tcp;
3292 struct udphdr *udp;
3295 * Collect parameters into local variables for faster matching.
3297 if (hlen == 0) { /* do not grab addresses for non-ip pkts */
3298 local->proto = args->f_id.proto = 0; /* mark f_id invalid */
3299 goto done;
3302 local->proto = args->f_id.proto = ip->ip_p;
3303 local->src_ip = ip->ip_src;
3304 local->dst_ip = ip->ip_dst;
3305 local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
3306 local->ip_len = ntohs(ip->ip_len);
3308 #define PULLUP_TO(len) \
3309 do { \
3310 if (m->m_len < (len)) { \
3311 args->m = m = m_pullup(m, (len)); \
3312 if (m == NULL) { \
3313 ip = NULL; \
3314 goto done; \
3316 ip = mtod(m, struct ip *); \
3318 } while (0)
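/*
 * NOTE: m_pullup() may return a different mbuf, or NULL on failure,
 * so the macro refreshes both args->m and the cached 'ip' pointer
 * after every pullup.
 */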
3320 if (local->offset == 0) {
3321 switch (local->proto) {
3322 case IPPROTO_TCP:
3323 PULLUP_TO(hlen + sizeof(struct tcphdr));
3324 local->tcp = tcp = L3HDR(struct tcphdr, ip);
3325 local->dst_port = tcp->th_dport;
3326 local->src_port = tcp->th_sport;
3327 args->f_id.flags = tcp->th_flags;
3328 break;
3330 case IPPROTO_UDP:
3331 PULLUP_TO(hlen + sizeof(struct udphdr));
3332 udp = L3HDR(struct udphdr, ip);
3333 local->dst_port = udp->uh_dport;
3334 local->src_port = udp->uh_sport;
3335 break;
3337 case IPPROTO_ICMP:
3338 PULLUP_TO(hlen + 4); /* type, code and checksum. */
3339 args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
3340 break;
3342 default:
3343 break;
3347 #undef PULLUP_TO
3349 args->f_id.src_ip = ntohl(local->src_ip.s_addr);
3350 args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
3351 args->f_id.src_port = local->src_port = ntohs(local->src_port);
3352 args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
3353 done:
3354 *ip0 = ip;
3355 return (m);
3358 static struct mbuf *
3359 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3360 struct ip_fw_local *local, struct ip **ip0)
3362 m->m_flags &= ~M_HASH;
3363 ip_hashfn(&m, 0);
3364 args->m = m;
3365 if (m == NULL) {
3366 *ip0 = NULL;
3367 return (NULL);
3369 KASSERT(m->m_flags & M_HASH, ("no hash"));
3371 /* 'm' might be changed by ip_hashfn(). */
3372 return (ipfw_setup_local(m, hlen, args, local, ip0));
3376 * The main check routine for the firewall.
3378 * All arguments are in args so we can modify them and return them
3379 * back to the caller.
3381 * Parameters:
3383 * args->m (in/out) The packet; we set to NULL when/if we nuke it.
3384 * Starts with the IP header.
3385 * args->eh (in) Mac header if present, or NULL for layer3 packet.
3386 * args->oif Outgoing interface, or NULL if packet is incoming.
3387 * The incoming interface is in the mbuf. (in)
3389 * args->rule Pointer to the last matching rule (in/out)
3390 * args->f_id Addresses grabbed from the packet (out)
3392 * Return value:
3394 * If the packet was denied/rejected and has been dropped, *m is equal
3395 * to NULL upon return.
3397 * IP_FW_DENY the packet must be dropped.
3398 * IP_FW_PASS The packet is to be accepted and routed normally.
3399 * IP_FW_DIVERT Divert the packet to port (args->cookie)
3400 * IP_FW_TEE Tee the packet to port (args->cookie)
3401 * IP_FW_DUMMYNET Send the packet to pipe/queue (args->cookie)
3402 * IP_FW_CONTINUE Continue processing on another cpu.
3404 static int
3405 ipfw_chk(struct ip_fw_args *args)
3408 * Local variables hold state during the processing of a packet.
3410 * IMPORTANT NOTE: to speed up the processing of rules, there
3411 * are some assumptions on the values of the variables, which
3412 * are documented here. Should you change them, please check
3413 * the implementation of the various instructions to make sure
3414 * that they still work.
3416 * args->eh The MAC header. It is non-NULL for a layer-2
3417 * packet and NULL for a layer-3 packet.
3419 * m | args->m Pointer to the mbuf, as received from the caller.
3420 * It may change if ipfw_chk() does an m_pullup, or if it
3421 * consumes the packet because it calls send_reject().
3422 * XXX This has to change, so that ipfw_chk() never modifies
3423 * or consumes the buffer.
3424 * ip is simply an alias of the value of m, and it is kept
3425 * in sync with it (the packet is supposed to start with
3426 * the ip header).
3428 struct mbuf *m = args->m;
3429 struct ip *ip = mtod(m, struct ip *);
3432 * oif | args->oif If NULL, ipfw_chk has been called on the
3433 * inbound path (ether_input, ip_input).
3434 * If non-NULL, ipfw_chk has been called on the outbound path
3435 * (ether_output, ip_output).
3437 struct ifnet *oif = args->oif;
3439 struct ip_fw *f = NULL; /* matching rule */
3440 int retval = IP_FW_PASS;
3441 struct m_tag *mtag;
3442 struct divert_info *divinfo;
3443 struct ipfw_state *s;
3446 * hlen The length of the IPv4 header.
3447 * hlen >0 means we have an IPv4 packet.
3449 u_int hlen = 0; /* hlen >0 means we have an IP pkt */
3451 struct ip_fw_local lc;
3454 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3455 * MATCH_NONE when checked and not matched (dyn_f = NULL),
3456 * MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3458 int dyn_dir = MATCH_UNKNOWN;
3459 struct ip_fw *dyn_f = NULL;
3460 int cpuid = mycpuid;
3461 struct ipfw_context *ctx;
3463 ASSERT_NETISR_NCPUS(cpuid);
3464 ctx = ipfw_ctx[cpuid];
3466 if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3467 return IP_FW_PASS; /* accept */
3469 if (args->eh == NULL || /* layer 3 packet */
3470 (m->m_pkthdr.len >= sizeof(struct ip) &&
3471 ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3472 hlen = ip->ip_hl << 2;
3474 memset(&lc, 0, sizeof(lc));
3476 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3477 if (m == NULL)
3478 goto pullup_failed;
3480 if (args->rule) {
3482 * Packet has already been tagged. Look for the next rule
3483 * to restart processing.
3485 * If fw_one_pass != 0 then just accept it.
3486 * XXX should not happen here, but optimized out in
3487 * the caller.
3489 if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3490 return IP_FW_PASS;
3491 args->flags &= ~IP_FWARG_F_CONT;
3493 /* This rule is being/has been flushed */
3494 if (ipfw_flushing)
3495 return IP_FW_DENY;
3497 KASSERT(args->rule->cpuid == cpuid,
3498 ("rule used on cpu%d", cpuid));
3500 /* This rule was deleted */
3501 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3502 return IP_FW_DENY;
3504 if (args->xlat != NULL) {
3505 struct ipfw_xlat *x = args->xlat;
3507 /* This xlat is being deleted. */
3508 if (x->xlat_invalid)
3509 return IP_FW_DENY;
3511 f = args->rule;
3513 dyn_f = f;
3514 dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3515 MATCH_FORWARD : MATCH_REVERSE;
3517 if (args->flags & IP_FWARG_F_XLATINS) {
3518 KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3519 ("not slave %u state", x->xlat_type));
3520 s = ipfw_state_link(ctx, &x->xlat_st);
3521 if (s != NULL) {
3522 ctx->ipfw_xlate_conflicts++;
3523 if (IPFW_STATE_ISDEAD(s)) {
3524 ipfw_state_remove(ctx, s);
3525 s = ipfw_state_link(ctx,
3526 &x->xlat_st);
3528 if (s != NULL) {
3529 if (bootverbose) {
3530 kprintf("ipfw: "
3531 "slave %u state "
3532 "conflicts %u state\n",
3533 x->xlat_type,
3534 s->st_type);
3536 ipfw_xlat_invalidate(x);
3537 return IP_FW_DENY;
3539 ctx->ipfw_xlate_cresolved++;
3541 } else {
3542 ipfw_state_update(&args->f_id, dyn_dir,
3543 lc.tcp, &x->xlat_st);
3545 } else {
3546 /* TODO: setup dyn_f, dyn_dir */
3548 f = args->rule->next_rule;
3549 if (f == NULL)
3550 f = lookup_next_rule(args->rule);
3552 } else {
3554 * Find the starting rule. It can be either the first
3555 * one, or the one after divert_rule if asked so.
3557 int skipto;
3559 KKASSERT((args->flags &
3560 (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3561 KKASSERT(args->xlat == NULL);
3563 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3564 if (mtag != NULL) {
3565 divinfo = m_tag_data(mtag);
3566 skipto = divinfo->skipto;
3567 } else {
3568 skipto = 0;
3571 f = ctx->ipfw_layer3_chain;
3572 if (args->eh == NULL && skipto != 0) {
3573 /* No skipto during rule flushing */
3574 if (ipfw_flushing)
3575 return IP_FW_DENY;
3577 if (skipto >= IPFW_DEFAULT_RULE)
3578 return IP_FW_DENY; /* invalid */
3580 while (f && f->rulenum <= skipto)
3581 f = f->next;
3582 if (f == NULL) /* drop packet */
3583 return IP_FW_DENY;
3584 } else if (ipfw_flushing) {
3585 /* Rules are being flushed; skip to default rule */
3586 f = ctx->ipfw_default_rule;
3589 if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3590 m_tag_delete(m, mtag);
3593 * Now scan the rules, and parse microinstructions for each rule.
3595 for (; f; f = f->next) {
3596 int l, cmdlen;
3597 ipfw_insn *cmd;
3598 int skip_or; /* skip rest of OR block */
3600 again:
3601 if (ctx->ipfw_set_disable & (1 << f->set)) {
3602 args->xlat = NULL;
3603 continue;
3606 if (args->xlat != NULL) {
3607 args->xlat = NULL;
3608 l = f->cmd_len - f->act_ofs;
3609 cmd = ACTION_PTR(f);
3610 } else {
3611 l = f->cmd_len;
3612 cmd = f->cmd;
3615 skip_or = 0;
3616 for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3617 int match;
3620 * check_body is a jump target used when we find a
3621 * CHECK_STATE, and need to jump to the body of
3622 * the target rule.
3624 check_body:
3625 cmdlen = F_LEN(cmd);
3627 * An OR block (insn_1 || .. || insn_n) has the
3628 * F_OR bit set in all but the last instruction.
3629 * The first match will set "skip_or", and cause
3630 * the following instructions to be skipped until
3631 * past the one with the F_OR bit clear.
3633 if (skip_or) { /* skip this instruction */
3634 if ((cmd->len & F_OR) == 0)
3635 skip_or = 0; /* next one is good */
3636 continue;
3638 match = 0; /* set to 1 if we succeed */
3640 switch (cmd->opcode) {
3642 * The first set of opcodes compares the packet's
3643 * fields with some pattern, setting 'match' if a
3644 * match is found. At the end of the loop there is
3645 * logic to deal with F_NOT and F_OR flags associated
3646 * with the opcode.
3648 case O_NOP:
3649 match = 1;
3650 break;
3652 case O_FORWARD_MAC:
3653 kprintf("ipfw: opcode %d unimplemented\n",
3654 cmd->opcode);
3655 break;
3657 case O_GID:
3658 case O_UID:
3660 * We only check offset == 0 && proto != 0,
3661 * as this ensures that we have an IPv4
3662 * packet with the ports info.
3664 if (lc.offset != 0)
3665 break;
3667 match = ipfw_match_uid(&args->f_id, oif,
3668 cmd->opcode,
3669 (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3670 break;
3672 case O_RECV:
3673 match = iface_match(m->m_pkthdr.rcvif,
3674 (ipfw_insn_if *)cmd);
3675 break;
3677 case O_XMIT:
3678 match = iface_match(oif, (ipfw_insn_if *)cmd);
3679 break;
3681 case O_VIA:
3682 match = iface_match(oif ? oif :
3683 m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3684 break;
3686 case O_MACADDR2:
3687 if (args->eh != NULL) { /* have MAC header */
3688 uint32_t *want = (uint32_t *)
3689 ((ipfw_insn_mac *)cmd)->addr;
3690 uint32_t *mask = (uint32_t *)
3691 ((ipfw_insn_mac *)cmd)->mask;
3692 uint32_t *hdr = (uint32_t *)args->eh;
3694 match =
3695 (want[0] == (hdr[0] & mask[0]) &&
3696 want[1] == (hdr[1] & mask[1]) &&
3697 want[2] == (hdr[2] & mask[2]));
3699 break;
3701 case O_MAC_TYPE:
3702 if (args->eh != NULL) {
3703 uint16_t t =
3704 ntohs(args->eh->ether_type);
3705 uint16_t *p =
3706 ((ipfw_insn_u16 *)cmd)->ports;
3707 int i;
3709 /* Special vlan handling */
3710 if (m->m_flags & M_VLANTAG)
3711 t = ETHERTYPE_VLAN;
3713 for (i = cmdlen - 1; !match && i > 0;
3714 i--, p += 2) {
3715 match =
3716 (t >= p[0] && t <= p[1]);
3719 break;
3721 case O_FRAG:
3722 match = (hlen > 0 && lc.offset != 0);
3723 break;
3725 case O_IPFRAG:
3726 if (hlen > 0) {
3727 uint16_t off;
3729 off = ntohs(ip->ip_off);
3730 if (off & (IP_MF | IP_OFFMASK))
3731 match = 1;
3733 break;
3735 case O_IN: /* "out" is "not in" */
3736 match = (oif == NULL);
3737 break;
3739 case O_LAYER2:
3740 match = (args->eh != NULL);
3741 break;
3743 case O_PROTO:
3745 * We do not allow an arg of 0 so the
3746 * check of "proto" only suffices.
3748 match = (lc.proto == cmd->arg1);
3749 break;
3751 case O_IP_SRC:
3752 match = (hlen > 0 &&
3753 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3754 lc.src_ip.s_addr);
3755 break;
3757 case O_IP_SRC_MASK:
3758 match = (hlen > 0 &&
3759 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3760 (lc.src_ip.s_addr &
3761 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3762 break;
3764 case O_IP_SRC_ME:
3765 if (hlen > 0) {
3766 struct ifnet *tif;
3768 tif = INADDR_TO_IFP(&lc.src_ip);
3769 match = (tif != NULL);
3771 break;
3773 case O_IP_SRC_TABLE:
3774 match = ipfw_table_lookup(ctx, cmd->arg1,
3775 &lc.src_ip);
3776 break;
3778 case O_IP_SRC_IFIP:
3779 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3780 &lc.src_ip);
3781 break;
3783 case O_IP_DST_SET:
3784 case O_IP_SRC_SET:
3785 if (hlen > 0) {
3786 uint32_t *d = (uint32_t *)(cmd + 1);
3787 uint32_t addr =
3788 cmd->opcode == O_IP_DST_SET ?
3789 args->f_id.dst_ip :
3790 args->f_id.src_ip;
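/*
 * d[0] is the base address of the set and d[1..] the bitmap:
 * address 'a' belongs to the set iff (a - d[0]) < arg1 and bit
 * ((a - d[0]) & 0x1f) of word d[1 + ((a - d[0]) >> 5)] is set.
 */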
3792 if (addr < d[0])
3793 break;
3794 addr -= d[0]; /* subtract base */
3795 match =
3796 (addr < cmd->arg1) &&
3797 (d[1 + (addr >> 5)] &
3798 (1 << (addr & 0x1f)));
3800 break;
3802 case O_IP_DST:
3803 match = (hlen > 0 &&
3804 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3805 lc.dst_ip.s_addr);
3806 break;
3808 case O_IP_DST_MASK:
3809 match = (hlen > 0) &&
3810 (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3811 (lc.dst_ip.s_addr &
3812 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3813 break;
3815 case O_IP_DST_ME:
3816 if (hlen > 0) {
3817 struct ifnet *tif;
3819 tif = INADDR_TO_IFP(&lc.dst_ip);
3820 match = (tif != NULL);
3822 break;
3824 case O_IP_DST_TABLE:
3825 match = ipfw_table_lookup(ctx, cmd->arg1,
3826 &lc.dst_ip);
3827 break;
3829 case O_IP_DST_IFIP:
3830 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3831 &lc.dst_ip);
3832 break;
3834 case O_IP_SRCPORT:
3835 case O_IP_DSTPORT:
3837 * offset == 0 && proto != 0 is enough
3838 * to guarantee that we have an IPv4
3839 * packet with port info.
3841 if ((lc.proto==IPPROTO_UDP ||
3842 lc.proto==IPPROTO_TCP)
3843 && lc.offset == 0) {
3844 uint16_t x =
3845 (cmd->opcode == O_IP_SRCPORT) ?
3846 lc.src_port : lc.dst_port;
3847 uint16_t *p =
3848 ((ipfw_insn_u16 *)cmd)->ports;
3849 int i;
3851 for (i = cmdlen - 1; !match && i > 0;
3852 i--, p += 2) {
3853 match =
3854 (x >= p[0] && x <= p[1]);
3857 break;
3859 case O_ICMPCODE:
3860 match = (lc.offset == 0 &&
3861 lc.proto==IPPROTO_ICMP &&
3862 icmpcode_match(ip, (ipfw_insn_u32 *)cmd));
3863 break;
3865 case O_ICMPTYPE:
3866 match = (lc.offset == 0 &&
3867 lc.proto==IPPROTO_ICMP &&
3868 icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3869 break;
3871 case O_IPOPT:
3872 match = (hlen > 0 && ipopts_match(ip, cmd));
3873 break;
3875 case O_IPVER:
3876 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3877 break;
3879 case O_IPTTL:
3880 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3881 break;
3883 case O_IPID:
3884 match = (hlen > 0 &&
3885 cmd->arg1 == ntohs(ip->ip_id));
3886 break;
3888 case O_IPLEN:
3889 match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3890 break;
3892 case O_IPPRECEDENCE:
3893 match = (hlen > 0 &&
3894 (cmd->arg1 == (ip->ip_tos & 0xe0)));
3895 break;
3897 case O_IPTOS:
3898 match = (hlen > 0 &&
3899 flags_match(cmd, ip->ip_tos));
3900 break;
3902 case O_TCPFLAGS:
3903 match = (lc.proto == IPPROTO_TCP &&
3904 lc.offset == 0 &&
3905 flags_match(cmd,
3906 L3HDR(struct tcphdr,ip)->th_flags));
3907 break;
3909 case O_TCPOPTS:
3910 match = (lc.proto == IPPROTO_TCP &&
3911 lc.offset == 0 && tcpopts_match(ip, cmd));
3912 break;
3914 case O_TCPSEQ:
3915 match = (lc.proto == IPPROTO_TCP &&
3916 lc.offset == 0 &&
3917 ((ipfw_insn_u32 *)cmd)->d[0] ==
3918 L3HDR(struct tcphdr,ip)->th_seq);
3919 break;
3921 case O_TCPACK:
3922 match = (lc.proto == IPPROTO_TCP &&
3923 lc.offset == 0 &&
3924 ((ipfw_insn_u32 *)cmd)->d[0] ==
3925 L3HDR(struct tcphdr,ip)->th_ack);
3926 break;
3928 case O_TCPWIN:
3929 match = (lc.proto == IPPROTO_TCP &&
3930 lc.offset == 0 &&
3931 cmd->arg1 ==
3932 L3HDR(struct tcphdr,ip)->th_win);
3933 break;
3935 case O_ESTAB:
3936 /* reject packets which have SYN only */
3937 /* XXX should i also check for TH_ACK ? */
3938 match = (lc.proto == IPPROTO_TCP &&
3939 lc.offset == 0 &&
3940 (L3HDR(struct tcphdr,ip)->th_flags &
3941 (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3942 break;
3944 case O_LOG:
3945 if (fw_verbose) {
3946 ipfw_log(ctx, f, hlen, args->eh, m,
3947 oif);
3949 match = 1;
3950 break;
3952 case O_PROB:
3953 match = (krandom() <
3954 ((ipfw_insn_u32 *)cmd)->d[0]);
3955 break;
3958 * The second set of opcodes represents 'actions',
3959 * i.e. the terminal part of a rule once the packet
3960 * matches all previous patterns.
3961 * Typically there is only one action for each rule,
3962 * and the opcode is stored at the end of the rule
3963 * (but there are exceptions -- see below).
3965 * In general, here we set retval and terminate the
3966 * outer loop (would be a 'break 3' in some language,
3967 * but we need to do a 'goto done').
3969 * Exceptions:
3970 * O_COUNT and O_SKIPTO actions:
3971 * instead of terminating, we jump to the next rule
3972 * ('goto next_rule', equivalent to a 'break 2'),
3973 * or to the SKIPTO target ('goto again' after
3974 * having set f, cmd and l), respectively.
3976 * O_LIMIT and O_KEEP_STATE, O_REDIRECT: these opcodes
3977 * are not real 'actions', and are stored right
3978 * before the 'action' part of the rule.
3979 * These opcodes try to install an entry in the
3980 * state tables; if successful, we continue with
3981 * the next opcode (match=1; break;), otherwise
3982 * the packet must be dropped ('goto done' after
3983 * setting retval). If static rules are changed
3984 * during the state installation, the packet will
3985 * be dropped and the rule's stats will not be updated
3986 * ('return IP_FW_DENY').
3988 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
3989 * cause a lookup of the state table, and a jump
3990 * to the 'action' part of the parent rule
3991 * ('goto check_body') if an entry is found, or
3992 * (CHECK_STATE only) a jump to the next rule if
3993 * the entry is not found ('goto next_rule').
3994 * The result of the lookup is cached so that
3995 * further instances of these opcodes become
3996 * effectively NOPs. If static rules are changed
3997 * during the state lookup, the packet will
3998 * be dropped and the rule's stats will not be updated
3999 * ('return IP_FW_DENY').
4001 case O_REDIRECT:
4002 if (f->cross_rules == NULL) {
4004 * This rule was not completely setup;
4005 * move on to the next rule.
4007 goto next_rule;
4010 * Apply redirect only on input path and
4011 * only to non-fragment TCP segments or
4012 * UDP datagrams.
4014 * Does _not_ work with layer2 filtering.
4016 if (oif != NULL || args->eh != NULL ||
4017 (ip->ip_off & htons(IP_MF | IP_OFFMASK)) ||
4018 (lc.proto != IPPROTO_TCP &&
4019 lc.proto != IPPROTO_UDP))
4020 break;
4021 /* FALL THROUGH */
4022 case O_LIMIT:
4023 case O_KEEP_STATE:
4024 if (hlen == 0)
4025 break;
4026 s = ipfw_state_install(ctx, f,
4027 (ipfw_insn_limit *)cmd, args, lc.tcp);
4028 if (s == NULL) {
4029 retval = IP_FW_DENY;
4030 goto done; /* error/limit violation */
4032 s->st_pcnt++;
4033 s->st_bcnt += lc.ip_len;
4035 if (s->st_type == O_REDIRECT) {
4036 struct in_addr oaddr;
4037 uint16_t oport;
4038 struct ipfw_xlat *slave_x, *x;
4039 struct ipfw_state *dup;
4041 x = (struct ipfw_xlat *)s;
4042 ipfw_xlate(x, m, &oaddr, &oport);
4043 m = ipfw_rehashm(m, hlen, args, &lc,
4044 &ip);
4045 if (m == NULL) {
4046 ipfw_state_del(ctx, s);
4047 goto pullup_failed;
4050 cpuid = netisr_hashcpu(
4051 m->m_pkthdr.hash);
4053 slave_x = (struct ipfw_xlat *)
4054 ipfw_state_alloc(ctx, &args->f_id,
4055 O_REDIRECT, f->cross_rules[cpuid],
4056 lc.tcp);
4057 if (slave_x == NULL) {
4058 ipfw_state_del(ctx, s);
4059 retval = IP_FW_DENY;
4060 goto done;
4062 slave_x->xlat_addr = oaddr.s_addr;
4063 slave_x->xlat_port = oport;
4064 slave_x->xlat_dir = MATCH_REVERSE;
4065 slave_x->xlat_flags |=
4066 IPFW_STATE_F_XLATSRC |
4067 IPFW_STATE_F_XLATSLAVE;
4069 slave_x->xlat_pair = x;
4070 slave_x->xlat_pcpu = mycpuid;
4071 x->xlat_pair = slave_x;
4072 x->xlat_pcpu = cpuid;
4074 ctx->ipfw_xlated++;
4075 if (cpuid != mycpuid) {
4076 ctx->ipfw_xlate_split++;
4077 ipfw_xlate_redispatch(
4078 m, cpuid, x,
4079 IPFW_XLATE_INSERT |
4080 IPFW_XLATE_FORWARD);
4081 args->m = NULL;
4082 return (IP_FW_REDISPATCH);
4085 dup = ipfw_state_link(ctx,
4086 &slave_x->xlat_st);
4087 if (dup != NULL) {
4088 ctx->ipfw_xlate_conflicts++;
4089 if (IPFW_STATE_ISDEAD(dup)) {
4090 ipfw_state_remove(ctx,
4091 dup);
4092 dup = ipfw_state_link(
4093 ctx, &slave_x->xlat_st);
4095 if (dup != NULL) {
4096 if (bootverbose) {
4097 kprintf("ipfw: "
4098 "slave %u state "
4099 "conflicts "
4100 "%u state\n",
4101 x->xlat_type,
4102 s->st_type);
4104 ipfw_state_del(ctx, s);
4105 return (IP_FW_DENY);
4107 ctx->ipfw_xlate_cresolved++;
4110 match = 1;
4111 break;
4113 case O_PROBE_STATE:
4114 case O_CHECK_STATE:
4116 * States are checked at the first keep-state
4117 * check-state occurrence, with the result
4118 * being stored in dyn_dir. The compiler
4119 * introduces a PROBE_STATE instruction for
4120 * us when we have a KEEP_STATE/LIMIT/RDR
4121 * (because PROBE_STATE needs to be run first).
4123 s = NULL;
4124 if (dyn_dir == MATCH_UNKNOWN) {
4125 s = ipfw_state_lookup(ctx,
4126 &args->f_id, &dyn_dir, lc.tcp);
4128 if (s == NULL ||
4129 (s->st_type == O_REDIRECT &&
4130 (args->eh != NULL ||
4131 (ip->ip_off & htons(IP_MF | IP_OFFMASK)) ||
4132 (lc.proto != IPPROTO_TCP &&
4133 lc.proto != IPPROTO_UDP)))) {
4135 * State not found. If CHECK_STATE,
4136 * skip to next rule, if PROBE_STATE
4137 * just ignore and continue with next
4138 * opcode.
4140 if (cmd->opcode == O_CHECK_STATE)
4141 goto next_rule;
4142 match = 1;
4143 break;
4146 s->st_pcnt++;
4147 s->st_bcnt += lc.ip_len;
4149 if (s->st_type == O_REDIRECT) {
4150 struct ipfw_xlat *x =
4151 (struct ipfw_xlat *)s;
4153 if (oif != NULL &&
4154 x->xlat_ifp == NULL) {
4155 KASSERT(x->xlat_flags &
4156 IPFW_STATE_F_XLATSLAVE,
4157 ("master rdr state "
4158 "missing ifp"));
4159 x->xlat_ifp = oif;
4160 } else if (
4161 (oif != NULL && x->xlat_ifp!=oif) ||
4162 (oif == NULL &&
4163 x->xlat_ifp!=m->m_pkthdr.rcvif)) {
4164 retval = IP_FW_DENY;
4165 goto done;
4167 if (x->xlat_dir != dyn_dir)
4168 goto skip_xlate;
4170 ipfw_xlate(x, m, NULL, NULL);
4171 m = ipfw_rehashm(m, hlen, args, &lc,
4172 &ip);
4173 if (m == NULL)
4174 goto pullup_failed;
4176 cpuid = netisr_hashcpu(
4177 m->m_pkthdr.hash);
4178 if (cpuid != mycpuid) {
4179 uint32_t xlate = 0;
4181 if (oif != NULL) {
4182 xlate |=
4183 IPFW_XLATE_OUTPUT;
4185 if (dyn_dir == MATCH_FORWARD) {
4186 xlate |=
4187 IPFW_XLATE_FORWARD;
4189 ipfw_xlate_redispatch(m, cpuid,
4190 x, xlate);
4191 args->m = NULL;
4192 return (IP_FW_REDISPATCH);
4195 KKASSERT(x->xlat_pcpu == mycpuid);
4196 ipfw_state_update(&args->f_id, dyn_dir,
4197 lc.tcp, &x->xlat_pair->xlat_st);
4199 skip_xlate:
4200 /*
4201 * Found a rule from a state; jump to the
4202 * 'action' part of the rule.
4203 */
4204 f = s->st_rule;
4205 KKASSERT(f->cpuid == mycpuid);
4207 cmd = ACTION_PTR(f);
4208 l = f->cmd_len - f->act_ofs;
4209 dyn_f = f;
4210 goto check_body;
4212 case O_ACCEPT:
4213 retval = IP_FW_PASS; /* accept */
4214 goto done;
4216 case O_DEFRAG:
4217 if (f->cross_rules == NULL) {
4218 /*
4219 * This rule was not completely set up;
4220 * move on to the next rule.
4221 */
4222 goto next_rule;
4223 }
4225 /*
4226 * Don't defrag for l2 packets, output packets
4227 * or non-fragments.
4228 */
4229 if (oif != NULL || args->eh != NULL ||
4230 (ip->ip_off & htons(IP_MF | IP_OFFMASK)) == 0)
4231 goto next_rule;
4233 ctx->ipfw_frags++;
4234 m = ip_reass(m);
4235 args->m = m;
4236 if (m == NULL) {
4237 retval = IP_FW_PASS;
4238 goto done;
4240 ctx->ipfw_defraged++;
4241 KASSERT((m->m_flags & M_HASH) == 0,
4242 ("hash not cleared"));
4244 /* Update statistics */
4245 f->pcnt++;
4246 f->bcnt += lc.ip_len;
4247 f->timestamp = time_second;
4249 ip = mtod(m, struct ip *);
4250 hlen = ip->ip_hl << 2;
4251 ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
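/*
 * (Added note, an inference from the line above; not in the original
 * source:) ip_reass() presumably hands back ip_len as the data length
 * only, so the header length is restored here before the reassembled
 * packet is re-hashed.
 */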
4253 ip_hashfn(&m, 0);
4254 args->m = m;
4255 if (m == NULL)
4256 goto pullup_failed;
4258 KASSERT(m->m_flags & M_HASH, ("no hash"));
4259 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
4260 if (cpuid != mycpuid) {
4261 ctx->ipfw_defrag_remote++;
4262 ipfw_defrag_redispatch(m, cpuid, f);
4263 args->m = NULL;
4264 return (IP_FW_REDISPATCH);
4267 /* 'm' might be changed by ip_hashfn(). */
4268 ip = mtod(m, struct ip *);
4270 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
4271 if (m == NULL)
4272 goto pullup_failed;
4274 /* Move on. */
4275 goto next_rule;
4277 case O_PIPE:
4278 case O_QUEUE:
4279 args->rule = f; /* report matching rule */
4280 args->cookie = cmd->arg1;
4281 retval = IP_FW_DUMMYNET;
4282 goto done;
4284 case O_DIVERT:
4285 case O_TEE:
4286 if (args->eh) /* not on layer 2 */
4287 break;
4289 mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
4290 sizeof(*divinfo), M_INTWAIT | M_NULLOK);
4291 if (mtag == NULL) {
4292 retval = IP_FW_DENY;
4293 goto done;
4295 divinfo = m_tag_data(mtag);
4297 divinfo->skipto = f->rulenum;
4298 divinfo->port = cmd->arg1;
4299 divinfo->tee = (cmd->opcode == O_TEE);
4300 m_tag_prepend(m, mtag);
4302 args->cookie = cmd->arg1;
4303 retval = (cmd->opcode == O_DIVERT) ?
4304 IP_FW_DIVERT : IP_FW_TEE;
4305 goto done;
4307 case O_COUNT:
4308 case O_SKIPTO:
4309 f->pcnt++; /* update stats */
4310 f->bcnt += lc.ip_len;
4311 f->timestamp = time_second;
4312 if (cmd->opcode == O_COUNT)
4313 goto next_rule;
4314 /* handle skipto */
4315 if (f->next_rule == NULL)
4316 lookup_next_rule(f);
4317 f = f->next_rule;
4318 goto again;
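/*
 * (Added note:) O_SKIPTO relies on the lazily built next_rule cache:
 * the first packet pays for lookup_next_rule(), later packets jump
 * straight to the cached target.  E.g. with "skipto 1000",
 * f->next_rule presumably ends up pointing at the first rule numbered
 * >= 1000, until the chain changes and ipfw_flush_rule_ptrs() clears
 * the cache.
 */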
4320 case O_REJECT:
4321 /*
4322 * Drop the packet and send a reject notice
4323 * if the packet is not ICMP (or is an ICMP
4324 * query), and it is not multicast/broadcast.
4325 */
4326 if (hlen > 0 &&
4327 (lc.proto != IPPROTO_ICMP ||
4328 is_icmp_query(ip)) &&
4329 !(m->m_flags & (M_BCAST|M_MCAST)) &&
4330 !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
4331 send_reject(args, cmd->arg1,
4332 lc.offset, lc.ip_len);
4333 retval = IP_FW_DENY;
4334 goto done;
4336 /* FALLTHROUGH */
4337 case O_DENY:
4338 retval = IP_FW_DENY;
4339 goto done;
4341 case O_FORWARD_IP:
4342 if (args->eh) /* not valid on layer2 pkts */
4343 break;
4344 if (!dyn_f || dyn_dir == MATCH_FORWARD) {
4345 struct sockaddr_in *sin;
4347 mtag = m_tag_get(PACKET_TAG_IPFORWARD,
4348 sizeof(*sin), M_INTWAIT | M_NULLOK);
4349 if (mtag == NULL) {
4350 retval = IP_FW_DENY;
4351 goto done;
4353 sin = m_tag_data(mtag);
4355 /* Structure copy */
4356 *sin = ((ipfw_insn_sa *)cmd)->sa;
4358 m_tag_prepend(m, mtag);
4359 m->m_pkthdr.fw_flags |=
4360 IPFORWARD_MBUF_TAGGED;
4361 m->m_pkthdr.fw_flags &=
4362 ~BRIDGE_MBUF_TAGGED;
4364 retval = IP_FW_PASS;
4365 goto done;
4367 default:
4368 panic("-- unknown opcode %d", cmd->opcode);
4369 } /* end of switch() on opcodes */
4371 if (cmd->len & F_NOT)
4372 match = !match;
4374 if (match) {
4375 if (cmd->len & F_OR)
4376 skip_or = 1;
4377 } else {
4378 if (!(cmd->len & F_OR)) /* not an OR block, */
4379 break; /* try next rule */
4382 } /* end of inner for, scan opcodes */
4384 next_rule:; /* try next rule */
4386 } /* end of outer for, scan rules */
4387 kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
4388 return IP_FW_DENY;
4390 done:
4391 /* Update statistics */
4392 f->pcnt++;
4393 f->bcnt += lc.ip_len;
4394 f->timestamp = time_second;
4395 return retval;
4397 pullup_failed:
4398 if (fw_verbose)
4399 kprintf("pullup failed\n");
4400 return IP_FW_DENY;
4403 static struct mbuf *
4404 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
4406 struct m_tag *mtag;
4407 struct dn_pkt *pkt;
4408 ipfw_insn *cmd;
4409 const struct ipfw_flow_id *id;
4410 struct dn_flow_id *fid;
4412 M_ASSERTPKTHDR(m);
4414 mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
4415 M_INTWAIT | M_NULLOK);
4416 if (mtag == NULL) {
4417 m_freem(m);
4418 return (NULL);
4420 m_tag_prepend(m, mtag);
4422 pkt = m_tag_data(mtag);
4423 bzero(pkt, sizeof(*pkt));
4425 cmd = fwa->rule->cmd + fwa->rule->act_ofs;
4426 if (cmd->opcode == O_LOG)
4427 cmd += F_LEN(cmd);
4428 KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
4429 ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
4431 pkt->dn_m = m;
4432 pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
4433 pkt->ifp = fwa->oif;
4434 pkt->pipe_nr = pipe_nr;
4436 pkt->cpuid = mycpuid;
4437 pkt->msgport = netisr_curport();
4439 id = &fwa->f_id;
4440 fid = &pkt->id;
4441 fid->fid_dst_ip = id->dst_ip;
4442 fid->fid_src_ip = id->src_ip;
4443 fid->fid_dst_port = id->dst_port;
4444 fid->fid_src_port = id->src_port;
4445 fid->fid_proto = id->proto;
4446 fid->fid_flags = id->flags;
4448 ipfw_ref_rule(fwa->rule);
4449 pkt->dn_priv = fwa->rule;
4450 pkt->dn_unref_priv = ipfw_unref_rule;
4452 if (cmd->opcode == O_PIPE)
4453 pkt->dn_flags |= DN_FLAGS_IS_PIPE;
4455 m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
4456 return (m);
4457 }
4459 /*
4460 * When a rule is added/deleted, clear the next_rule pointers in all rules.
4461 * These will be reconstructed on the fly as packets are matched.
4462 */
4463 static void
4464 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
4466 struct ip_fw *rule;
4468 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4469 rule->next_rule = NULL;
4472 static void
4473 ipfw_inc_static_count(struct ip_fw *rule)
4475 /* Static rule's counts are updated only on CPU0 */
4476 KKASSERT(mycpuid == 0);
4478 static_count++;
4479 static_ioc_len += IOC_RULESIZE(rule);
4482 static void
4483 ipfw_dec_static_count(struct ip_fw *rule)
4485 int l = IOC_RULESIZE(rule);
4487 /* Static rule's counts are updated only on CPU0 */
4488 KKASSERT(mycpuid == 0);
4490 KASSERT(static_count > 0, ("invalid static count %u", static_count));
4491 static_count--;
4493 KASSERT(static_ioc_len >= l,
4494 ("invalid static len %u", static_ioc_len));
4495 static_ioc_len -= l;
4498 static void
4499 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
4501 if (fwmsg->sibling != NULL) {
4502 KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
4503 fwmsg->sibling->sibling = rule;
4505 fwmsg->sibling = rule;
4508 static struct ip_fw *
4509 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4511 struct ip_fw *rule;
4513 rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
4515 rule->act_ofs = ioc_rule->act_ofs;
4516 rule->cmd_len = ioc_rule->cmd_len;
4517 rule->rulenum = ioc_rule->rulenum;
4518 rule->set = ioc_rule->set;
4519 rule->usr_flags = ioc_rule->usr_flags;
4521 bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
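/*
 * (Added note:) cmd_len counts 32-bit instruction words, so the "* 4"
 * above converts it to bytes; the XXX presumably flags the hardcoded
 * sizeof(uint32_t).
 */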
4523 rule->refcnt = 1;
4524 rule->cpuid = mycpuid;
4525 rule->rule_flags = rule_flags;
4527 return rule;
4530 static void
4531 ipfw_add_rule_dispatch(netmsg_t nmsg)
4533 struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4534 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4535 struct ip_fw *rule;
4537 ASSERT_NETISR_NCPUS(mycpuid);
4539 rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);
4541 /*
4542 * Insert rule into the pre-determined position
4543 */
4544 if (fwmsg->prev_rule != NULL) {
4545 struct ip_fw *prev, *next;
4547 prev = fwmsg->prev_rule;
4548 KKASSERT(prev->cpuid == mycpuid);
4550 next = fwmsg->next_rule;
4551 KKASSERT(next->cpuid == mycpuid);
4553 rule->next = next;
4554 prev->next = rule;
4556 /*
4557 * Move to the position on the next CPU
4558 * before the msg is forwarded.
4559 */
4560 fwmsg->prev_rule = prev->sibling;
4561 fwmsg->next_rule = next->sibling;
4562 } else {
4563 KKASSERT(fwmsg->next_rule == NULL);
4564 rule->next = ctx->ipfw_layer3_chain;
4565 ctx->ipfw_layer3_chain = rule;
4568 /* Link rule CPU sibling */
4569 ipfw_link_sibling(fwmsg, rule);
4571 ipfw_flush_rule_ptrs(ctx);
4573 if (mycpuid == 0) {
4574 /* Statistics only need to be updated once */
4575 ipfw_inc_static_count(rule);
4577 /* Return the rule on CPU0 */
4578 nmsg->lmsg.u.ms_resultp = rule;
4581 if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
4582 rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;
4584 if (fwmsg->cross_rules != NULL) {
4585 /* Save rules for later use. */
4586 fwmsg->cross_rules[mycpuid] = rule;
4589 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4590 }
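/*
 * (Added note:) the add dispatch runs once per netisr CPU in sequence:
 * each CPU kmalloc()s its own copy of the rule, splices it into its
 * private chain, and ipfw_link_sibling() threads the copies together,
 * roughly:
 *
 *	rule@cpu0 --sibling--> rule@cpu1 --sibling--> ... --> NULL
 *
 * so CPU0 can later walk all duplications (e.g. to sum counters).
 */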
4592 static void
4593 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
4595 struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4596 struct ip_fw *rule = fwmsg->sibling;
4597 int sz = sizeof(struct ip_fw *) * netisr_ncpus;
4599 ASSERT_NETISR_NCPUS(mycpuid);
4600 KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
4601 ("not crossref rule"));
4603 rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
4604 memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
4606 fwmsg->sibling = rule->sibling;
4607 netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
4608 }
4610 /*
4611 * Add a new rule to the list. Copy the rule into a malloc'ed area,
4612 * then possibly create a rule number and add the rule to the list.
4613 * Update the rule_number in the input struct so the caller knows
4614 * it as well.
4615 */
4616 static void
4617 ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4619 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4620 struct netmsg_ipfw fwmsg;
4621 struct ip_fw *f, *prev, *rule;
4623 ASSERT_NETISR0;
4625 /*
4626 * If rulenum is 0, find the highest numbered rule before the
4627 * default rule, and add the rule-number increment step to it.
4628 */
4629 if (ioc_rule->rulenum == 0) {
4630 int step = autoinc_step;
4632 KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
4633 step <= IPFW_AUTOINC_STEP_MAX);
4635 /*
4636 * Locate the highest numbered rule before default
4637 */
4638 for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
4639 if (f->rulenum == IPFW_DEFAULT_RULE)
4640 break;
4641 ioc_rule->rulenum = f->rulenum;
4643 if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
4644 ioc_rule->rulenum += step;
4646 KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
4647 ioc_rule->rulenum != 0,
4648 ("invalid rule num %d", ioc_rule->rulenum));
4650 /*
4651 * Now find the right place for the new rule in the sorted list.
4652 */
4653 for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
4654 prev = f, f = f->next) {
4655 if (f->rulenum > ioc_rule->rulenum) {
4656 /* Found the location */
4657 break;
4660 KASSERT(f != NULL, ("no default rule?!"));
4662 /*
4663 * Duplicate the rule onto each CPU.
4664 * The rule duplicated on CPU0 will be returned.
4665 */
4666 bzero(&fwmsg, sizeof(fwmsg));
4667 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4668 ipfw_add_rule_dispatch);
4669 fwmsg.ioc_rule = ioc_rule;
4670 fwmsg.prev_rule = prev;
4671 fwmsg.next_rule = prev == NULL ? NULL : f;
4672 fwmsg.rule_flags = rule_flags;
4673 if (rule_flags & IPFW_RULE_F_CROSSREF) {
4674 fwmsg.cross_rules = kmalloc(
4675 sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
4676 M_WAITOK | M_ZERO);
4679 netisr_domsg_global(&fwmsg.base);
4680 KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);
4682 rule = fwmsg.base.lmsg.u.ms_resultp;
4683 KKASSERT(rule != NULL && rule->cpuid == mycpuid);
4685 if (fwmsg.cross_rules != NULL) {
4686 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
4687 MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
4688 fwmsg.sibling = rule;
4689 netisr_domsg_global(&fwmsg.base);
4690 KKASSERT(fwmsg.sibling == NULL);
4692 kfree(fwmsg.cross_rules, M_TEMP);
4694 #ifdef KLD_MODULE
4695 atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
4696 #endif
4699 DPRINTF("++ installed rule %d, static count now %d\n",
4700 rule->rulenum, static_count);
4701 }
4703 /*
4704 * Free storage associated with a static rule (including derived
4705 * states/tracks).
4706 * The caller is in charge of clearing rule pointers to avoid
4707 * dangling pointers.
4708 * @return a pointer to the next entry.
4709 * Arguments are not checked, so they better be correct.
4710 */
4711 static struct ip_fw *
4712 ipfw_delete_rule(struct ipfw_context *ctx,
4713 struct ip_fw *prev, struct ip_fw *rule)
4715 struct ip_fw *n;
4717 n = rule->next;
4718 if (prev == NULL)
4719 ctx->ipfw_layer3_chain = n;
4720 else
4721 prev->next = n;
4723 /* Mark the rule as invalid */
4724 rule->rule_flags |= IPFW_RULE_F_INVALID;
4725 rule->next_rule = NULL;
4726 rule->sibling = NULL;
4727 #ifdef foo
4728 /* Don't reset cpuid here; keep various assertion working */
4729 rule->cpuid = -1;
4730 #endif
4732 /* Statistics only need to be updated once */
4733 if (mycpuid == 0)
4734 ipfw_dec_static_count(rule);
4736 if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
4737 /* Try to free this rule */
4738 ipfw_free_rule(rule);
4739 } else {
4740 /* TODO: check staging area. */
4741 if (mycpuid == 0) {
4742 rule->next = ipfw_gd.ipfw_crossref_free;
4743 ipfw_gd.ipfw_crossref_free = rule;
4747 /* Return the next rule */
4748 return n;
4751 static void
4752 ipfw_flush_dispatch(netmsg_t nmsg)
4754 int kill_default = nmsg->lmsg.u.ms_result;
4755 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4756 struct ip_fw *rule;
4758 ASSERT_NETISR_NCPUS(mycpuid);
4760 /*
4761 * Flush states.
4762 */
4763 ipfw_state_flush(ctx, NULL);
4764 KASSERT(ctx->ipfw_state_cnt == 0,
4765 ("%d pcpu states remain", ctx->ipfw_state_cnt));
4766 ctx->ipfw_state_loosecnt = 0;
4767 ctx->ipfw_state_lastexp = 0;
4769 /*
4770 * Flush tracks.
4771 */
4772 ipfw_track_flush(ctx, NULL);
4773 ctx->ipfw_track_lastexp = 0;
4774 if (ctx->ipfw_trkcnt_spare != NULL) {
4775 kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
4776 ctx->ipfw_trkcnt_spare = NULL;
4779 ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */
4781 while ((rule = ctx->ipfw_layer3_chain) != NULL &&
4782 (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
4783 ipfw_delete_rule(ctx, NULL, rule);
4785 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4786 }
4788 /*
4789 * Deletes all rules from a chain (including the default rule
4790 * if the second argument is set).
4791 */
4792 static void
4793 ipfw_flush(int kill_default)
4795 struct netmsg_base nmsg;
4796 #ifdef INVARIANTS
4797 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4798 int state_cnt;
4799 #endif
4801 ASSERT_NETISR0;
4803 /*
4804 * If 'kill_default' then caller has done the necessary
4805 * msgport syncing; unnecessary to do it again.
4806 */
4807 if (!kill_default) {
4808 /*
4809 * Let ipfw_chk() know the rules are going to
4810 * be flushed, so it could jump directly to
4811 * the default rule.
4812 */
4813 ipfw_flushing = 1;
4814 /* XXX use priority sync */
4815 netmsg_service_sync();
4816 }
4818 /*
4819 * Press the 'flush' button
4820 */
4821 bzero(&nmsg, sizeof(nmsg));
4822 netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4823 ipfw_flush_dispatch);
4824 nmsg.lmsg.u.ms_result = kill_default;
4825 netisr_domsg_global(&nmsg);
4826 ipfw_gd.ipfw_state_loosecnt = 0;
4827 ipfw_gd.ipfw_state_globexp = 0;
4828 ipfw_gd.ipfw_track_globexp = 0;
4830 #ifdef INVARIANTS
4831 state_cnt = ipfw_state_cntcoll();
4832 KASSERT(state_cnt == 0, ("%d states remain", state_cnt));
4834 KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
4835 ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));
4837 if (kill_default) {
4838 KASSERT(static_count == 0,
4839 ("%u static rules remain", static_count));
4840 KASSERT(static_ioc_len == 0,
4841 ("%u bytes of static rules remain", static_ioc_len));
4842 } else {
4843 KASSERT(static_count == 1,
4844 ("%u static rules remain", static_count));
4845 KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
4846 ("%u bytes of static rules remain, should be %lu",
4847 static_ioc_len,
4848 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
4850 #endif
4852 /* Flush is done */
4853 ipfw_flushing = 0;
4856 static void
4857 ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
4859 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4860 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4861 struct ip_fw *rule, *prev;
4863 ASSERT_NETISR_NCPUS(mycpuid);
4865 rule = dmsg->start_rule;
4866 KKASSERT(rule->cpuid == mycpuid);
4867 dmsg->start_rule = rule->sibling;
4869 prev = dmsg->prev_rule;
4870 if (prev != NULL) {
4871 KKASSERT(prev->cpuid == mycpuid);
4873 /*
4874 * Move to the position on the next CPU
4875 * before the msg is forwarded.
4876 */
4877 dmsg->prev_rule = prev->sibling;
4878 }
4880 /*
4881 * flush pointers outside the loop, then delete all matching
4882 * rules. 'prev' remains the same throughout the cycle.
4883 */
4884 ipfw_flush_rule_ptrs(ctx);
4885 while (rule && rule->rulenum == dmsg->rulenum) {
4886 if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4887 /* Flush states generated by this rule. */
4888 ipfw_state_flush(ctx, rule);
4890 if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4891 /* Flush tracks generated by this rule. */
4892 ipfw_track_flush(ctx, rule);
4894 rule = ipfw_delete_rule(ctx, prev, rule);
4897 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4900 static int
4901 ipfw_alt_delete_rule(uint16_t rulenum)
4903 struct ip_fw *prev, *rule;
4904 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4905 struct netmsg_del dmsg;
4907 ASSERT_NETISR0;
4909 /*
4910 * Locate first rule to delete
4911 */
4912 for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4913 rule && rule->rulenum < rulenum;
4914 prev = rule, rule = rule->next)
4915 ; /* EMPTY */
4916 if (rule->rulenum != rulenum)
4917 return EINVAL;
4919 /*
4920 * Get rid of the rule duplications on all CPUs
4921 */
4922 bzero(&dmsg, sizeof(dmsg));
4923 netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4924 ipfw_alt_delete_rule_dispatch);
4925 dmsg.prev_rule = prev;
4926 dmsg.start_rule = rule;
4927 dmsg.rulenum = rulenum;
4929 netisr_domsg_global(&dmsg.base);
4930 KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4931 return 0;
4934 static void
4935 ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
4937 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4938 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4939 struct ip_fw *prev, *rule;
4940 #ifdef INVARIANTS
4941 int del = 0;
4942 #endif
4944 ASSERT_NETISR_NCPUS(mycpuid);
4946 ipfw_flush_rule_ptrs(ctx);
4948 prev = NULL;
4949 rule = ctx->ipfw_layer3_chain;
4950 while (rule != NULL) {
4951 if (rule->set == dmsg->from_set) {
4952 if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4953 /* Flush states generated by this rule. */
4954 ipfw_state_flush(ctx, rule);
4956 if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4957 /* Flush tracks generated by this rule. */
4958 ipfw_track_flush(ctx, rule);
4960 rule = ipfw_delete_rule(ctx, prev, rule);
4961 #ifdef INVARIANTS
4962 del = 1;
4963 #endif
4964 } else {
4965 prev = rule;
4966 rule = rule->next;
4969 KASSERT(del, ("no match set?!"));
4971 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4974 static int
4975 ipfw_alt_delete_ruleset(uint8_t set)
4977 struct netmsg_del dmsg;
4978 int del;
4979 struct ip_fw *rule;
4980 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4982 ASSERT_NETISR0;
4984 /*
4985 * Check whether the 'set' exists. If it exists,
4986 * then check whether any rules within the set will
4987 * try to create states.
4988 */
4989 del = 0;
4990 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4991 if (rule->set == set)
4992 del = 1;
4994 if (!del)
4995 return 0; /* XXX EINVAL? */
4997 /*
4998 * Delete this set
4999 */
5000 bzero(&dmsg, sizeof(dmsg));
5001 netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5002 ipfw_alt_delete_ruleset_dispatch);
5003 dmsg.from_set = set;
5004 netisr_domsg_global(&dmsg.base);
5006 return 0;
5009 static void
5010 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
5012 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5013 struct ip_fw *rule;
5015 ASSERT_NETISR_NCPUS(mycpuid);
5017 rule = dmsg->start_rule;
5018 KKASSERT(rule->cpuid == mycpuid);
5020 /*
5021 * Move to the position on the next CPU
5022 * before the msg is forwarded.
5023 */
5024 dmsg->start_rule = rule->sibling;
5026 while (rule && rule->rulenum <= dmsg->rulenum) {
5027 if (rule->rulenum == dmsg->rulenum)
5028 rule->set = dmsg->to_set;
5029 rule = rule->next;
5031 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5034 static int
5035 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
5037 struct netmsg_del dmsg;
5038 struct netmsg_base *nmsg;
5039 struct ip_fw *rule;
5040 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5042 ASSERT_NETISR0;
5044 /*
5045 * Locate first rule to move
5046 */
5047 for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
5048 rule = rule->next) {
5049 if (rule->rulenum == rulenum && rule->set != set)
5050 break;
5052 if (rule == NULL || rule->rulenum > rulenum)
5053 return 0; /* XXX error? */
5055 bzero(&dmsg, sizeof(dmsg));
5056 nmsg = &dmsg.base;
5057 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5058 ipfw_alt_move_rule_dispatch);
5059 dmsg.start_rule = rule;
5060 dmsg.rulenum = rulenum;
5061 dmsg.to_set = set;
5063 netisr_domsg_global(nmsg);
5064 KKASSERT(dmsg.start_rule == NULL);
5065 return 0;
5068 static void
5069 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
5071 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5072 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5073 struct ip_fw *rule;
5075 ASSERT_NETISR_NCPUS(mycpuid);
5077 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5078 if (rule->set == dmsg->from_set)
5079 rule->set = dmsg->to_set;
5081 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5084 static int
5085 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
5087 struct netmsg_del dmsg;
5088 struct netmsg_base *nmsg;
5090 ASSERT_NETISR0;
5092 bzero(&dmsg, sizeof(dmsg));
5093 nmsg = &dmsg.base;
5094 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5095 ipfw_alt_move_ruleset_dispatch);
5096 dmsg.from_set = from_set;
5097 dmsg.to_set = to_set;
5099 netisr_domsg_global(nmsg);
5100 return 0;
5103 static void
5104 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
5106 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5107 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5108 struct ip_fw *rule;
5110 ASSERT_NETISR_NCPUS(mycpuid);
5112 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5113 if (rule->set == dmsg->from_set)
5114 rule->set = dmsg->to_set;
5115 else if (rule->set == dmsg->to_set)
5116 rule->set = dmsg->from_set;
5118 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5121 static int
5122 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
5124 struct netmsg_del dmsg;
5125 struct netmsg_base *nmsg;
5127 ASSERT_NETISR0;
5129 bzero(&dmsg, sizeof(dmsg));
5130 nmsg = &dmsg.base;
5131 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5132 ipfw_alt_swap_ruleset_dispatch);
5133 dmsg.from_set = set1;
5134 dmsg.to_set = set2;
5136 netisr_domsg_global(nmsg);
5137 return 0;
5138 }
5140 /*
5141 * Remove all rules with given number, and also do set manipulation.
5142 *
5143 * The argument is a uint32_t. The low 16 bits are the rule or set number,
5144 * the next 8 bits are the new set, the top 8 bits are the command:
5145 *
5146 * 0 delete rules with given number
5147 * 1 delete rules with given set number
5148 * 2 move rules with given number to new set
5149 * 3 move rules with given set number to new set
5150 * 4 swap sets with given numbers
5151 */
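/*
 * (Added example:) moving rule 100 to set 3 (command 2) would be
 * encoded by userland as
 *
 *	uint32_t arg = (2 << 24) | (3 << 16) | 100;
 *
 * which the decoder below splits back into cmd, new_set and rulenum.
 */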
5152 static int
5153 ipfw_ctl_alter(uint32_t arg)
5155 uint16_t rulenum;
5156 uint8_t cmd, new_set;
5157 int error = 0;
5159 ASSERT_NETISR0;
5161 rulenum = arg & 0xffff;
5162 cmd = (arg >> 24) & 0xff;
5163 new_set = (arg >> 16) & 0xff;
5165 if (cmd > 4)
5166 return EINVAL;
5167 if (new_set >= IPFW_DEFAULT_SET)
5168 return EINVAL;
5169 if (cmd == 0 || cmd == 2) {
5170 if (rulenum == IPFW_DEFAULT_RULE)
5171 return EINVAL;
5172 } else {
5173 if (rulenum >= IPFW_DEFAULT_SET)
5174 return EINVAL;
5177 switch (cmd) {
5178 case 0: /* delete rules with given number */
5179 error = ipfw_alt_delete_rule(rulenum);
5180 break;
5182 case 1: /* delete all rules with given set number */
5183 error = ipfw_alt_delete_ruleset(rulenum);
5184 break;
5186 case 2: /* move rules with given number to new set */
5187 error = ipfw_alt_move_rule(rulenum, new_set);
5188 break;
5190 case 3: /* move rules with given set number to new set */
5191 error = ipfw_alt_move_ruleset(rulenum, new_set);
5192 break;
5194 case 4: /* swap two sets */
5195 error = ipfw_alt_swap_ruleset(rulenum, new_set);
5196 break;
5198 return error;
5199 }
5201 /*
5202 * Clear counters for a specific rule.
5203 */
5204 static void
5205 clear_counters(struct ip_fw *rule, int log_only)
5207 ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
5209 if (log_only == 0) {
5210 rule->bcnt = rule->pcnt = 0;
5211 rule->timestamp = 0;
5213 if (l->o.opcode == O_LOG)
5214 l->log_left = l->max_log;
5217 static void
5218 ipfw_zero_entry_dispatch(netmsg_t nmsg)
5220 struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
5221 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5222 struct ip_fw *rule;
5224 ASSERT_NETISR_NCPUS(mycpuid);
5226 if (zmsg->rulenum == 0) {
5227 KKASSERT(zmsg->start_rule == NULL);
5229 ctx->ipfw_norule_counter = 0;
5230 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5231 clear_counters(rule, zmsg->log_only);
5232 } else {
5233 struct ip_fw *start = zmsg->start_rule;
5235 KKASSERT(start->cpuid == mycpuid);
5236 KKASSERT(start->rulenum == zmsg->rulenum);
5238 /*
5239 * We can have multiple rules with the same number, so we
5240 * need to clear them all.
5241 */
5242 for (rule = start; rule && rule->rulenum == zmsg->rulenum;
5243 rule = rule->next)
5244 clear_counters(rule, zmsg->log_only);
5246 /*
5247 * Move to the position on the next CPU
5248 * before the msg is forwarded.
5249 */
5250 zmsg->start_rule = start->sibling;
5251 }
5252 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5253 }
5255 /*
5256 * Reset some or all counters on firewall rules.
5257 * @arg rulenum is 0 to clear all entries, or contains a specific
5258 * rule number.
5259 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
5260 */
5261 static int
5262 ipfw_ctl_zero_entry(int rulenum, int log_only)
5264 struct netmsg_zent zmsg;
5265 struct netmsg_base *nmsg;
5266 const char *msg;
5267 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5269 ASSERT_NETISR0;
5271 bzero(&zmsg, sizeof(zmsg));
5272 nmsg = &zmsg.base;
5273 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5274 ipfw_zero_entry_dispatch);
5275 zmsg.log_only = log_only;
5277 if (rulenum == 0) {
5278 msg = log_only ? "ipfw: All logging counts reset.\n"
5279 : "ipfw: Accounting cleared.\n";
5280 } else {
5281 struct ip_fw *rule;
5283 /*
5284 * Locate the first rule with 'rulenum'
5285 */
5286 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5287 if (rule->rulenum == rulenum)
5288 break;
5290 if (rule == NULL) /* we did not find any matching rules */
5291 return (EINVAL);
5292 zmsg.start_rule = rule;
5293 zmsg.rulenum = rulenum;
5295 msg = log_only ? "ipfw: Entry %d logging count reset.\n"
5296 : "ipfw: Entry %d cleared.\n";
5298 netisr_domsg_global(nmsg);
5299 KKASSERT(zmsg.start_rule == NULL);
5301 if (fw_verbose)
5302 log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
5303 return (0);
5304 }
5306 /*
5307 * Check validity of the structure before insert.
5308 * Fortunately rules are simple, so this mostly needs to check rule sizes.
5309 */
5310 static int
5311 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
5313 int l, cmdlen = 0;
5314 int have_action = 0;
5315 ipfw_insn *cmd;
5317 *rule_flags = 0;
5319 /* Check for valid size */
5320 if (size < sizeof(*rule)) {
5321 kprintf("ipfw: rule too short\n");
5322 return EINVAL;
5324 l = IOC_RULESIZE(rule);
5325 if (l != size) {
5326 kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
5327 return EINVAL;
5330 /* Check rule number */
5331 if (rule->rulenum == IPFW_DEFAULT_RULE) {
5332 kprintf("ipfw: invalid rule number\n");
5333 return EINVAL;
5334 }
5336 /*
5337 * Now go for the individual checks. Very simple ones, basically only
5338 * instruction sizes.
5339 */
5340 for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
5341 l -= cmdlen, cmd += cmdlen) {
5342 cmdlen = F_LEN(cmd);
5343 if (cmdlen > l) {
5344 kprintf("ipfw: opcode %d size truncated\n",
5345 cmd->opcode);
5346 return EINVAL;
5349 DPRINTF("ipfw: opcode %d\n", cmd->opcode);
5351 if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
5352 IPFW_ISXLAT(cmd->opcode)) {
5353 /* This rule will generate states. */
5354 *rule_flags |= IPFW_RULE_F_GENSTATE;
5355 if (cmd->opcode == O_LIMIT)
5356 *rule_flags |= IPFW_RULE_F_GENTRACK;
5358 if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
5359 *rule_flags |= IPFW_RULE_F_CROSSREF;
5360 if (cmd->opcode == O_IP_SRC_IFIP ||
5361 cmd->opcode == O_IP_DST_IFIP) {
5362 *rule_flags |= IPFW_RULE_F_DYNIFADDR;
5363 cmd->arg1 &= IPFW_IFIP_SETTINGS;
5366 switch (cmd->opcode) {
5367 case O_NOP:
5368 case O_PROBE_STATE:
5369 case O_KEEP_STATE:
5370 case O_PROTO:
5371 case O_IP_SRC_ME:
5372 case O_IP_DST_ME:
5373 case O_LAYER2:
5374 case O_IN:
5375 case O_FRAG:
5376 case O_IPFRAG:
5377 case O_IPOPT:
5378 case O_IPLEN:
5379 case O_IPID:
5380 case O_IPTOS:
5381 case O_IPPRECEDENCE:
5382 case O_IPTTL:
5383 case O_IPVER:
5384 case O_TCPWIN:
5385 case O_TCPFLAGS:
5386 case O_TCPOPTS:
5387 case O_ESTAB:
5388 if (cmdlen != F_INSN_SIZE(ipfw_insn))
5389 goto bad_size;
5390 break;
5392 case O_IP_SRC_TABLE:
5393 case O_IP_DST_TABLE:
5394 if (cmdlen != F_INSN_SIZE(ipfw_insn))
5395 goto bad_size;
5396 if (cmd->arg1 >= ipfw_table_max) {
5397 kprintf("ipfw: invalid table id %u, max %d\n",
5398 cmd->arg1, ipfw_table_max);
5399 return EINVAL;
5401 break;
5403 case O_IP_SRC_IFIP:
5404 case O_IP_DST_IFIP:
5405 if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
5406 goto bad_size;
5407 break;
5409 case O_ICMPCODE:
5410 case O_ICMPTYPE:
5411 if (cmdlen < F_INSN_SIZE(ipfw_insn_u32))
5412 goto bad_size;
5413 break;
5415 case O_UID:
5416 case O_GID:
5417 case O_IP_SRC:
5418 case O_IP_DST:
5419 case O_TCPSEQ:
5420 case O_TCPACK:
5421 case O_PROB:
5422 if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
5423 goto bad_size;
5424 break;
5426 case O_LIMIT:
5427 if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
5428 goto bad_size;
5429 break;
5430 case O_REDIRECT:
5431 if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
5432 goto bad_size;
5433 break;
5435 case O_LOG:
5436 if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
5437 goto bad_size;
5439 ((ipfw_insn_log *)cmd)->log_left =
5440 ((ipfw_insn_log *)cmd)->max_log;
5442 break;
5444 case O_IP_SRC_MASK:
5445 case O_IP_DST_MASK:
5446 if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
5447 goto bad_size;
5448 if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
5449 kprintf("ipfw: opcode %d, useless rule\n",
5450 cmd->opcode);
5451 return EINVAL;
5453 break;
5455 case O_IP_SRC_SET:
5456 case O_IP_DST_SET:
5457 if (cmd->arg1 == 0 || cmd->arg1 > 256) {
5458 kprintf("ipfw: invalid set size %d\n",
5459 cmd->arg1);
5460 return EINVAL;
5462 if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
5463 (cmd->arg1+31)/32 )
5464 goto bad_size;
5465 break;
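/*
 * (Added example:) an address set of arg1 = 64 entries carries a
 * (64+31)/32 = 2 word bitmap, so cmdlen must be
 * F_INSN_SIZE(ipfw_insn_u32) + 2 for the size check above to pass.
 */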
5467 case O_MACADDR2:
5468 if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
5469 goto bad_size;
5470 break;
5472 case O_MAC_TYPE:
5473 case O_IP_SRCPORT:
5474 case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
5475 if (cmdlen < 2 || cmdlen > 31)
5476 goto bad_size;
5477 break;
5479 case O_RECV:
5480 case O_XMIT:
5481 case O_VIA:
5482 if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
5483 goto bad_size;
5484 break;
5486 case O_PIPE:
5487 case O_QUEUE:
5488 if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
5489 goto bad_size;
5490 goto check_action;
5492 case O_FORWARD_IP:
5493 if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
5494 goto bad_size;
5495 } else {
5496 in_addr_t fwd_addr;
5498 fwd_addr = ((ipfw_insn_sa *)cmd)->
5499 sa.sin_addr.s_addr;
5500 if (IN_MULTICAST(ntohl(fwd_addr))) {
5501 kprintf("ipfw: try forwarding to "
5502 "multicast address\n");
5503 return EINVAL;
5506 goto check_action;
5508 case O_FORWARD_MAC: /* XXX not implemented yet */
5509 case O_CHECK_STATE:
5510 case O_COUNT:
5511 case O_ACCEPT:
5512 case O_DENY:
5513 case O_REJECT:
5514 case O_SKIPTO:
5515 case O_DIVERT:
5516 case O_TEE:
5517 case O_DEFRAG:
5518 if (cmdlen != F_INSN_SIZE(ipfw_insn))
5519 goto bad_size;
5520 check_action:
5521 if (have_action) {
5522 kprintf("ipfw: opcode %d, multiple actions"
5523 " not allowed\n",
5524 cmd->opcode);
5525 return EINVAL;
5527 have_action = 1;
5528 if (l != cmdlen) {
5529 kprintf("ipfw: opcode %d, action must be"
5530 " last opcode\n",
5531 cmd->opcode);
5532 return EINVAL;
5534 break;
5535 default:
5536 kprintf("ipfw: opcode %d, unknown opcode\n",
5537 cmd->opcode);
5538 return EINVAL;
5541 if (have_action == 0) {
5542 kprintf("ipfw: missing action\n");
5543 return EINVAL;
5545 return 0;
5547 bad_size:
5548 kprintf("ipfw: opcode %d size %d wrong\n",
5549 cmd->opcode, cmdlen);
5550 return EINVAL;
5553 static int
5554 ipfw_ctl_add_rule(struct sockopt *sopt)
5556 struct ipfw_ioc_rule *ioc_rule;
5557 size_t size;
5558 uint32_t rule_flags;
5559 int error;
5561 ASSERT_NETISR0;
5563 size = sopt->sopt_valsize;
5564 if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
5565 size < sizeof(*ioc_rule)) {
5566 return EINVAL;
5568 if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
5569 sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
5570 IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
5572 ioc_rule = sopt->sopt_val;
5574 error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
5575 if (error)
5576 return error;
5578 ipfw_add_rule(ioc_rule, rule_flags);
5580 if (sopt->sopt_dir == SOPT_GET)
5581 sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
5582 return 0;
5585 static void *
5586 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
5587 struct ipfw_ioc_rule *ioc_rule)
5589 const struct ip_fw *sibling;
5590 #ifdef INVARIANTS
5591 int i;
5592 #endif
5594 ASSERT_NETISR0;
5595 KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
5597 ioc_rule->act_ofs = rule->act_ofs;
5598 ioc_rule->cmd_len = rule->cmd_len;
5599 ioc_rule->rulenum = rule->rulenum;
5600 ioc_rule->set = rule->set;
5601 ioc_rule->usr_flags = rule->usr_flags;
5603 ioc_rule->set_disable = ctx->ipfw_set_disable;
5604 ioc_rule->static_count = static_count;
5605 ioc_rule->static_len = static_ioc_len;
5607 /*
5608 * Visit (read-only) all of the rule's duplications to get
5609 * the necessary statistics
5610 */
5611 #ifdef INVARIANTS
5612 i = 0;
5613 #endif
5614 ioc_rule->pcnt = 0;
5615 ioc_rule->bcnt = 0;
5616 ioc_rule->timestamp = 0;
5617 for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
5618 ioc_rule->pcnt += sibling->pcnt;
5619 ioc_rule->bcnt += sibling->bcnt;
5620 if (sibling->timestamp > ioc_rule->timestamp)
5621 ioc_rule->timestamp = sibling->timestamp;
5622 #ifdef INVARIANTS
5623 ++i;
5624 #endif
5626 KASSERT(i == netisr_ncpus,
5627 ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
5629 bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);
5631 return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
5632 }
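/*
 * (Added note:) the sibling walk above folds the per-CPU duplications
 * into one ioctl view: pcnt/bcnt are summed across the sibling chain
 * and the newest timestamp wins, so userland sees a single rule with
 * aggregate statistics.
 */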
5634 static boolean_t
5635 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
5637 struct ipfw_ioc_flowid *ioc_id;
5639 if (trk->tc_expire == 0) {
5640 /* Not a scanned one. */
5641 return (FALSE);
5644 ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
5645 0 : trk->tc_expire - time_uptime;
5646 ioc_state->pcnt = 0;
5647 ioc_state->bcnt = 0;
5649 ioc_state->dyn_type = O_LIMIT_PARENT;
5650 ioc_state->count = trk->tc_count;
5652 ioc_state->rulenum = trk->tc_rulenum;
5654 ioc_id = &ioc_state->id;
5655 ioc_id->type = ETHERTYPE_IP;
5656 ioc_id->u.ip.proto = trk->tc_proto;
5657 ioc_id->u.ip.src_ip = trk->tc_saddr;
5658 ioc_id->u.ip.dst_ip = trk->tc_daddr;
5659 ioc_id->u.ip.src_port = trk->tc_sport;
5660 ioc_id->u.ip.dst_port = trk->tc_dport;
5662 return (TRUE);
5665 static boolean_t
5666 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
5668 struct ipfw_ioc_flowid *ioc_id;
5670 if (IPFW_STATE_SCANSKIP(s))
5671 return (FALSE);
5673 ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
5674 0 : s->st_expire - time_uptime;
5675 ioc_state->pcnt = s->st_pcnt;
5676 ioc_state->bcnt = s->st_bcnt;
5678 ioc_state->dyn_type = s->st_type;
5679 ioc_state->count = 0;
5681 ioc_state->rulenum = s->st_rule->rulenum;
5683 ioc_id = &ioc_state->id;
5684 ioc_id->type = ETHERTYPE_IP;
5685 ioc_id->u.ip.proto = s->st_proto;
5686 ipfw_key_4tuple(&s->st_key,
5687 &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
5688 &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
5690 if (IPFW_ISXLAT(s->st_type)) {
5691 const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
5693 if (x->xlat_port == 0)
5694 ioc_state->xlat_port = ioc_id->u.ip.dst_port;
5695 else
5696 ioc_state->xlat_port = ntohs(x->xlat_port);
5697 ioc_state->xlat_addr = ntohl(x->xlat_addr);
5699 ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
5700 ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
5703 return (TRUE);
5706 static void
5707 ipfw_state_copy_dispatch(netmsg_t nmsg)
5709 struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
5710 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5711 const struct ipfw_state *s;
5712 const struct ipfw_track *t;
5714 ASSERT_NETISR_NCPUS(mycpuid);
5715 KASSERT(nm->state_cnt < nm->state_cntmax,
5716 ("invalid state count %d, max %d",
5717 nm->state_cnt, nm->state_cntmax));
5719 TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
5720 if (ipfw_state_copy(s, nm->ioc_state)) {
5721 nm->ioc_state++;
5722 nm->state_cnt++;
5723 if (nm->state_cnt == nm->state_cntmax)
5724 goto done;
5725 }
5726 }
5728 /*
5729 * Prepare tracks in the global track tree for userland.
5730 */
5731 TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
5732 struct ipfw_trkcnt *trk;
5734 if (t->t_count == NULL) /* anchor */
5735 continue;
5736 trk = t->t_trkcnt;
5738 /*
5739 * Only one netisr can run this function at
5740 * any time, and only this function accesses
5741 * trkcnt's tc_expire, so this is safe w/o
5742 * ipfw_gd.ipfw_trkcnt_token.
5743 */
5744 if (trk->tc_expire > t->t_expire)
5745 continue;
5746 trk->tc_expire = t->t_expire;
5747 }
5749 /*
5750 * Copy tracks in the global track tree to userland in
5751 * the last netisr.
5752 */
5753 if (mycpuid == netisr_ncpus - 1) {
5754 struct ipfw_trkcnt *trk;
5756 KASSERT(nm->state_cnt < nm->state_cntmax,
5757 ("invalid state count %d, max %d",
5758 nm->state_cnt, nm->state_cntmax));
5760 IPFW_TRKCNT_TOKGET;
5761 RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
5762 if (ipfw_track_copy(trk, nm->ioc_state)) {
5763 nm->ioc_state++;
5764 nm->state_cnt++;
5765 if (nm->state_cnt == nm->state_cntmax) {
5766 IPFW_TRKCNT_TOKREL;
5767 goto done;
5771 IPFW_TRKCNT_TOKREL;
5773 done:
5774 if (nm->state_cnt == nm->state_cntmax) {
5775 /* No more space; done. */
5776 netisr_replymsg(&nm->base, 0);
5777 } else {
5778 netisr_forwardmsg(&nm->base, mycpuid + 1);
5782 static int
5783 ipfw_ctl_get_rules(struct sockopt *sopt)
5785 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5786 struct ip_fw *rule;
5787 void *bp;
5788 size_t size;
5789 int state_cnt;
5791 ASSERT_NETISR0;
5793 /*
5794 * pass up a copy of the current rules. Static rules
5795 * come first (the last of which has number IPFW_DEFAULT_RULE),
5796 * followed by a possibly empty list of states.
5797 */
5799 size = static_ioc_len; /* size of static rules */
5801 /*
5802 * Size of the states.
5803 * XXX take tracks as state for userland compat.
5804 */
5805 state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
5806 state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
5807 size += state_cnt * sizeof(struct ipfw_ioc_state);
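/*
 * (Added example:) with 1000 states+tracks at snapshot time the 25%
 * headroom reserves room for (1000*5)/4 = 1250 ioc entries, presumably
 * to absorb states created while the copy is in flight.
 */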
5809 if (sopt->sopt_valsize < size) {
5810 /* short length, no need to return incomplete rules */
5811 /* XXX: if superuser, no need to zero buffer */
5812 bzero(sopt->sopt_val, sopt->sopt_valsize);
5813 return 0;
5815 bp = sopt->sopt_val;
5817 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5818 bp = ipfw_copy_rule(ctx, rule, bp);
5820 if (state_cnt) {
5821 struct netmsg_cpstate nm;
5822 #ifdef INVARIANTS
5823 size_t old_size = size;
5824 #endif
5826 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5827 MSGF_PRIORITY, ipfw_state_copy_dispatch);
5828 nm.ioc_state = bp;
5829 nm.state_cntmax = state_cnt;
5830 nm.state_cnt = 0;
5831 netisr_domsg_global(&nm.base);
5833 /*
5834 * The number of states may have shrunk after the snapshot
5835 * of the state count was taken. To give the user a correct
5836 * state count, nm->state_cnt is used to recalculate the
5837 * actual size.
5838 */
5839 size = static_ioc_len +
5840 (nm.state_cnt * sizeof(struct ipfw_ioc_state));
5841 KKASSERT(size <= old_size);
5844 sopt->sopt_valsize = size;
5845 return 0;
5848 static void
5849 ipfw_set_disable_dispatch(netmsg_t nmsg)
5851 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5853 ASSERT_NETISR_NCPUS(mycpuid);
5855 ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5856 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5859 static void
5860 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5862 struct netmsg_base nmsg;
5863 uint32_t set_disable;
5865 ASSERT_NETISR0;
5867 /* IPFW_DEFAULT_SET is always enabled */
5868 enable |= (1 << IPFW_DEFAULT_SET);
5869 set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5871 bzero(&nmsg, sizeof(nmsg));
5872 netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5873 ipfw_set_disable_dispatch);
5874 nmsg.lmsg.u.ms_result32 = set_disable;
5876 netisr_domsg_global(&nmsg);
5879 static void
5880 ipfw_table_create_dispatch(netmsg_t nm)
5882 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5883 int tblid = nm->lmsg.u.ms_result;
5885 ASSERT_NETISR_NCPUS(mycpuid);
5887 if (!rn_inithead(&ctx->ipfw_tables[tblid], rn_cpumaskhead(mycpuid),
5888 offsetof(struct sockaddr_in, sin_addr)))
5889 panic("ipfw: create table%d failed", tblid);
5891 netisr_forwardmsg(&nm->base, mycpuid + 1);
5894 static int
5895 ipfw_table_create(struct sockopt *sopt)
5897 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5898 struct ipfw_ioc_table *tbl;
5899 struct netmsg_base nm;
5901 ASSERT_NETISR0;
5903 if (sopt->sopt_valsize != sizeof(*tbl))
5904 return (EINVAL);
5906 tbl = sopt->sopt_val;
5907 if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5908 return (EINVAL);
5910 if (ctx->ipfw_tables[tbl->tableid] != NULL)
5911 return (EEXIST);
5913 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5914 ipfw_table_create_dispatch);
5915 nm.lmsg.u.ms_result = tbl->tableid;
5916 netisr_domsg_global(&nm);
5918 return (0);
5921 static void
5922 ipfw_table_killent(struct radix_node *rn)
5924 struct ipfw_tblent *te;
5926 te = (struct ipfw_tblent *)rn;
5927 kfree(te, M_IPFW);
5930 static void
5931 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5932 int destroy)
5934 struct radix_node_head *rnh;
5936 ASSERT_NETISR_NCPUS(mycpuid);
5938 rnh = ctx->ipfw_tables[tableid];
5939 rn_flush(rnh, ipfw_table_killent);
5940 if (destroy) {
5941 rn_freehead(rnh);
5942 ctx->ipfw_tables[tableid] = NULL;
5946 static void
5947 ipfw_table_flush_dispatch(netmsg_t nmsg)
5949 struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
5950 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5952 ASSERT_NETISR_NCPUS(mycpuid);
5954 ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
5955 netisr_forwardmsg(&nm->base, mycpuid + 1);
5958 static void
5959 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
5961 int i;
5963 ASSERT_NETISR_NCPUS(mycpuid);
5965 for (i = 0; i < ipfw_table_max; ++i) {
5966 if (ctx->ipfw_tables[i] != NULL)
5967 ipfw_table_flush_oncpu(ctx, i, destroy);
5971 static void
5972 ipfw_table_flushall_dispatch(netmsg_t nmsg)
5974 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5976 ASSERT_NETISR_NCPUS(mycpuid);
5978 ipfw_table_flushall_oncpu(ctx, 0);
5979 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5982 static int
5983 ipfw_table_flush(struct sockopt *sopt)
5985 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5986 struct ipfw_ioc_table *tbl;
5987 struct netmsg_tblflush nm;
5989 ASSERT_NETISR0;
5991 if (sopt->sopt_valsize != sizeof(*tbl))
5992 return (EINVAL);
5994 tbl = sopt->sopt_val;
5995 if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
5996 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5997 MSGF_PRIORITY, ipfw_table_flushall_dispatch);
5998 netisr_domsg_global(&nm.base);
5999 return (0);
6002 if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6003 return (EINVAL);
6005 if (ctx->ipfw_tables[tbl->tableid] == NULL)
6006 return (ENOENT);
6008 netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6009 ipfw_table_flush_dispatch);
6010 nm.tableid = tbl->tableid;
6011 nm.destroy = 0;
6012 if (sopt->sopt_name == IP_FW_TBL_DESTROY)
6013 nm.destroy = 1;
6014 netisr_domsg_global(&nm.base);
6016 return (0);
6019 static int
6020 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
6022 int *cnt = xcnt;
6024 (*cnt)++;
6025 return (0);
6028 static int
6029 ipfw_table_cpent(struct radix_node *rn, void *xcp)
6031 struct ipfw_table_cp *cp = xcp;
6032 struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6033 struct ipfw_ioc_tblent *ioc_te;
6034 #ifdef INVARIANTS
6035 int cnt;
6036 #endif
6038 KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
6039 cp->te_idx, cp->te_cnt));
6040 ioc_te = &cp->te[cp->te_idx];
6042 if (te->te_nodes->rn_mask != NULL) {
6043 memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
6044 *te->te_nodes->rn_mask);
6045 } else {
6046 ioc_te->netmask.sin_len = 0;
6048 memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));
6050 ioc_te->use = te->te_use;
6051 ioc_te->last_used = te->te_lastuse;
6052 #ifdef INVARIANTS
6053 cnt = 1;
6054 #endif
6056 while ((te = te->te_sibling) != NULL) {
6057 #ifdef INVARIANTS
6058 ++cnt;
6059 #endif
6060 ioc_te->use += te->te_use;
6061 if (te->te_lastuse > ioc_te->last_used)
6062 ioc_te->last_used = te->te_lastuse;
6064 KASSERT(cnt == netisr_ncpus,
6065 ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));
6067 cp->te_idx++;
6069 return (0);
6072 static int
6073 ipfw_table_get(struct sockopt *sopt)
6075 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6076 struct radix_node_head *rnh;
6077 struct ipfw_ioc_table *tbl;
6078 struct ipfw_ioc_tblcont *cont;
6079 struct ipfw_table_cp cp;
6080 int cnt = 0, sz;
6082 ASSERT_NETISR0;
6084 if (sopt->sopt_valsize < sizeof(*tbl))
6085 return (EINVAL);
6087 tbl = sopt->sopt_val;
6088 if (tbl->tableid < 0) {
6089 struct ipfw_ioc_tbllist *list;
6090 int i;
6092 /*
6093 * List available table ids.
6094 */
6095 for (i = 0; i < ipfw_table_max; ++i) {
6096 if (ctx->ipfw_tables[i] != NULL)
6097 ++cnt;
6100 sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
6101 if (sopt->sopt_valsize < sz) {
6102 bzero(sopt->sopt_val, sopt->sopt_valsize);
6103 return (E2BIG);
6105 list = sopt->sopt_val;
6106 list->tablecnt = cnt;
6108 cnt = 0;
6109 for (i = 0; i < ipfw_table_max; ++i) {
6110 if (ctx->ipfw_tables[i] != NULL) {
6111 KASSERT(cnt < list->tablecnt,
6112 ("invalid idx %d, cnt %d",
6113 cnt, list->tablecnt));
6114 list->tables[cnt++] = i;
6117 sopt->sopt_valsize = sz;
6118 return (0);
6119 } else if (tbl->tableid >= ipfw_table_max) {
6120 return (EINVAL);
6123 rnh = ctx->ipfw_tables[tbl->tableid];
6124 if (rnh == NULL)
6125 return (ENOENT);
6126 rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);
6128 sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
6129 if (sopt->sopt_valsize < sz) {
6130 bzero(sopt->sopt_val, sopt->sopt_valsize);
6131 return (E2BIG);
6133 cont = sopt->sopt_val;
6134 cont->entcnt = cnt;
6136 cp.te = cont->ent;
6137 cp.te_idx = 0;
6138 cp.te_cnt = cnt;
6139 rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);
6141 sopt->sopt_valsize = sz;
6142 return (0);
6145 static void
6146 ipfw_table_add_dispatch(netmsg_t nmsg)
6148 struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6149 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6150 struct radix_node_head *rnh;
6151 struct ipfw_tblent *te;
6153 ASSERT_NETISR_NCPUS(mycpuid);
6155 rnh = ctx->ipfw_tables[nm->tableid];
6157 te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
6158 te->te_nodes->rn_key = (char *)&te->te_key;
6159 memcpy(&te->te_key, nm->key, sizeof(te->te_key));
6161 if (rnh->rnh_addaddr(&te->te_key, nm->netmask, rnh, te->te_nodes)
6162 == NULL) {
6163 if (mycpuid == 0) {
6164 kfree(te, M_IPFW);
6165 netisr_replymsg(&nm->base, EEXIST);
6166 return;
6168 panic("rnh_addaddr failed");
6171 /* Link siblings. */
6172 if (nm->sibling != NULL)
6173 nm->sibling->te_sibling = te;
6174 nm->sibling = te;
6176 netisr_forwardmsg(&nm->base, mycpuid + 1);
6179 static void
6180 ipfw_table_del_dispatch(netmsg_t nmsg)
6182 struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6183 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6184 struct radix_node_head *rnh;
6185 struct radix_node *rn;
6187 ASSERT_NETISR_NCPUS(mycpuid);
6189 rnh = ctx->ipfw_tables[nm->tableid];
6190 rn = rnh->rnh_deladdr(nm->key, nm->netmask, rnh);
6191 if (rn == NULL) {
6192 if (mycpuid == 0) {
6193 netisr_replymsg(&nm->base, ESRCH);
6194 return;
6196 panic("rnh_deladdr failed");
6198 kfree(rn, M_IPFW);
6200 netisr_forwardmsg(&nm->base, mycpuid + 1);
6203 static int
6204 ipfw_table_alt(struct sockopt *sopt)
6206 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6207 struct ipfw_ioc_tblcont *tbl;
6208 struct ipfw_ioc_tblent *te;
6209 struct sockaddr_in key0;
6210 struct sockaddr *netmask = NULL, *key;
6211 struct netmsg_tblent nm;
6213 ASSERT_NETISR0;
6215 if (sopt->sopt_valsize != sizeof(*tbl))
6216 return (EINVAL);
6217 tbl = sopt->sopt_val;
6219 if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6220 return (EINVAL);
6221 if (tbl->entcnt != 1)
6222 return (EINVAL);
6224 if (ctx->ipfw_tables[tbl->tableid] == NULL)
6225 return (ENOENT);
6226 te = &tbl->ent[0];
6228 if (te->key.sin_family != AF_INET ||
6229 te->key.sin_port != 0 ||
6230 te->key.sin_len != sizeof(struct sockaddr_in))
6231 return (EINVAL);
6232 key = (struct sockaddr *)&te->key;
6234 if (te->netmask.sin_len != 0) {
6235 if (te->netmask.sin_port != 0 ||
6236 te->netmask.sin_len > sizeof(struct sockaddr_in))
6237 return (EINVAL);
6238 netmask = (struct sockaddr *)&te->netmask;
6239 sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
6240 key = (struct sockaddr *)&key0;
6243 if (sopt->sopt_name == IP_FW_TBL_ADD) {
6244 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6245 MSGF_PRIORITY, ipfw_table_add_dispatch);
6246 } else {
6247 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6248 MSGF_PRIORITY, ipfw_table_del_dispatch);
6250 nm.key = key;
6251 nm.netmask = netmask;
6252 nm.tableid = tbl->tableid;
6253 nm.sibling = NULL;
6254 return (netisr_domsg_global(&nm.base));
6255 }
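/*
 * (Added note:) table add/delete follows the same per-CPU pattern as
 * rules: the sockaddr key is validated once on netisr0, then a single
 * netmsg visits every CPU, inserting into (or deleting from) that
 * CPU's private radix tree and linking entries via te_sibling, so
 * per-packet lookups stay CPU-local.
 */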
6257 static int
6258 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
6260 struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6262 te->te_use = 0;
6263 te->te_lastuse = 0;
6264 return (0);
6267 static void
6268 ipfw_table_zero_dispatch(netmsg_t nmsg)
6270 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6271 struct radix_node_head *rnh;
6273 ASSERT_NETISR_NCPUS(mycpuid);
6275 rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
6276 rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6278 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6281 static void
6282 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
6284 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6285 int i;
6287 ASSERT_NETISR_NCPUS(mycpuid);
6289 for (i = 0; i < ipfw_table_max; ++i) {
6290 struct radix_node_head *rnh = ctx->ipfw_tables[i];
6292 if (rnh != NULL)
6293 rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6295 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6298 static int
6299 ipfw_table_zero(struct sockopt *sopt)
6301 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6302 struct netmsg_base nm;
6303 struct ipfw_ioc_table *tbl;
6305 ASSERT_NETISR0;
6307 if (sopt->sopt_valsize != sizeof(*tbl))
6308 return (EINVAL);
6309 tbl = sopt->sopt_val;
6311 if (tbl->tableid < 0) {
6312 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6313 ipfw_table_zeroall_dispatch);
6314 netisr_domsg_global(&nm);
6315 return (0);
6316 } else if (tbl->tableid >= ipfw_table_max) {
6317 return (EINVAL);
6318 } else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
6319 return (ENOENT);
6322 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6323 ipfw_table_zero_dispatch);
6324 nm.lmsg.u.ms_result = tbl->tableid;
6325 netisr_domsg_global(&nm);
6327 return (0);
6330 static int
6331 ipfw_table_killexp(struct radix_node *rn, void *xnm)
6333 struct netmsg_tblexp *nm = xnm;
6334 struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6335 struct radix_node *ret;
6337 if (te->te_expired) {
6338 ret = nm->rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, nm->rnh);
6339 if (ret != rn)
6340 panic("deleted other table entry");
6341 kfree(ret, M_IPFW);
6342 nm->expcnt++;
6344 return (0);
6347 static void
6348 ipfw_table_expire_dispatch(netmsg_t nmsg)
6350 struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6351 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6352 struct radix_node_head *rnh;
6354 ASSERT_NETISR_NCPUS(mycpuid);
6356 rnh = ctx->ipfw_tables[nm->tableid];
6357 nm->rnh = rnh;
6358 rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6360 KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6361 ("not all expired addresses (%d) were deleted (%d)",
6362 nm->cnt * (mycpuid + 1), nm->expcnt));
6364 netisr_forwardmsg(&nm->base, mycpuid + 1);
6367 static void
6368 ipfw_table_expireall_dispatch(netmsg_t nmsg)
6370 struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6371 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6372 int i;
6374 ASSERT_NETISR_NCPUS(mycpuid);
6376 for (i = 0; i < ipfw_table_max; ++i) {
6377 struct radix_node_head *rnh = ctx->ipfw_tables[i];
6379 if (rnh == NULL)
6380 continue;
6381 nm->rnh = rnh;
6382 rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6385 KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6386 ("not all expired addresses (%d) were deleted (%d)",
6387 nm->cnt * (mycpuid + 1), nm->expcnt));
6389 netisr_forwardmsg(&nm->base, mycpuid + 1);
6392 static int
6393 ipfw_table_markexp(struct radix_node *rn, void *xnm)
6395 struct netmsg_tblexp *nm = xnm;
6396 struct ipfw_tblent *te;
6397 time_t lastuse;
6399 te = (struct ipfw_tblent *)rn;
6400 lastuse = te->te_lastuse;
6402 while ((te = te->te_sibling) != NULL) {
6403 if (te->te_lastuse > lastuse)
6404 lastuse = te->te_lastuse;
6406 if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
6407 /* Not expired */
6408 return (0);
6411 te = (struct ipfw_tblent *)rn;
6412 te->te_expired = 1;
6413 while ((te = te->te_sibling) != NULL)
6414 te->te_expired = 1;
6415 nm->cnt++;
6417 return (0);
6418 }
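/*
 * (Added note:) expiration is two-pass: netisr0 first walks its tree
 * with ipfw_table_markexp() and marks whole sibling chains whose
 * newest te_lastuse falls outside the expire window, then the
 * dispatch below deletes the marked entries on every CPU; the
 * KASSERTs verify that each CPU removed exactly the marked count.
 */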
6420 static int
6421 ipfw_table_expire(struct sockopt *sopt)
6423 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6424 struct netmsg_tblexp nm;
6425 struct ipfw_ioc_tblexp *tbl;
6426 struct radix_node_head *rnh;
6428 ASSERT_NETISR0;
6430 if (sopt->sopt_valsize != sizeof(*tbl))
6431 return (EINVAL);
6432 tbl = sopt->sopt_val;
6433 tbl->expcnt = 0;
6435 nm.expcnt = 0;
6436 nm.cnt = 0;
6437 nm.expire = tbl->expire;
6439 if (tbl->tableid < 0) {
6440 int i;
6442 for (i = 0; i < ipfw_table_max; ++i) {
6443 rnh = ctx->ipfw_tables[i];
6444 if (rnh == NULL)
6445 continue;
6446 rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6448 if (nm.cnt == 0) {
6449 /* No addresses can be expired. */
6450 return (0);
6452 tbl->expcnt = nm.cnt;
6454 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6455 MSGF_PRIORITY, ipfw_table_expireall_dispatch);
6456 nm.tableid = -1;
6457 netisr_domsg_global(&nm.base);
6458 KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6459 ("not all expired addresses (%d) were deleted (%d)",
6460 nm.cnt * netisr_ncpus, nm.expcnt));
6462 return (0);
6463 } else if (tbl->tableid >= ipfw_table_max) {
6464 return (EINVAL);
6467 rnh = ctx->ipfw_tables[tbl->tableid];
6468 if (rnh == NULL)
6469 return (ENOENT);
6470 rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6471 if (nm.cnt == 0) {
6472 /* No addresses can be expired. */
6473 return (0);
6475 tbl->expcnt = nm.cnt;
6477 netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6478 ipfw_table_expire_dispatch);
6479 nm.tableid = tbl->tableid;
6480 netisr_domsg_global(&nm.base);
6481 KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6482 ("not all expired addresses (%d) were deleted (%d)",
6483 nm.cnt * netisr_ncpus, nm.expcnt));
6484 return (0);
6487 static void
6488 ipfw_crossref_free_dispatch(netmsg_t nmsg)
6490 struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
6492 KKASSERT((rule->rule_flags &
6493 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6494 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6495 ipfw_free_rule(rule);
6497 netisr_replymsg(&nmsg->base, 0);
6500 static void
6501 ipfw_crossref_reap(void)
6503 struct ip_fw *rule, *prev = NULL;
6505 ASSERT_NETISR0;
6507 rule = ipfw_gd.ipfw_crossref_free;
6508 while (rule != NULL) {
6509 uint64_t inflight = 0;
6510 int i;
6512 for (i = 0; i < netisr_ncpus; ++i)
6513 inflight += rule->cross_rules[i]->cross_refs;
6514 if (inflight == 0) {
6515 struct ip_fw *f = rule;
6518 * Unlink.
6520 rule = rule->next;
6521 if (prev != NULL)
6522 prev->next = rule;
6523 else
6524 ipfw_gd.ipfw_crossref_free = rule;
6527 * Free.
6529 for (i = 1; i < netisr_ncpus; ++i) {
6530 struct netmsg_base nm;
6532 netmsg_init(&nm, NULL, &curthread->td_msgport,
6533 MSGF_PRIORITY, ipfw_crossref_free_dispatch);
6534 nm.lmsg.u.ms_resultp = f->cross_rules[i];
6535 netisr_domsg(&nm, i);
6537 KKASSERT((f->rule_flags &
6538 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6539 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6540 ipfw_unref_rule(f);
6541 } else {
6542 prev = rule;
6543 rule = rule->next;
6547 if (ipfw_gd.ipfw_crossref_free != NULL) {
6548 callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
6549 ipfw_crossref_timeo, NULL);
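/*
 * Lifecycle of a cross-referenced rule, as the code above implies:
 * a deleted rule with cross-CPU siblings is parked on
 * ipfw_gd.ipfw_crossref_free and stays there until the sum of the
 * siblings' cross_refs (packets still in flight toward other CPUs)
 * drains to zero.  The CPU1..N siblings are then freed in their
 * owning netisrs via ipfw_crossref_free_dispatch(), the CPU0 copy
 * is dropped with ipfw_unref_rule(), and anything still in flight
 * is retried by the one-second callout armed above.
 */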
6554 * {set|get}sockopt parser.
6556 static int
6557 ipfw_ctl(struct sockopt *sopt)
6559 int error, rulenum;
6560 uint32_t *masks;
6561 size_t size;
6563 ASSERT_NETISR0;
6565 error = 0;
6567 switch (sopt->sopt_name) {
6568 case IP_FW_GET:
6569 error = ipfw_ctl_get_rules(sopt);
6570 break;
6572 case IP_FW_FLUSH:
6573 ipfw_flush(0 /* keep default rule */);
6574 break;
6576 case IP_FW_ADD:
6577 error = ipfw_ctl_add_rule(sopt);
6578 break;
6580 case IP_FW_DEL:
6582 * IP_FW_DEL is used for deleting single rules or sets,
6583 * and (ab)used to atomically manipulate sets.
6584 * Argument size is used to distinguish between the two:
6585 * sizeof(uint32_t)
6586 * delete single rule or set of rules,
6587 * or reassign rules (or sets) to a different set.
6588 * 2 * sizeof(uint32_t)
6589 * atomic disable/enable sets.
6590 * first uint32_t contains sets to be disabled,
6591 * second uint32_t contains sets to be enabled.
6593 masks = sopt->sopt_val;
6594 size = sopt->sopt_valsize;
6595 if (size == sizeof(*masks)) {
6597 * Delete or reassign static rule
6599 error = ipfw_ctl_alter(masks[0]);
6600 } else if (size == (2 * sizeof(*masks))) {
6602 * Set enable/disable
6604 ipfw_ctl_set_disable(masks[0], masks[1]);
6605 } else {
6606 error = EINVAL;
6608 break;
6610 case IP_FW_ZERO:
6611 case IP_FW_RESETLOG: /* argument is an int, the rule number */
6612 rulenum = 0;
6614 if (sopt->sopt_val != 0) {
6615 error = soopt_to_kbuf(sopt, &rulenum,
6616 sizeof(int), sizeof(int));
6617 if (error)
6618 break;
6620 error = ipfw_ctl_zero_entry(rulenum,
6621 sopt->sopt_name == IP_FW_RESETLOG);
6622 break;
6624 case IP_FW_TBL_CREATE:
6625 error = ipfw_table_create(sopt);
6626 break;
6628 case IP_FW_TBL_ADD:
6629 case IP_FW_TBL_DEL:
6630 error = ipfw_table_alt(sopt);
6631 break;
6633 case IP_FW_TBL_FLUSH:
6634 case IP_FW_TBL_DESTROY:
6635 error = ipfw_table_flush(sopt);
6636 break;
6638 case IP_FW_TBL_GET:
6639 error = ipfw_table_get(sopt);
6640 break;
6642 case IP_FW_TBL_ZERO:
6643 error = ipfw_table_zero(sopt);
6644 break;
6646 case IP_FW_TBL_EXPIRE:
6647 error = ipfw_table_expire(sopt);
6648 break;
6650 default:
6651 kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
6652 error = EINVAL;
6655 ipfw_crossref_reap();
6656 return error;
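/*
 * A hypothetical userland sketch of the two IP_FW_DEL payload shapes
 * described in the comment above (raw socket `s` assumed; ipfw(8) is
 * the normal front end):
 */
#if 0
	uint32_t rule = 100;
	uint32_t masks[2] = { 0x08, 0x10 };	/* disable set 3, enable set 4 */

	/* One word: delete rule/set 100 (or reassign, per the encoding). */
	setsockopt(s, IPPROTO_IP, IP_FW_DEL, &rule, sizeof(rule));
	/* Two words: atomically disable and enable rule sets. */
	setsockopt(s, IPPROTO_IP, IP_FW_DEL, masks, sizeof(masks));
#endif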
6659 static void
6660 ipfw_keepalive_done(struct ipfw_context *ctx)
6663 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6664 ("keepalive is not in progress"));
6665 ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
6666 callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
6667 ipfw_keepalive, NULL);
6670 static void
6671 ipfw_keepalive_more(struct ipfw_context *ctx)
6673 struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
6675 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6676 ("keepalive is not in progress"));
6677 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
6678 ("keepalive more did not finish"));
6679 netisr_sendmsg_oncpu(nm);
6682 static void
6683 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
6685 struct ipfw_state *s;
6686 int scanned = 0, expired = 0, kept = 0;
6688 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6689 ("keepalive is not in progress"));
6691 while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
6692 uint32_t ack_rev, ack_fwd;
6693 struct ipfw_flow_id id;
6694 uint8_t send_dir;
6696 if (scanned++ >= ipfw_state_scan_max) {
6697 ipfw_keepalive_more(ctx);
6698 return;
6701 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6702 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
6705 * NOTE:
6706 * Don't use IPFW_STATE_SCANSKIP; keepalives must also be
6707 * performed on slave xlats.
6709 if (s->st_type == O_ANCHOR)
6710 continue;
6712 if (IPFW_STATE_ISDEAD(s)) {
6713 ipfw_state_remove(ctx, s);
6714 if (++expired >= ipfw_state_expire_max) {
6715 ipfw_keepalive_more(ctx);
6716 return;
6718 continue;
6722 * Keepalive processing.
6725 if (s->st_proto != IPPROTO_TCP)
6726 continue;
6727 if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
6728 continue;
6729 if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
6730 s->st_expire))
6731 continue; /* too early */
6733 ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6734 &id.dst_ip, &id.dst_port);
6735 ack_rev = s->st_ack_rev;
6736 ack_fwd = s->st_ack_fwd;
6738 #define SEND_FWD 0x1
6739 #define SEND_REV 0x2
6741 if (IPFW_ISXLAT(s->st_type)) {
6742 const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
6744 if (x->xlat_dir == MATCH_FORWARD)
6745 send_dir = SEND_FWD;
6746 else
6747 send_dir = SEND_REV;
6748 } else {
6749 send_dir = SEND_FWD | SEND_REV;
6752 if (send_dir & SEND_REV)
6753 send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6754 if (send_dir & SEND_FWD)
6755 send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6757 #undef SEND_FWD
6758 #undef SEND_REV
6760 if (++kept >= ipfw_keepalive_max) {
6761 ipfw_keepalive_more(ctx);
6762 return;
6765 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6766 ipfw_keepalive_done(ctx);
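/*
 * The probes above use the classic TCP keepalive trick: sending a
 * segment whose sequence number is one below what the peer expects
 * (ack - 1) forces the peer to answer with a pure ACK, and the ACKs
 * it elicits pass back through the firewall and refresh the state's
 * expiry.  For xlat states only the recorded direction is probed;
 * plain states are probed both ways (SEND_FWD | SEND_REV).
 */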
6769 static void
6770 ipfw_keepalive_more_dispatch(netmsg_t nm)
6772 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6773 struct ipfw_state *anchor;
6775 ASSERT_NETISR_NCPUS(mycpuid);
6776 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6777 ("keepalive is not in progress"));
6779 /* Reply ASAP */
6780 netisr_replymsg(&nm->base, 0);
6782 anchor = &ctx->ipfw_keepalive_anch;
6783 if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6784 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6785 ipfw_keepalive_done(ctx);
6786 return;
6788 ipfw_keepalive_loop(ctx, anchor);
6792 * This procedure is only used to handle keepalives. It is invoked
6793 * every dyn_keepalive_period
6795 static void
6796 ipfw_keepalive_dispatch(netmsg_t nm)
6798 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6799 struct ipfw_state *anchor;
6801 ASSERT_NETISR_NCPUS(mycpuid);
6802 KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6803 ("keepalive is in progress"));
6804 ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6806 /* Reply ASAP */
6807 crit_enter();
6808 netisr_replymsg(&nm->base, 0);
6809 crit_exit();
6811 if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6812 ipfw_keepalive_done(ctx);
6813 return;
6816 anchor = &ctx->ipfw_keepalive_anch;
6817 TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6818 ipfw_keepalive_loop(ctx, anchor);
6822 * This procedure is only used to handle keepalives. It is invoked
6823 * every dyn_keepalive_period
6825 static void
6826 ipfw_keepalive(void *dummy __unused)
6828 struct netmsg_base *msg;
6830 KKASSERT(mycpuid < netisr_ncpus);
6831 msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6833 crit_enter();
6834 if (msg->lmsg.ms_flags & MSGF_DONE)
6835 netisr_sendmsg_oncpu(msg);
6836 crit_exit();
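/*
 * Callout-to-netisr handoff: the per-CPU keepalive netmsg is
 * dropable (MSGF_DROPABLE) and reused, so it may only be (re)sent
 * once its MSGF_DONE flag shows the previous round has been replied
 * to; the critical section keeps the flag test and the send atomic
 * on the local CPU.  The same pattern appears in
 * ipfw_crossref_timeo() below.
 */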
6839 static void
6840 ipfw_ip_input_dispatch(netmsg_t nmsg)
6842 struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6843 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6844 struct mbuf *m = nm->m;
6845 struct ip_fw *rule = nm->arg1;
6847 ASSERT_NETISR_NCPUS(mycpuid);
6848 KASSERT(rule->cpuid == mycpuid,
6849 ("rule does not belong to cpu%d", mycpuid));
6850 KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6851 ("mbuf does not have ipfw continue rule"));
6853 KASSERT(ctx->ipfw_cont_rule == NULL,
6854 ("pending ipfw continue rule"));
6855 ctx->ipfw_cont_rule = rule;
6856 ip_input(m);
6858 /* May not have been cleared if ipfw was unloaded or disabled. */
6859 ctx->ipfw_cont_rule = NULL;
6862 * This rule is no longer used; decrement its cross_refs
6863 * so that it can be deleted.
6865 rule->cross_refs--;
6868 static void
6869 ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
6871 struct netmsg_genpkt *nm;
6873 KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));
6876 * NOTE:
6877 * Bump cross_refs to prevent this rule and its siblings
6878 * from being deleted while this mbuf is in flight.  The
6879 * cross_refs of the sibling rule on the target cpu will
6880 * be decremented once this mbuf has been filtered on the
6881 * target cpu.
6883 rule->cross_refs++;
6884 m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6886 nm = &m->m_hdr.mh_genmsg;
6887 netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6888 ipfw_ip_input_dispatch);
6889 nm->m = m;
6890 nm->arg1 = rule->cross_rules[cpuid];
6891 netisr_sendmsg(&nm->base, cpuid);
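/*
 * Note that the netmsg used for the redispatch is embedded in the
 * mbuf header itself (m->m_hdr.mh_genmsg), so no allocation can
 * fail here; the mbuf carries its own forwarding message to the
 * target CPU's netisr.
 */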
6894 static void
6895 ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
6898 args->flags = 0;
6899 args->rule = NULL;
6900 args->xlat = NULL;
6902 if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6903 struct m_tag *mtag;
6905 /* Extract info from dummynet tag */
6906 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6907 KKASSERT(mtag != NULL);
6908 args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6909 KKASSERT(args->rule != NULL);
6911 m_tag_delete(m, mtag);
6912 m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6913 } else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6914 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6916 KKASSERT(ctx->ipfw_cont_rule != NULL);
6917 args->rule = ctx->ipfw_cont_rule;
6918 ctx->ipfw_cont_rule = NULL;
6920 if (ctx->ipfw_cont_xlat != NULL) {
6921 args->xlat = ctx->ipfw_cont_xlat;
6922 ctx->ipfw_cont_xlat = NULL;
6923 if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
6924 args->flags |= IP_FWARG_F_XLATINS;
6925 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
6927 if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
6928 args->flags |= IP_FWARG_F_XLATFWD;
6929 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
6932 KKASSERT((m->m_pkthdr.fw_flags &
6933 (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);
6935 args->flags |= IP_FWARG_F_CONT;
6936 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6939 args->eh = NULL;
6940 args->oif = oif;
6941 args->m = m;
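/*
 * Summary of the three ways a packet can reach ipfw_chk() through
 * ipfw_init_args(): a fresh packet (no tag, args->rule == NULL,
 * filtering starts at the head of the chain); a dummynet
 * reinjection (the dummynet tag carries the rule to resume after);
 * or a cross-CPU continuation (the rule and any xlat were parked
 * in the per-CPU context by ipfw_ip_input_dispatch() above).
 */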
6944 static int
6945 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6947 struct ip_fw_args args;
6948 struct mbuf *m = *m0;
6949 int tee = 0, error = 0, ret;
6951 ipfw_init_args(&args, m, NULL);
6953 ret = ipfw_chk(&args);
6954 m = args.m;
6955 if (m == NULL) {
6956 if (ret != IP_FW_REDISPATCH)
6957 error = EACCES;
6958 goto back;
6961 switch (ret) {
6962 case IP_FW_PASS:
6963 break;
6965 case IP_FW_DENY:
6966 m_freem(m);
6967 m = NULL;
6968 error = EACCES;
6969 break;
6971 case IP_FW_DUMMYNET:
6972 /* Send packet to the appropriate pipe */
6973 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
6974 break;
6976 case IP_FW_TEE:
6977 tee = 1;
6978 /* FALL THROUGH */
6980 case IP_FW_DIVERT:
6982 * Must clear the bridge tag when handing the packet to divert(4).
6984 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
6985 if (ip_divert_p != NULL) {
6986 m = ip_divert_p(m, tee, 1);
6987 } else {
6988 m_freem(m);
6989 m = NULL;
6990 /* not sure this is the right error msg */
6991 error = EACCES;
6993 break;
6995 default:
6996 panic("unknown ipfw return value: %d", ret);
6998 back:
6999 *m0 = m;
7000 return error;
7003 static int
7004 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
7006 struct ip_fw_args args;
7007 struct mbuf *m = *m0;
7008 int tee = 0, error = 0, ret;
7010 ipfw_init_args(&args, m, ifp);
7012 ret = ipfw_chk(&args);
7013 m = args.m;
7014 if (m == NULL) {
7015 if (ret != IP_FW_REDISPATCH)
7016 error = EACCES;
7017 goto back;
7020 switch (ret) {
7021 case IP_FW_PASS:
7022 break;
7024 case IP_FW_DENY:
7025 m_freem(m);
7026 m = NULL;
7027 error = EACCES;
7028 break;
7030 case IP_FW_DUMMYNET:
7031 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
7032 break;
7034 case IP_FW_TEE:
7035 tee = 1;
7036 /* FALL THROUGH */
7038 case IP_FW_DIVERT:
7039 if (ip_divert_p != NULL) {
7040 m = ip_divert_p(m, tee, 0);
7041 } else {
7042 m_freem(m);
7043 m = NULL;
7044 /* not sure this is the right error msg */
7045 error = EACCES;
7047 break;
7049 default:
7050 panic("unknown ipfw return value: %d", ret);
7052 back:
7053 *m0 = m;
7054 return error;
7057 static void
7058 ipfw_hook(void)
7060 struct pfil_head *pfh;
7062 ASSERT_NETISR0;
7064 pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7065 if (pfh == NULL)
7066 return;
7068 pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7069 pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7072 static void
7073 ipfw_dehook(void)
7075 struct pfil_head *pfh;
7077 ASSERT_NETISR0;
7079 pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7080 if (pfh == NULL)
7081 return;
7083 pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7084 pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7087 static int
7088 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
7090 int dyn_cnt;
7092 dyn_cnt = ipfw_state_cntcoll();
7093 dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
7095 return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
7098 static int
7099 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
7101 int state_cnt;
7103 state_cnt = ipfw_state_cntcoll();
7104 return (sysctl_handle_int(oidp, &state_cnt, 0, req));
7107 static int
7108 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
7110 int state_max, error;
7112 state_max = ipfw_state_max;
7113 error = sysctl_handle_int(oidp, &state_max, 0, req);
7114 if (error || req->newptr == NULL)
7115 return (error);
7117 if (state_max < 1)
7118 return (EINVAL);
7120 ipfw_state_max_set(state_max);
7121 return (0);
7124 static int
7125 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
7127 int dyn_max, error;
7129 dyn_max = ipfw_state_max + ipfw_track_max;
7131 error = sysctl_handle_int(oidp, &dyn_max, 0, req);
7132 if (error || req->newptr == NULL)
7133 return (error);
7135 if (dyn_max < 2)
7136 return (EINVAL);
7138 ipfw_state_max_set(dyn_max / 2);
7139 ipfw_track_max = dyn_max / 2;
7140 return (0);
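/*
 * The legacy dyn_max knob is presented as the sum of the state and
 * track limits, and a write splits the new value evenly between the
 * two.  For example, under the assumed sysctl name, something like
 *
 *	sysctl net.inet.ip.fw.dyn_max=8192
 *
 * would set ipfw_state_max and ipfw_track_max to 4096 each.
 */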
7143 static void
7144 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
7146 int enable = nmsg->lmsg.u.ms_result;
7148 ASSERT_NETISR0;
7150 if (fw_enable == enable)
7151 goto reply;
7153 fw_enable = enable;
7154 if (fw_enable)
7155 ipfw_hook();
7156 else
7157 ipfw_dehook();
7158 reply:
7159 netisr_replymsg(&nmsg->base, 0);
7162 static int
7163 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
7165 struct netmsg_base nmsg;
7166 int enable, error;
7168 enable = fw_enable;
7169 error = sysctl_handle_int(oidp, &enable, 0, req);
7170 if (error || req->newptr == NULL)
7171 return error;
7173 netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7174 ipfw_sysctl_enable_dispatch);
7175 nmsg.lmsg.u.ms_result = enable;
7177 return netisr_domsg(&nmsg, 0);
7180 static int
7181 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
7183 return sysctl_int_range(oidp, arg1, arg2, req,
7184 IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
7187 static int
7188 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
7191 return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
7194 static int
7195 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
7197 u_long stat = 0;
7198 int cpu, error;
7200 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7201 stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
7203 error = sysctl_handle_long(oidp, &stat, 0, req);
7204 if (error || req->newptr == NULL)
7205 return (error);
7207 /* Zero out this stat. */
7208 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7209 *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
7210 return (0);
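/*
 * A minimal sketch of how a per-CPU counter would be wired to this
 * handler, with the byte offset of the field passed through arg2
 * (the OID name and field here are assumptions; the real
 * declarations appear near the top of this file):
 */
#if 0
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, frags,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_frags),
    ipfw_sysctl_stat, "LU", "Number of IP fragments");
#endif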
7213 static void
7214 ipfw_ctx_init_dispatch(netmsg_t nmsg)
7216 struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
7217 struct ipfw_context *ctx;
7218 struct ip_fw *def_rule;
7220 ASSERT_NETISR_NCPUS(mycpuid);
7222 ctx = kmalloc(__offsetof(struct ipfw_context,
7223 ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
7225 RB_INIT(&ctx->ipfw_state_tree);
7226 TAILQ_INIT(&ctx->ipfw_state_list);
7228 RB_INIT(&ctx->ipfw_track_tree);
7229 TAILQ_INIT(&ctx->ipfw_track_list);
7231 callout_init_mp(&ctx->ipfw_stateto_ch);
7232 netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
7233 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
7234 ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
7235 netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
7236 MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
7238 callout_init_mp(&ctx->ipfw_trackto_ch);
7239 netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
7240 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
7241 netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
7242 MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
7244 callout_init_mp(&ctx->ipfw_keepalive_ch);
7245 netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
7246 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
7247 ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
7248 netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
7249 MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
7251 callout_init_mp(&ctx->ipfw_xlatreap_ch);
7252 netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
7253 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
7254 TAILQ_INIT(&ctx->ipfw_xlatreap);
7256 ipfw_ctx[mycpuid] = ctx;
7258 def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
7260 def_rule->act_ofs = 0;
7261 def_rule->rulenum = IPFW_DEFAULT_RULE;
7262 def_rule->cmd_len = 1;
7263 def_rule->set = IPFW_DEFAULT_SET;
7265 def_rule->cmd[0].len = 1;
7266 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
7267 def_rule->cmd[0].opcode = O_ACCEPT;
7268 #else
7269 if (filters_default_to_accept)
7270 def_rule->cmd[0].opcode = O_ACCEPT;
7271 else
7272 def_rule->cmd[0].opcode = O_DENY;
7273 #endif
7275 def_rule->refcnt = 1;
7276 def_rule->cpuid = mycpuid;
7278 /* Install the default rule */
7279 ctx->ipfw_default_rule = def_rule;
7280 ctx->ipfw_layer3_chain = def_rule;
7282 /* Link this CPU's default rule into the per-CPU sibling chain */
7283 ipfw_link_sibling(fwmsg, def_rule);
7285 /* Statistics only need to be updated once */
7286 if (mycpuid == 0)
7287 ipfw_inc_static_count(def_rule);
7289 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
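/*
 * The context allocation above sizes the structure and its trailing
 * table array in one shot:
 *
 *	__offsetof(struct ipfw_context, ipfw_tables[ipfw_table_max])
 *
 * is the offset of the (ipfw_table_max)-th element of the trailing
 * array, i.e. the fixed part of the structure plus ipfw_table_max
 * radix-tree head pointers.
 */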
7292 static void
7293 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
7296 crit_enter();
7297 /* Reply ASAP */
7298 netisr_replymsg(&nmsg->base, 0);
7299 crit_exit();
7300 ipfw_crossref_reap();
7303 static void
7304 ipfw_crossref_timeo(void *dummy __unused)
7306 struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
7308 KKASSERT(mycpuid == 0);
7310 crit_enter();
7311 if (msg->lmsg.ms_flags & MSGF_DONE)
7312 netisr_sendmsg_oncpu(msg);
7313 crit_exit();
7316 static void
7317 ipfw_ifaddr_dispatch(netmsg_t nmsg)
7319 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7320 struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
7321 struct ip_fw *f;
7323 ASSERT_NETISR_NCPUS(mycpuid);
7325 for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
7326 int l, cmdlen;
7327 ipfw_insn *cmd;
7329 if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
7330 continue;
7332 for (l = f->cmd_len, cmd = f->cmd; l > 0;
7333 l -= cmdlen, cmd += cmdlen) {
7334 cmdlen = F_LEN(cmd);
7335 if (cmd->opcode == O_IP_SRC_IFIP ||
7336 cmd->opcode == O_IP_DST_IFIP) {
7337 if (strncmp(ifp->if_xname,
7338 ((ipfw_insn_ifip *)cmd)->ifname,
7339 IFNAMSIZ) == 0)
7340 cmd->arg1 &= ~IPFW_IFIP_VALID;
7344 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7347 static void
7348 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
7349 enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
7351 struct netmsg_base nm;
7353 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7354 ipfw_ifaddr_dispatch);
7355 nm.lmsg.u.ms_resultp = ifp;
7356 netisr_domsg_global(&nm);
7359 static void
7360 ipfw_init_dispatch(netmsg_t nmsg)
7362 struct netmsg_ipfw fwmsg;
7363 int error = 0, cpu;
7365 ASSERT_NETISR0;
7367 if (IPFW_LOADED) {
7368 kprintf("IP firewall already loaded\n");
7369 error = EEXIST;
7370 goto reply;
7373 if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
7374 ipfw_table_max = UINT16_MAX;
7376 /* Initialize global track tree. */
7377 RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
7378 IPFW_TRKCNT_TOKINIT;
7380 /* GC for freed crossref rules. */
7381 callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
7382 netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
7383 MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);
7385 ipfw_state_max_set(ipfw_state_max);
7386 ipfw_state_headroom = 8 * netisr_ncpus;
7388 bzero(&fwmsg, sizeof(fwmsg));
7389 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7390 ipfw_ctx_init_dispatch);
7391 netisr_domsg_global(&fwmsg.base);
7393 ip_fw_chk_ptr = ipfw_chk;
7394 ip_fw_ctl_ptr = ipfw_ctl;
7395 ip_fw_dn_io_ptr = ipfw_dummynet_io;
7397 kprintf("ipfw2 initialized, default to %s, logging ",
7398 ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
7399 O_ACCEPT ? "accept" : "deny");
7401 #ifdef IPFIREWALL_VERBOSE
7402 fw_verbose = 1;
7403 #endif
7404 #ifdef IPFIREWALL_VERBOSE_LIMIT
7405 verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
7406 #endif
7407 if (fw_verbose == 0) {
7408 kprintf("disabled\n");
7409 } else if (verbose_limit == 0) {
7410 kprintf("unlimited\n");
7411 } else {
7412 kprintf("limited to %d packets/entry by default\n",
7413 verbose_limit);
7416 ip_fw_loaded = 1;
7417 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
7418 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
7419 ipfw_state_expire_ipifunc, NULL, cpu);
7420 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
7421 ipfw_track_expire_ipifunc, NULL, cpu);
7422 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
7423 ipfw_keepalive, NULL, cpu);
7426 if (fw_enable)
7427 ipfw_hook();
7429 ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
7430 NULL, EVENTHANDLER_PRI_ANY);
7431 if (ipfw_ifaddr_event == NULL)
7432 kprintf("ipfw: ifaddr_event register failed\n");
7434 reply:
7435 netisr_replymsg(&nmsg->base, error);
7438 static int
7439 ipfw_init(void)
7441 struct netmsg_base smsg;
7443 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7444 ipfw_init_dispatch);
7445 return netisr_domsg(&smsg, 0);
7448 #ifdef KLD_MODULE
7450 static void
7451 ipfw_ctx_fini_dispatch(netmsg_t nmsg)
7453 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7455 ASSERT_NETISR_NCPUS(mycpuid);
7457 callout_cancel(&ctx->ipfw_stateto_ch);
7458 callout_cancel(&ctx->ipfw_trackto_ch);
7459 callout_cancel(&ctx->ipfw_keepalive_ch);
7460 callout_cancel(&ctx->ipfw_xlatreap_ch);
7462 crit_enter();
7463 netisr_dropmsg(&ctx->ipfw_stateexp_more);
7464 netisr_dropmsg(&ctx->ipfw_stateexp_nm);
7465 netisr_dropmsg(&ctx->ipfw_trackexp_more);
7466 netisr_dropmsg(&ctx->ipfw_trackexp_nm);
7467 netisr_dropmsg(&ctx->ipfw_keepalive_more);
7468 netisr_dropmsg(&ctx->ipfw_keepalive_nm);
7469 netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
7470 crit_exit();
7472 ipfw_table_flushall_oncpu(ctx, 1);
7474 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7477 static void
7478 ipfw_fini_dispatch(netmsg_t nmsg)
7480 struct netmsg_base nm;
7481 int error = 0, cpu;
7483 ASSERT_NETISR0;
7485 ipfw_crossref_reap();
7487 if (ipfw_gd.ipfw_refcnt != 0) {
7488 error = EBUSY;
7489 goto reply;
7492 ip_fw_loaded = 0;
7493 ipfw_dehook();
7495 /* Synchronize any inflight state/track expire IPIs. */
7496 lwkt_synchronize_ipiqs("ipfwfini");
7498 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7499 ipfw_ctx_fini_dispatch);
7500 netisr_domsg_global(&nm);
7502 callout_cancel(&ipfw_gd.ipfw_crossref_ch);
7503 crit_enter();
7504 netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
7505 crit_exit();
7507 if (ipfw_ifaddr_event != NULL)
7508 EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);
7510 ip_fw_chk_ptr = NULL;
7511 ip_fw_ctl_ptr = NULL;
7512 ip_fw_dn_io_ptr = NULL;
7513 ipfw_flush(1 /* kill default rule */);
7515 /* Free per-cpu contexts */
7516 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7517 kfree(ipfw_ctx[cpu], M_IPFW);
7519 kprintf("IP firewall unloaded\n");
7520 reply:
7521 netisr_replymsg(&nmsg->base, error);
7524 static void
7525 ipfw_fflush_dispatch(netmsg_t nmsg)
7528 ipfw_flush(0 /* keep default rule */);
7529 ipfw_crossref_reap();
7530 netisr_replymsg(&nmsg->base, 0);
7533 static int
7534 ipfw_fini(void)
7536 struct netmsg_base smsg;
7537 int i = 0;
7539 for (;;) {
7540 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7541 ipfw_fflush_dispatch);
7542 netisr_domsg(&smsg, 0);
7544 if (ipfw_gd.ipfw_refcnt == 0)
7545 break;
7546 kprintf("ipfw: flush pending %d\n", ++i);
7547 tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
7550 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7551 ipfw_fini_dispatch);
7552 return netisr_domsg(&smsg, 0);
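/*
 * Unload protocol: flush rules and reap crossref garbage repeatedly,
 * sleeping 1.5 seconds between rounds, until every rule reference
 * has drained; only then is the final fini message sent, which can
 * still bail out with EBUSY if new references appeared meanwhile.
 */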
7555 #endif /* KLD_MODULE */
7557 static int
7558 ipfw_modevent(module_t mod, int type, void *unused)
7560 int err = 0;
7562 switch (type) {
7563 case MOD_LOAD:
7564 err = ipfw_init();
7565 break;
7567 case MOD_UNLOAD:
7568 #ifndef KLD_MODULE
7569 kprintf("ipfw statically compiled, cannot unload\n");
7570 err = EBUSY;
7571 #else
7572 err = ipfw_fini();
7573 #endif
7574 break;
7575 default:
7576 break;
7578 return err;
7581 static moduledata_t ipfwmod = {
7582 "ipfw",
7583 ipfw_modevent,
7586 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
7587 MODULE_VERSION(ipfw, 1);