/*
 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
 */

/*
 * Implement IP packet firewall (new version)
 */
#include "opt_ipfw.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/ucred.h>
#include <sys/in_cksum.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/tree.h>

#include <net/if.h>
#include <net/route.h>
#include <net/pfil.h>
#include <net/dummynet/ip_dummynet.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>
#include <net/netmsg2.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/ip_divert.h>
#include <netinet/if_ether.h>	/* XXX for ETHERTYPE_IP */

#include <net/ipfw/ip_fw2.h>

#ifdef IPFIREWALL_DEBUG
#define DPRINTF(fmt, ...)	\
do { \
	if (fw_debug > 0) \
		kprintf(fmt, __VA_ARGS__); \
} while (0)
#else
#define DPRINTF(fmt, ...)	((void)0)
#endif
/*
 * Description about per-CPU rule duplication:
 *
 * Module loading/unloading and all ioctl operations are serialized
 * by netisr0, so we don't have any ordering or locking problems.
 *
 * The following graph shows how operations on the per-CPU rule list
 * are performed [2 CPU case]:
 *
 *    CPU0                 CPU1
 *
 *    netisr0 <------------------------------------+
 *     domsg                                       |
 *        :                                        |
 *        :(delete/add...)                         |
 *        :                                        |
 *        :         netmsg                         | netmsg
 *    forwardmsg---------->netisr1                 |
 *                              :                  |
 *                              :(delete/add...)   |
 *                              :                  |
 *                              :                  |
 *                          replymsg---------------+
 *
 *
 * Rule structure [2 CPU case]
 *
 *    CPU0               CPU1
 *
 *    layer3_chain       layer3_chain
 *        |                  |
 *        V                  V
 *    +-------+ sibling  +-------+ sibling
 *    | rule1 |--------->| rule1 |--------->NULL
 *    +-------+          +-------+
 *        |                  |
 *        |next              |next
 *        V                  V
 *    +-------+ sibling  +-------+ sibling
 *    | rule2 |--------->| rule2 |--------->NULL
 *    +-------+          +-------+
 *
 * ip_fw.sibling:
 * 1) Ease statistics calculation during IP_FW_GET.  We only need to
 *    iterate layer3_chain in netisr0; the current rule's duplicates
 *    on the other CPUs can safely be accessed read-only through
 *    ip_fw.sibling.
 * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
 *    a) In netisr0 rule3 is determined to be inserted between rule1
 *       and rule2.  To make this decision we need to iterate the
 *       layer3_chain in netisr0.  The netmsg, which is used to insert
 *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
 *       in netisr0 as next_rule.
 *    b) After the insertion in netisr0 is done, we will move on to
 *       netisr1.  But instead of relocating rule3's position in
 *       netisr1 by iterating the layer3_chain in netisr1, we set the
 *       netmsg's prev_rule to rule1->sibling and next_rule to
 *       rule2->sibling before the netmsg is forwarded to netisr1 from
 *       netisr0.
 */
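/*
 * Illustrative sketch (not part of the build; dispatch function name is
 * assumed for illustration) of how the sibling pointers let an insertion
 * netmsg hop from one netisr to the next without re-iterating each cpu's
 * layer3_chain:
 *
 *	static void
 *	example_insert_dispatch(netmsg_t nmsg)
 *	{
 *		struct netmsg_ipfw *msg = (struct netmsg_ipfw *)nmsg;
 *
 *		... link the new rule between msg->prev_rule and
 *		    msg->next_rule on this cpu ...
 *
 *		if (mycpuid + 1 < netisr_ncpus) {
 *			// Retarget to the next cpu's duplicates first.
 *			msg->prev_rule = msg->prev_rule->sibling;
 *			msg->next_rule = msg->next_rule->sibling;
 *			netisr_forwardmsg(&msg->base, mycpuid + 1);
 *		} else {
 *			netisr_replymsg(&msg->base, 0);
 *		}
 *	}
 */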
/*
 * Description of states and tracks.
 *
 * Both states and tracks are stored in per-cpu RB trees instead of
 * per-cpu hash tables to avoid the worst case hash degeneration.
 *
 * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
 * measured in seconds and depending on the flags.
 *
 * When a packet is received, its address fields are first masked with
 * the mask defined for the rule, then matched against the entries in
 * the per-cpu state RB tree.  States are generated by the 'keep-state'
 * and 'limit' options.
 *
 * The max number of states is ipfw_state_max.  When we reach the
 * maximum number of states we do not create any more.  This is done to
 * avoid consuming too much memory, but also too much time when
 * searching on each packet.
 *
 * Each state holds a pointer to the parent ipfw rule of the current
 * CPU so we know what action to perform.  States are removed when the
 * parent rule is deleted.  XXX we should make them survive.
 *
 * There are some limitations with states -- we do not obey the
 * 'randomized match', and we do not do multiple passes through the
 * firewall.  XXX check the latter!!!
 *
 * States grow independently on each CPU, e.g. 2 CPU case:
 *
 *        CPU0                     CPU1
 * ...................      ...................
 * :  state RB tree  :      :  state RB tree  :
 * :                 :      :                 :
 * : state1   state2 :      :      state3     :
 * :     |    |      :      :        |        :
 * :.....|....|......:      :........|........:
 *       |    |                      |
 *       |    |                      |st_rule
 *       |    |                      |
 *       V    V                      V
 *   +-------+                   +-------+
 *   | rule1 |                   | rule1 |
 *   +-------+                   +-------+
 *
 * Tracks are used to enforce limits on the number of sessions.  Tracks
 * are generated by the 'limit' option.
 *
 * The max number of tracks is ipfw_track_max.  When we reach the
 * maximum number of tracks we do not create any more.  This is done to
 * avoid consuming too much memory.
 *
 * Tracks are organized into two layers: the track counter RB tree is
 * shared between CPUs, while the track RB tree is per-cpu.  States
 * generated by the 'limit' option are linked to the track in addition
 * to the per-cpu state RB tree, mainly to ease expiration.  e.g. 2 CPU
 * case:
 *
 *             ..............................
 *             :   track counter RB tree    :
 *             :                            :
 *             :        +-----------+       :
 *             :        |  trkcnt1  |       :
 *             :        |           |       :
 *             :      +--->counter<----+    :
 *             :      | |           |  |    :
 *             :      | +-----------+  |    :
 *             :......|................|....:
 *                    |                |
 *     CPU0           |                |          CPU1
 * .................  |t_count         |  .................
 * : track RB tree :  |                |  : track RB tree :
 * :               :  |                |  :               :
 * : +-->track1-------+                +--------track2    :
 * : |     A       :                      :               :
 * : |     |       :                      :               :
 * :.|.....|.......:                      :...............:
 *   |     +----------------+
 *   |  ....................|...........
 *   |  : state RB tree     |st_track  :
 *   |  :                   |          :
 *   +---state1          state2---------+
 *      :   |               |          :
 *      :...|...............|..........:
 *          |               |
 *          |               |st_rule
 *          V               V
 *        +----------+
 *        |  rule1   |
 *        +----------+
 */
#define IPFW_AUTOINC_STEP_MIN	1
#define IPFW_AUTOINC_STEP_MAX	1000
#define IPFW_AUTOINC_STEP_DEF	100

#define IPFW_TABLE_MAX_DEF	64

#define IPFW_DEFAULT_RULE	65535	/* rulenum for the default rule */
#define IPFW_DEFAULT_SET	31	/* set number for the default rule */

#define MATCH_REVERSE		0
#define MATCH_FORWARD		1
#define MATCH_NONE		2
#define MATCH_UNKNOWN		3

#define TIME_LEQ(a, b)		((a) - (b) <= 0)
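/*
 * Illustrative note (not part of the build): TIME_LEQ() compares two
 * time_t stamps via signed subtraction, so a check such as
 *
 *	if (TIME_LEQ(s->st_expire, time_uptime))
 *		... the state has expired ...
 *
 * stays correct even if the counter were to wrap, as long as the two
 * stamps are less than half the counter range apart.
 */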
#define IPFW_STATE_TCPFLAGS	(TH_SYN | TH_FIN | TH_RST)
#define IPFW_STATE_TCPSTATES	(IPFW_STATE_TCPFLAGS |	\
				 (IPFW_STATE_TCPFLAGS << 8))

#define BOTH_SYN		(TH_SYN | (TH_SYN << 8))
#define BOTH_FIN		(TH_FIN | (TH_FIN << 8))
#define BOTH_RST		(TH_RST | (TH_RST << 8))
/* TH_ACK here means FIN was ACKed. */
#define BOTH_FINACK		(TH_ACK | (TH_ACK << 8))

#define IPFW_STATE_TCPCLOSED(s)	((s)->st_proto == IPPROTO_TCP &&	\
				 (((s)->st_state & BOTH_RST) ||		\
				  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
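/*
 * Illustrative note (not part of the build): st_state accumulates the
 * TCP flags seen in the forward direction in its low byte and the
 * flags seen in the reverse direction shifted left by 8, e.g.:
 *
 *	st_state |= TH_SYN;		(SYN seen forward)
 *	st_state |= TH_SYN << 8;	(SYN seen reverse)
 *	st_state |= TH_FIN;		(FIN seen forward)
 *
 * The BOTH_* masks test a flag in both directions at once, and
 * IPFW_STATE_TCPCLOSED() is true once either side has sent RST, or
 * both FINs have been ACKed (the BOTH_FINACK bits, which
 * ipfw_state_update_tcp() sets when it sees the matching ACKs).
 */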
#define O_ANCHOR		O_NOP

#define IPFW_ISXLAT(type)	((type) == O_REDIRECT)
#define IPFW_XLAT_INVALID(s)	(IPFW_ISXLAT((s)->st_type) &&	\
				 ((struct ipfw_xlat *)(s))->xlat_invalid)

#define IPFW_MBUF_XLATINS	FW_MBUF_PRIVATE1
#define IPFW_MBUF_XLATFWD	FW_MBUF_PRIVATE2

#define IPFW_XLATE_INSERT	0x0001
#define IPFW_XLATE_FORWARD	0x0002
#define IPFW_XLATE_OUTPUT	0x0004

struct netmsg_ipfw {
	struct netmsg_base	base;
	const struct ipfw_ioc_rule *ioc_rule;
	struct ip_fw		*next_rule;
	struct ip_fw		*prev_rule;
	struct ip_fw		*sibling;
	uint32_t		rule_flags;
	struct ip_fw		**cross_rules;
};

struct netmsg_del {
	struct netmsg_base	base;
	struct ip_fw		*start_rule;
	struct ip_fw		*prev_rule;
	uint16_t		rulenum;
	uint8_t			from_set;
	uint8_t			to_set;
};

struct netmsg_zent {
	struct netmsg_base	base;
	struct ip_fw		*start_rule;
	uint16_t		rulenum;
	uint16_t		log_only;
};

struct netmsg_cpstate {
	struct netmsg_base	base;
	struct ipfw_ioc_state	*ioc_state;
	int			state_cntmax;
	int			state_cnt;
};

struct netmsg_tblent {
	struct netmsg_base	base;
	struct sockaddr		*key;
	struct sockaddr		*netmask;
	struct ipfw_tblent	*sibling;
	int			tableid;
};

struct netmsg_tblflush {
	struct netmsg_base	base;
	int			tableid;
	int			destroy;
};

struct netmsg_tblexp {
	struct netmsg_base	base;
	time_t			expire;
	int			tableid;
	int			cnt;
	int			expcnt;
	struct radix_node_head	*rnh;
};

struct ipfw_table_cp {
	struct ipfw_ioc_tblent	*te;
	int			te_idx;
	int			te_cnt;
};

struct ip_fw_local {
	/*
	 * offset	The offset of a fragment.  offset != 0 means that
	 *		we have a fragment at this offset of an IPv4 packet.
	 *		offset == 0 means that (if this is an IPv4 packet)
	 *		this is the first or only fragment.
	 */
	u_short			offset;

	/*
	 * Local copies of addresses.  They are only valid if we have
	 * an IP packet.
	 *
	 * proto	The protocol.  Set to 0 for non-ip packets,
	 *		or to the protocol read from the packet otherwise.
	 *		proto != 0 means that we have an IPv4 packet.
	 *
	 * src_port, dst_port	port numbers, in HOST format.  Only
	 *		valid for TCP and UDP packets.
	 *
	 * src_ip, dst_ip	ip addresses, in NETWORK format.
	 *		Only valid for IPv4 packets.
	 */
	uint8_t			proto;
	uint16_t		src_port;	/* NOTE: host format */
	uint16_t		dst_port;	/* NOTE: host format */
	struct in_addr		src_ip;		/* NOTE: network format */
	struct in_addr		dst_ip;		/* NOTE: network format */
	uint16_t		ip_len;
	struct tcphdr		*tcp;
};

struct ipfw_addrs {
	uint32_t		addr1;	/* host byte order */
	uint32_t		addr2;	/* host byte order */
};

struct ipfw_ports {
	uint16_t		port1;	/* host byte order */
	uint16_t		port2;	/* host byte order */
};

struct ipfw_key {
	union {
		struct ipfw_addrs addrs;
		uint64_t	value;
	} addr_u;
	union {
		struct ipfw_ports ports;
		uint32_t	value;
	} port_u;
	uint8_t			proto;
	uint8_t			swap;	/* IPFW_KEY_SWAP_ */
	uint16_t		rsvd2;
};

#define IPFW_KEY_SWAP_ADDRS	0x1
#define IPFW_KEY_SWAP_PORTS	0x2
#define IPFW_KEY_SWAP_ALL	(IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
struct ipfw_trkcnt {
	RB_ENTRY(ipfw_trkcnt)	tc_rblink;
	struct ipfw_key		tc_key;
	uintptr_t		tc_ruleid;
	int			tc_refs;
	int			tc_count;
	time_t			tc_expire;	/* userland get-only */
	uint16_t		tc_rulenum;	/* userland get-only */
} __cachealign;

#define tc_addrs	tc_key.addr_u.value
#define tc_ports	tc_key.port_u.value
#define tc_proto	tc_key.proto
#define tc_saddr	tc_key.addr_u.addrs.addr1
#define tc_daddr	tc_key.addr_u.addrs.addr2
#define tc_sport	tc_key.port_u.ports.port1
#define tc_dport	tc_key.port_u.ports.port2

RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);

struct ipfw_state;

struct ipfw_track {
	RB_ENTRY(ipfw_track)	t_rblink;
	struct ipfw_key		t_key;
	struct ip_fw		*t_rule;
	time_t			t_lastexp;
	LIST_HEAD(, ipfw_state)	t_state_list;
	time_t			t_expire;
	volatile int		*t_count;
	struct ipfw_trkcnt	*t_trkcnt;
	TAILQ_ENTRY(ipfw_track)	t_link;
};

#define t_addrs		t_key.addr_u.value
#define t_ports		t_key.port_u.value
#define t_proto		t_key.proto
#define t_saddr		t_key.addr_u.addrs.addr1
#define t_daddr		t_key.addr_u.addrs.addr2
#define t_sport		t_key.port_u.ports.port1
#define t_dport		t_key.port_u.ports.port2

RB_HEAD(ipfw_track_tree, ipfw_track);
TAILQ_HEAD(ipfw_track_list, ipfw_track);

struct ipfw_state {
	RB_ENTRY(ipfw_state)	st_rblink;
	struct ipfw_key		st_key;

	time_t			st_expire;	/* expire time */
	struct ip_fw		*st_rule;

	uint64_t		st_pcnt;	/* packets */
	uint64_t		st_bcnt;	/* bytes */

	/*
	 * st_state:
	 * State of this rule, typically a combination of TCP flags.
	 *
	 * st_ack_fwd/st_ack_rev:
	 * Most recent ACKs in forward and reverse direction.  They
	 * are used to generate keepalives.
	 */
	uint32_t		st_state;
	uint32_t		st_ack_fwd;	/* host byte order */
	uint32_t		st_seq_fwd;	/* host byte order */
	uint32_t		st_ack_rev;	/* host byte order */
	uint32_t		st_seq_rev;	/* host byte order */

	uint16_t		st_flags;	/* IPFW_STATE_F_ */
	uint16_t		st_type;	/* KEEP_STATE/LIMIT/RDR */
	struct ipfw_track	*st_track;

	LIST_ENTRY(ipfw_state)	st_trklink;
	TAILQ_ENTRY(ipfw_state)	st_link;
};

#define st_addrs	st_key.addr_u.value
#define st_ports	st_key.port_u.value
#define st_proto	st_key.proto
#define st_swap		st_key.swap

#define IPFW_STATE_F_ACKFWD	0x0001
#define IPFW_STATE_F_SEQFWD	0x0002
#define IPFW_STATE_F_ACKREV	0x0004
#define IPFW_STATE_F_SEQREV	0x0008
#define IPFW_STATE_F_XLATSRC	0x0010
#define IPFW_STATE_F_XLATSLAVE	0x0020
#define IPFW_STATE_F_LINKED	0x0040

#define IPFW_STATE_SCANSKIP(s)	((s)->st_type == O_ANCHOR ||	\
				 ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))

/* Expired or being deleted. */
#define IPFW_STATE_ISDEAD(s)	(TIME_LEQ((s)->st_expire, time_uptime) ||  \
				 IPFW_XLAT_INVALID((s)))

TAILQ_HEAD(ipfw_state_list, ipfw_state);
RB_HEAD(ipfw_state_tree, ipfw_state);
struct ipfw_xlat {
	struct ipfw_state	xlat_st;	/* MUST be the first field */
	uint32_t		xlat_addr;	/* network byte order */
	uint16_t		xlat_port;	/* network byte order */
	uint16_t		xlat_dir;	/* MATCH_ */
	struct ifnet		*xlat_ifp;	/* matching ifnet */
	struct ipfw_xlat	*xlat_pair;	/* paired state */
	int			xlat_pcpu;	/* paired cpu */
	volatile int		xlat_invalid;	/* invalid, but not dtor yet */
	volatile uint64_t	xlat_crefs;	/* cross references */
	struct netmsg_base	xlat_freenm;	/* for remote free */
};

#define xlat_type	xlat_st.st_type
#define xlat_flags	xlat_st.st_flags
#define xlat_rule	xlat_st.st_rule
#define xlat_bcnt	xlat_st.st_bcnt
#define xlat_pcnt	xlat_st.st_pcnt

struct ipfw_tblent {
	struct radix_node	te_nodes[2];
	struct sockaddr_in	te_key;
	u_long			te_use;
	time_t			te_lastuse;
	struct ipfw_tblent	*te_sibling;
	volatile int		te_expired;
};

struct ipfw_context {
	struct ip_fw		*ipfw_layer3_chain;	/* rules for layer3 */
	struct ip_fw		*ipfw_default_rule;	/* default rule */
	uint64_t		ipfw_norule_counter;	/* ipfw_log(NULL) stat */

	/*
	 * ipfw_set_disable contains one bit per set value (0..31).
	 * If the bit is set, all rules with the corresponding set
	 * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
	 * default rule and CANNOT be disabled.
	 */
	uint32_t		ipfw_set_disable;

	uint8_t			ipfw_flags;	/* IPFW_FLAG_ */

	struct ip_fw		*ipfw_cont_rule;
	struct ipfw_xlat	*ipfw_cont_xlat;

	struct ipfw_state_tree	ipfw_state_tree;
	struct ipfw_state_list	ipfw_state_list;
	int			ipfw_state_loosecnt;
	int			ipfw_state_cnt;

	union {
		struct ipfw_state state;
		struct ipfw_track track;
		struct ipfw_trkcnt trkcnt;
	} ipfw_tmpkey;

	struct ipfw_track_tree	ipfw_track_tree;
	struct ipfw_track_list	ipfw_track_list;
	struct ipfw_trkcnt	*ipfw_trkcnt_spare;

	struct callout		ipfw_stateto_ch;
	time_t			ipfw_state_lastexp;
	struct netmsg_base	ipfw_stateexp_nm;
	struct netmsg_base	ipfw_stateexp_more;
	struct ipfw_state	ipfw_stateexp_anch;

	struct callout		ipfw_trackto_ch;
	time_t			ipfw_track_lastexp;
	struct netmsg_base	ipfw_trackexp_nm;
	struct netmsg_base	ipfw_trackexp_more;
	struct ipfw_track	ipfw_trackexp_anch;

	struct callout		ipfw_keepalive_ch;
	struct netmsg_base	ipfw_keepalive_nm;
	struct netmsg_base	ipfw_keepalive_more;
	struct ipfw_state	ipfw_keepalive_anch;

	struct callout		ipfw_xlatreap_ch;
	struct netmsg_base	ipfw_xlatreap_nm;
	struct ipfw_state_list	ipfw_xlatreap;

	/*
	 * Statistics
	 */
	u_long			ipfw_sts_reap;
	u_long			ipfw_sts_reapfailed;
	u_long			ipfw_sts_overflow;
	u_long			ipfw_sts_nomem;
	u_long			ipfw_sts_tcprecycled;

	u_long			ipfw_tks_nomem;
	u_long			ipfw_tks_reap;
	u_long			ipfw_tks_reapfailed;
	u_long			ipfw_tks_overflow;
	u_long			ipfw_tks_cntnomem;

	u_long			ipfw_frags;
	u_long			ipfw_defraged;
	u_long			ipfw_defrag_remote;

	u_long			ipfw_xlated;
	u_long			ipfw_xlate_split;
	u_long			ipfw_xlate_conflicts;
	u_long			ipfw_xlate_cresolved;

	/* Last field */
	struct radix_node_head	*ipfw_tables[];
};

#define IPFW_FLAG_KEEPALIVE	0x01
#define IPFW_FLAG_STATEEXP	0x02
#define IPFW_FLAG_TRACKEXP	0x04
#define IPFW_FLAG_STATEREAP	0x08
#define IPFW_FLAG_TRACKREAP	0x10

#define ipfw_state_tmpkey	ipfw_tmpkey.state
#define ipfw_track_tmpkey	ipfw_tmpkey.track
#define ipfw_trkcnt_tmpkey	ipfw_tmpkey.trkcnt

struct ipfw_global {
	int			ipfw_state_loosecnt;	/* cache aligned */
	time_t			ipfw_state_globexp __cachealign;

	struct lwkt_token	ipfw_trkcnt_token __cachealign;
	struct ipfw_trkcnt_tree	ipfw_trkcnt_tree;
	int			ipfw_trkcnt_cnt;
	time_t			ipfw_track_globexp;

	/* Accessed in netisr0. */
	struct ip_fw		*ipfw_crossref_free __cachealign;
	struct callout		ipfw_crossref_ch;
	struct netmsg_base	ipfw_crossref_nm;

#ifdef KLD_MODULE
	/*
	 * Module can not be unloaded, if there are references to
	 * certain rules of ipfw(4), e.g. dummynet(4)
	 */
	int			ipfw_refcnt __cachealign;
#endif
} __cachealign;
static struct ipfw_context	*ipfw_ctx[MAXCPU];

MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chains");

/*
 * The following two global variables are accessed and updated only
 * in netisr0.
 */
static uint32_t	static_count;	/* # of static rules */
static uint32_t	static_ioc_len;	/* bytes of static rules */

/*
 * If 1, then ipfw static rules are being flushed,
 * ipfw_chk() will skip to the default rule.
 */
static int	ipfw_flushing;

static int	fw_verbose;
static int	verbose_limit;

static int	fw_debug;
static int	autoinc_step = IPFW_AUTOINC_STEP_DEF;

static int	ipfw_table_max = IPFW_TABLE_MAX_DEF;

static int	ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
static int	ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);

TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);

SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
    "Firewall statistics");

SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
    &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
    &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
    "Rule number autoincrement step");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
    &fw_one_pass, 0,
    "Only do a single pass through ipfw when using dummynet(4)");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
    &fw_debug, 0, "Enable printing of debug ip_fw statements");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
    &fw_verbose, 0, "Log matches to ipfw rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
    &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
    &ipfw_table_max, 0, "Max # of tables");

static int	ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
static int	ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
static int	ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
static int	ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
static int	ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
static int	ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
/*
 * Timeouts for various events in handling states.
 *
 * NOTE:
 * 1 == 0~1 second.
 * 2 == 1~2 second(s).
 *
 * We use 2 seconds for FIN lifetime, so that the states will not be
 * ripped prematurely.
 */
static uint32_t dyn_ack_lifetime = 300;
static uint32_t dyn_syn_lifetime = 20;
static uint32_t dyn_finwait_lifetime = 20;
static uint32_t dyn_fin_lifetime = 2;
static uint32_t dyn_rst_lifetime = 2;
static uint32_t dyn_udp_lifetime = 10;
static uint32_t dyn_short_lifetime = 5;	/* used by tracks too */

/*
 * Keepalives are sent if dyn_keepalive is set.  They are sent every
 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
 * seconds of lifetime of a rule.
 */
static uint32_t dyn_keepalive_interval = 20;
static uint32_t dyn_keepalive_period = 5;
static uint32_t dyn_keepalive = 1;	/* do send keepalives */

static struct ipfw_global	ipfw_gd;
static int	ipfw_state_loosecnt_updthr;
static int	ipfw_state_max = 4096;	/* max # of states */
static int	ipfw_track_max = 4096;	/* max # of tracks */

static int	ipfw_state_headroom;	/* setup at module load time */
static int	ipfw_state_reap_min = 8;
static int	ipfw_state_expire_max = 32;
static int	ipfw_state_scan_max = 256;
static int	ipfw_keepalive_max = 8;
static int	ipfw_track_reap_max = 4;
static int	ipfw_track_expire_max = 16;
static int	ipfw_track_scan_max = 128;

static eventhandler_tag ipfw_ifaddr_event;
/* Compat */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
    "Number of states and tracks");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
    "Max number of states and tracks");

SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
    "Number of states");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
    "Max number of states");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
    &ipfw_state_headroom, 0, "Headroom for state reap");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
    &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
    &ipfw_track_max, 0, "Max number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
    &static_count, 0, "Number of static rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
    &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
    "I", "# of keepalives to send for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
    "I", "# of states to reap for state shortage");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to reap for track shortage");

SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
    "LU", "# of state reaps due to state shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
    "LU", "# of state reap failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
    "LU", "# of state overflows");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
    "LU", "# of state allocation failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
    "LU", "# of states deleted due to fast TCP port recycling");

SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
    "LU", "# of track allocation failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
    "LU", "# of track reaps due to track shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
    "LU", "# of track reap failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
    "LU", "# of track overflows");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
    "LU", "# of track counter allocation failures");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
    "LU", "# of IP fragments defragmented");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag dispatched to remote cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
    "LU", "# of address/port translations");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
    "LU", "# of address/port translations split between different cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
    "LU", "# of address/port translation conflicts on remote cpu");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
    "LU", "# of address/port translation conflicts resolved on remote cpu");
static int	ipfw_state_cmp(struct ipfw_state *,
		    struct ipfw_state *);
static int	ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
		    struct ipfw_trkcnt *);
static int	ipfw_track_cmp(struct ipfw_track *,
		    struct ipfw_track *);

RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);

RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);

RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);

static int	ipfw_chk(struct ip_fw_args *);
static void	ipfw_track_expire_ipifunc(void *);
static void	ipfw_state_expire_ipifunc(void *);
static void	ipfw_keepalive(void *);
static int	ipfw_state_expire_start(struct ipfw_context *,
		    int, int);
static void	ipfw_crossref_timeo(void *);
static void	ipfw_state_remove(struct ipfw_context *,
		    struct ipfw_state *);
static void	ipfw_xlat_reap_timeo(void *);
static void	ipfw_defrag_redispatch(struct mbuf *, int,
		    struct ip_fw *);

#define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKINIT	\
	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
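/*
 * Illustrative sketch (not part of the build): the track counter RB
 * tree in ipfw_gd is shared by all cpus, so accesses to it are
 * bracketed by the token macros above, e.g.:
 *
 *	IPFW_TRKCNT_TOKGET;
 *	trkcnt = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree, key);
 *	...
 *	IPFW_TRKCNT_TOKREL;
 */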
static void
sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
    const struct sockaddr *netmask)
{
	const u_char *cp1 = (const u_char *)src;
	u_char *cp2 = (u_char *)dst;
	const u_char *cp3 = (const u_char *)netmask;
	u_char *cplim = cp2 + *cp3;
	u_char *cplim2 = cp2 + *cp1;

	*cp2++ = *cp1++; *cp2++ = *cp1++;  /* copies sa_len & sa_family */
	cp3 += 2;
	if (cplim > cplim2)
		cplim = cplim2;
	while (cp2 < cplim)
		*cp2++ = *cp1++ & *cp3++;
	if (cp2 < cplim2)
		bzero(cp2, cplim2 - cp2);
}
static __inline uint16_t
pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
{
	uint32_t l;

	if (udp && !cksum)
		return (0x0000);
	l = cksum + old - new;
	l = (l >> 16) + (l & 65535);
	l = l & 65535;
	if (udp && !l)
		return (0xFFFF);
	return (l);
}
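/*
 * Illustrative note (not part of the build): this is the classic
 * incremental Internet checksum update (in the style of RFC 1141 /
 * RFC 1624).  When a 16-bit header field is rewritten, e.g. a port
 * during translation, the checksum is fixed without a full recompute
 * (operands in consistent network byte order):
 *
 *	tcp->th_sport = new_port;
 *	tcp->th_sum = pfil_cksum_fixup(tcp->th_sum, old_port, new_port, 0);
 *
 * The 'udp' argument preserves the UDP convention that an all-zero
 * checksum means "no checksum" and that a computed zero is sent as
 * 0xFFFF.
 */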
static __inline void
ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
    in_addr_t daddr, uint16_t dport, uint8_t proto)
{

	key->proto = proto;
	key->swap = 0;

	if (saddr < daddr) {
		key->addr_u.addrs.addr1 = daddr;
		key->addr_u.addrs.addr2 = saddr;
		key->swap |= IPFW_KEY_SWAP_ADDRS;
	} else {
		key->addr_u.addrs.addr1 = saddr;
		key->addr_u.addrs.addr2 = daddr;
	}

	if (sport < dport) {
		key->port_u.ports.port1 = dport;
		key->port_u.ports.port2 = sport;
		key->swap |= IPFW_KEY_SWAP_PORTS;
	} else {
		key->port_u.ports.port1 = sport;
		key->port_u.ports.port2 = dport;
	}

	if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
		key->swap |= IPFW_KEY_SWAP_PORTS;
	if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
		key->swap |= IPFW_KEY_SWAP_ADDRS;
}

static __inline void
ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
    in_addr_t *daddr, uint16_t *dport)
{

	if (key->swap & IPFW_KEY_SWAP_ADDRS) {
		*saddr = key->addr_u.addrs.addr2;
		*daddr = key->addr_u.addrs.addr1;
	} else {
		*saddr = key->addr_u.addrs.addr1;
		*daddr = key->addr_u.addrs.addr2;
	}

	if (key->swap & IPFW_KEY_SWAP_PORTS) {
		*sport = key->port_u.ports.port2;
		*dport = key->port_u.ports.port1;
	} else {
		*sport = key->port_u.ports.port1;
		*dport = key->port_u.ports.port2;
	}
}
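/*
 * Illustrative note (not part of the build): ipfw_key_build()
 * canonicalizes a flow so both directions of one connection yield the
 * same addr_u/port_u contents, with the swap bits recording the
 * original orientation.  For example (assuming ip1 < ip2):
 *
 *	ipfw_key_build(&k1, ip1, 12345, ip2, 80, IPPROTO_TCP);
 *	ipfw_key_build(&k2, ip2, 80, ip1, 12345, IPPROTO_TCP);
 *
 * k1 and k2 carry identical addresses and ports, while
 * (k1.swap ^ k2.swap) == IPFW_KEY_SWAP_ALL; this is exactly the case
 * ipfw_state_cmp() below treats as equal.
 */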
static int
ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
{

	if (s1->st_proto > s2->st_proto)
		return (1);
	if (s1->st_proto < s2->st_proto)
		return (-1);

	if (s1->st_addrs > s2->st_addrs)
		return (1);
	if (s1->st_addrs < s2->st_addrs)
		return (-1);

	if (s1->st_ports > s2->st_ports)
		return (1);
	if (s1->st_ports < s2->st_ports)
		return (-1);

	if (s1->st_swap == s2->st_swap ||
	    (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
		return (0);

	if (s1->st_swap > s2->st_swap)
		return (1);
	else
		return (-1);
}

static int
ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
{

	if (t1->tc_proto > t2->tc_proto)
		return (1);
	if (t1->tc_proto < t2->tc_proto)
		return (-1);

	if (t1->tc_addrs > t2->tc_addrs)
		return (1);
	if (t1->tc_addrs < t2->tc_addrs)
		return (-1);

	if (t1->tc_ports > t2->tc_ports)
		return (1);
	if (t1->tc_ports < t2->tc_ports)
		return (-1);

	if (t1->tc_ruleid > t2->tc_ruleid)
		return (1);
	if (t1->tc_ruleid < t2->tc_ruleid)
		return (-1);

	return (0);
}

static int
ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
{

	if (t1->t_proto > t2->t_proto)
		return (1);
	if (t1->t_proto < t2->t_proto)
		return (-1);

	if (t1->t_addrs > t2->t_addrs)
		return (1);
	if (t1->t_addrs < t2->t_addrs)
		return (-1);

	if (t1->t_ports > t2->t_ports)
		return (1);
	if (t1->t_ports < t2->t_ports)
		return (-1);

	if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
		return (1);
	if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
		return (-1);

	return (0);
}
static __inline struct ipfw_state *
ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
{
	struct ipfw_state *dup;

	KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
	    ("state %p was linked", s));
	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	if (dup == NULL) {
		TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
		s->st_flags |= IPFW_STATE_F_LINKED;
	}
	return (dup);
}

static __inline void
ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
{

	KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
	    ("state %p was not linked", s));
	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
	s->st_flags &= ~IPFW_STATE_F_LINKED;
}

static void
ipfw_state_max_set(int state_max)
{

	ipfw_state_max = state_max;
	/* Allow 5% states over-allocation. */
	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
}

static __inline int
ipfw_state_cntcoll(void)
{
	int cpu, state_cnt = 0;

	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
	return (state_cnt);
}

static __inline int
ipfw_state_cntsync(void)
{
	int state_cnt;

	state_cnt = ipfw_state_cntcoll();
	ipfw_gd.ipfw_state_loosecnt = state_cnt;
	return (state_cnt);
}

static __inline int
ipfw_free_rule(struct ip_fw *rule)
{
	KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
	KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
	rule->refcnt--;
	if (rule->refcnt == 0) {
		if (rule->cross_rules != NULL)
			kfree(rule->cross_rules, M_IPFW);
		kfree(rule, M_IPFW);
		return 1;
	}
	return 0;
}

static void
ipfw_unref_rule(void *priv)
{
	ipfw_free_rule(priv);
#ifdef KLD_MODULE
	KASSERT(ipfw_gd.ipfw_refcnt > 0,
	    ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
}

static __inline void
ipfw_ref_rule(struct ip_fw *rule)
{
	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
#ifdef KLD_MODULE
	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
	rule->refcnt++;
}
/*
 * This macro maps an ip pointer into a layer3 header pointer of type T
 */
#define L3HDR(T, ip)	((T *)((uint32_t *)(ip) + (ip)->ip_hl))

static __inline int
icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
{
	int type = L3HDR(struct icmp, ip)->icmp_type;
	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
	int idx = type / 32;

	if (idx >= idx_max)
		return (0);
	return (cmd->d[idx] & (1 << (type % 32)));
}

static __inline int
icmpcode_match(struct ip *ip, ipfw_insn_u32 *cmd)
{
	int code = L3HDR(struct icmp, ip)->icmp_code;
	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
	int idx = code / 32;

	if (idx >= idx_max)
		return (0);
	return (cmd->d[idx] & (1 << (code % 32)));
}

#define TT	((1 << ICMP_ECHO) | \
		 (1 << ICMP_ROUTERSOLICIT) | \
		 (1 << ICMP_TSTAMP) | \
		 (1 << ICMP_IREQ) | \
		 (1 << ICMP_MASKREQ))

static int
is_icmp_query(struct ip *ip)
{
	int type = L3HDR(struct icmp, ip)->icmp_type;

	return (type < 32 && (TT & (1 << type)));
}

#undef TT
/*
 * The following checks use two arrays of 8 or 16 bits to store the
 * bits that we want set or clear, respectively.  They are in the
 * low and high half of cmd->arg1 or cmd->d[0].
 *
 * We scan options and store the bits we find set.  We succeed if
 *
 *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
 *
 * The code is sometimes optimized not to store additional variables.
 */
static int
flags_match(ipfw_insn *cmd, uint8_t bits)
{
	u_char want_clear;
	bits = ~bits;

	if (((cmd->arg1 & 0xff) & bits) != 0)
		return 0;	/* some bits we want set were clear */

	want_clear = (cmd->arg1 >> 8) & 0xff;
	if ((want_clear & bits) != want_clear)
		return 0;	/* some bits we want clear were set */
	return 1;
}
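/*
 * Illustrative note (not part of the build): for a match such as
 * "tcpflags syn,!ack", the low byte of cmd->arg1 holds TH_SYN (must
 * be set) and the high byte holds TH_ACK (must be clear), so:
 *
 *	flags_match(cmd, TH_SYN)		returns 1
 *	flags_match(cmd, TH_SYN | TH_ACK)	returns 0
 */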
static int
ipopts_match(struct ip *ip, ipfw_insn *cmd)
{
	int optlen, bits = 0;
	u_char *cp = (u_char *)(ip + 1);
	int x = (ip->ip_hl << 2) - sizeof(struct ip);

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[IPOPT_OPTVAL];

		if (opt == IPOPT_EOL)
			break;

		if (opt == IPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = cp[IPOPT_OLEN];
			if (optlen <= 0 || optlen > x)
				return 0;	/* invalid or truncated */
		}

		switch (opt) {
		case IPOPT_LSRR:
			bits |= IP_FW_IPOPT_LSRR;
			break;

		case IPOPT_SSRR:
			bits |= IP_FW_IPOPT_SSRR;
			break;

		case IPOPT_RR:
			bits |= IP_FW_IPOPT_RR;
			break;

		case IPOPT_TS:
			bits |= IP_FW_IPOPT_TS;
			break;

		default:
			break;
		}
	}
	return (flags_match(cmd, bits));
}

static int
tcpopts_match(struct ip *ip, ipfw_insn *cmd)
{
	int optlen, bits = 0;
	struct tcphdr *tcp = L3HDR(struct tcphdr, ip);
	u_char *cp = (u_char *)(tcp + 1);
	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[0];

		if (opt == TCPOPT_EOL)
			break;

		if (opt == TCPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = cp[1];
			if (optlen <= 0)
				break;
		}

		switch (opt) {
		case TCPOPT_MAXSEG:
			bits |= IP_FW_TCPOPT_MSS;
			break;

		case TCPOPT_WINDOW:
			bits |= IP_FW_TCPOPT_WINDOW;
			break;

		case TCPOPT_SACK_PERMITTED:
		case TCPOPT_SACK:
			bits |= IP_FW_TCPOPT_SACK;
			break;

		case TCPOPT_TIMESTAMP:
			bits |= IP_FW_TCPOPT_TS;
			break;

		case TCPOPT_CC:
		case TCPOPT_CCNEW:
		case TCPOPT_CCECHO:
			bits |= IP_FW_TCPOPT_CC;
			break;

		default:
			break;
		}
	}
	return (flags_match(cmd, bits));
}
static int
iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
{
	if (ifp == NULL)	/* no iface with this packet, match fails */
		return 0;

	/* Check by name or by IP address */
	if (cmd->name[0] != '\0') { /* match by name */
		/* Check name */
		if (cmd->p.glob) {
			if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
				return(1);
		} else {
			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
				return(1);
		}
	} else {
		struct ifaddr_container *ifac;

		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ia = ifac->ifa;

			if (ia->ifa_addr == NULL)
				continue;
			if (ia->ifa_addr->sa_family != AF_INET)
				continue;
			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
			    (ia->ifa_addr))->sin_addr.s_addr)
				return(1);	/* match */
		}
	}
	return(0);	/* no match, fail ... */
}

#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
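/*
 * Illustrative note (not part of the build): SNPARGS() expands to the
 * "buffer, size" argument pair for ksnprintf() at a given offset and
 * clamps the size to zero once the offset passes the end of the
 * buffer, so log lines can be appended piecewise:
 *
 *	len = ksnprintf(SNPARGS(proto, 0), "TCP %s", src);
 *	ksnprintf(SNPARGS(proto, len), ":%d", sport);
 */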
/*
 * We enter here when we have a rule with O_LOG.
 * XXX this function alone takes about 2Kbytes of code!
 */
static void
ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
    struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
{
	char *action;
	int limit_reached = 0;
	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];

	fragment[0] = '\0';
	proto[0] = '\0';

	if (f == NULL) {	/* bogus pkt */
		if (verbose_limit != 0 &&
		    ctx->ipfw_norule_counter >= verbose_limit)
			return;
		ctx->ipfw_norule_counter++;
		if (ctx->ipfw_norule_counter == verbose_limit)
			limit_reached = verbose_limit;
		action = "Refuse";
	} else {	/* O_LOG is the first action, find the real one */
		ipfw_insn *cmd = ACTION_PTR(f);
		ipfw_insn_log *l = (ipfw_insn_log *)cmd;

		if (l->max_log != 0 && l->log_left == 0)
			return;
		l->log_left--;
		if (l->log_left == 0)
			limit_reached = l->max_log;
		cmd += F_LEN(cmd);	/* point to first action */
		if (cmd->opcode == O_PROB)
			cmd += F_LEN(cmd);

		action = action2;
		switch (cmd->opcode) {
		case O_DENY:
			action = "Deny";
			break;

		case O_REJECT:
			if (cmd->arg1 == ICMP_REJECT_RST) {
				action = "Reset";
			} else if (cmd->arg1 == ICMP_UNREACH_HOST) {
				action = "Reject";
			} else {
				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
					  cmd->arg1);
			}
			break;

		case O_ACCEPT:
			action = "Accept";
			break;

		case O_COUNT:
			action = "Count";
			break;

		case O_DIVERT:
			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
			break;

		case O_TEE:
			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
			break;

		case O_SKIPTO:
			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
			break;

		case O_PIPE:
			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
			break;

		case O_QUEUE:
			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
			break;

		case O_FORWARD_IP:
		{
			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
			int len;

			len = ksnprintf(SNPARGS(action2, 0),
					"Forward to %s",
					kinet_ntoa(sa->sa.sin_addr, abuf));
			if (sa->sa.sin_port) {
				ksnprintf(SNPARGS(action2, len), ":%d",
					  sa->sa.sin_port);
			}
		}
			break;

		default:
			action = "UNKNOWN";
			break;
		}
	}

	if (hlen == 0) {	/* non-ip */
		ksnprintf(SNPARGS(proto, 0), "MAC");
	} else {
		struct ip *ip = mtod(m, struct ip *);
		/* these three are all aliases to the same thing */
		struct icmp *const icmp = L3HDR(struct icmp, ip);
		struct tcphdr *const tcp = (struct tcphdr *)icmp;
		struct udphdr *const udp = (struct udphdr *)icmp;

		int ip_off, offset, ip_len;
		int len;

		if (eh != NULL) {	/* layer 2 packets are as on the wire */
			ip_off = ntohs(ip->ip_off);
			ip_len = ntohs(ip->ip_len);
		} else {
			ip_off = ip->ip_off;
			ip_len = ip->ip_len;
		}
		offset = ip_off & IP_OFFMASK;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(tcp->th_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(tcp->th_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_UDP:
			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(udp->uh_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(udp->uh_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_ICMP:
			if (offset == 0) {
				len = ksnprintf(SNPARGS(proto, 0),
						"ICMP:%u.%u ",
						icmp->icmp_type,
						icmp->icmp_code);
			} else {
				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
			}
			len += ksnprintf(SNPARGS(proto, len), "%s",
					 kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;

		default:
			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
					kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;
		}

		if (ip_off & (IP_MF | IP_OFFMASK)) {
			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
				  offset << 3, (ip_off & IP_MF) ? "+" : "");
		}
	}

	if (oif || m->m_pkthdr.rcvif) {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s %s via %s%s\n",
		    f ? f->rulenum : -1,
		    action, proto, oif ? "out" : "in",
		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
		    fragment);
	} else {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s [no if info]%s\n",
		    f ? f->rulenum : -1,
		    action, proto, fragment);
	}

	if (limit_reached) {
		log(LOG_SECURITY | LOG_NOTICE,
		    "ipfw: limit %d reached on entry %d\n",
		    limit_reached, f ? f->rulenum : -1);
	}
}

#undef SNPARGS
static void
ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
{
	struct ip_fw *rule = slave_x->xlat_rule;

	KKASSERT(rule->cpuid == mycpuid);

	/* No more cross references; free this pair now. */
	kfree(x, M_IPFW);
	kfree(slave_x, M_IPFW);

	/* See the comment in ipfw_ip_xlate_dispatch(). */
	rule->cross_refs--;
}

static void
ipfw_xlat_reap_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *s, *ns;

	ASSERT_NETISR_NCPUS(mycpuid);

	crit_enter();
	/* Reply ASAP. */
	netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
	crit_exit();

	/* TODO: limit scanning depth */
	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
		struct ipfw_xlat *slave_x = x->xlat_pair;
		uint64_t crefs;

		crefs = slave_x->xlat_crefs + x->xlat_crefs;
		if (crefs == 0) {
			TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
			ipfw_xlat_reap(x, slave_x);
		}
	}
	if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
		    &ctx->ipfw_xlatreap_nm);
	}
}

static void
ipfw_xlat_reap_timeo(void *xnm)
{
	struct netmsg_base *nm = xnm;

	KKASSERT(mycpuid < netisr_ncpus);

	crit_enter();
	if (nm->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(nm);
	crit_exit();
}

static void
ipfw_xlat_free_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
	struct ipfw_xlat *slave_x = x->xlat_pair;
	uint64_t crefs;

	ASSERT_NETISR_NCPUS(mycpuid);

	KKASSERT(slave_x != NULL);
	KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);

	KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
	    ("master xlat is still linked"));
	if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
		ipfw_state_unlink(ctx, &slave_x->xlat_st);

	/* See the comment in ipfw_ip_xlate_dispatch(). */
	slave_x->xlat_crefs--;

	crefs = slave_x->xlat_crefs + x->xlat_crefs;
	if (crefs == 0) {
		ipfw_xlat_reap(x, slave_x);
		return;
	}

	if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
		    &ctx->ipfw_xlatreap_nm);
	}

	/*
	 * This pair is still referenced; defer its destruction.
	 * YYY reuse st_link.
	 */
	TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
}

static __inline void
ipfw_xlat_invalidate(struct ipfw_xlat *x)
{

	x->xlat_invalid = 1;
	x->xlat_pair->xlat_invalid = 1;
}
static void
ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
{
	struct ipfw_xlat *x, *slave_x;
	struct netmsg_base *nm;

	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
	    IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
	KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
	    ("delete slave xlat"));

	KASSERT(ctx->ipfw_state_cnt > 0,
	    ("invalid state count %d", ctx->ipfw_state_cnt));
	ctx->ipfw_state_cnt--;
	if (ctx->ipfw_state_loosecnt > 0)
		ctx->ipfw_state_loosecnt--;

	/*
	 * Unhook this state.
	 */
	if (s->st_track != NULL) {
		struct ipfw_track *t = s->st_track;

		KASSERT(!LIST_EMPTY(&t->t_state_list),
		    ("track state list is empty"));
		LIST_REMOVE(s, st_trklink);

		KASSERT(*t->t_count > 0,
		    ("invalid track count %d", *t->t_count));
		atomic_subtract_int(t->t_count, 1);
	}
	ipfw_state_unlink(ctx, s);

	/*
	 * Free this state.  Xlats require special processing,
	 * since xlats are paired states and they could be on
	 * different cpus.
	 */

	if (!IPFW_ISXLAT(s->st_type)) {
		/* Not xlat; free now. */
		kfree(s, M_IPFW);
		/* Done! */
		return;
	}
	x = (struct ipfw_xlat *)s;

	if (x->xlat_pair == NULL) {
		/* Not setup yet; free now. */
		kfree(x, M_IPFW);
		/* Done! */
		return;
	}
	slave_x = x->xlat_pair;
	KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);

	if (x->xlat_pcpu == mycpuid) {
		/*
		 * Paired states are on the same cpu; delete this
		 * pair now.
		 */
		KKASSERT(x->xlat_crefs == 0);
		KKASSERT(slave_x->xlat_crefs == 0);
		if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
			ipfw_state_unlink(ctx, &slave_x->xlat_st);
		kfree(x, M_IPFW);
		kfree(slave_x, M_IPFW);
		return;
	}

	/*
	 * Free the paired states on the cpu owning the slave xlat.
	 */

	/*
	 * Mark the state pair invalid; completely deleting them
	 * may take some time.
	 */
	ipfw_xlat_invalidate(x);

	nm = &x->xlat_freenm;
	netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
	    ipfw_xlat_free_dispatch);
	nm->lmsg.u.ms_resultp = x;

	/* See the comment in ipfw_xlate_redispatch(). */
	x->xlat_rule->cross_refs++;
	x->xlat_crefs++;

	netisr_sendmsg(nm, x->xlat_pcpu);
}

static void
ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
{

	if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
		KKASSERT(IPFW_ISXLAT(s->st_type));
		ipfw_xlat_invalidate((struct ipfw_xlat *)s);
		ipfw_state_unlink(ctx, s);
		return;
	}
	ipfw_state_del(ctx, s);
}
static int
ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
{
	struct ipfw_state *s, *anchor;
	int expired;

	if (reap_max < ipfw_state_reap_min)
		reap_max = ipfw_state_reap_min;

	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
		/*
		 * Kick start state expiring.  Ignore scan limit,
		 * we are short of states.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
		return (expired);
	}

	/*
	 * States are being expired.
	 */

	if (ctx->ipfw_state_cnt == 0)
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_stateexp_anch;
	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of states.
		 */

		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
			ipfw_state_del(ctx, s);
			if (++expired >= reap_max)
				break;
			if ((expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max)
				break;
		}
	}

	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_state_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}

static void
ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
{
	struct ipfw_state *s, *sn;

	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
		if (IPFW_STATE_SCANSKIP(s))
			continue;
		if (rule != NULL && s->st_rule != rule)
			continue;
		ipfw_state_del(ctx, s);
	}
}

static void
ipfw_state_expire_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
	callout_reset(&ctx->ipfw_stateto_ch, hz,
	    ipfw_state_expire_ipifunc, NULL);
}

static void
ipfw_state_expire_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_stateexp_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("stateexp more did not finish"));
	netisr_sendmsg_oncpu(nm);
}

static int
ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		if (scanned++ >= scan_max) {
			ipfw_state_expire_more(ctx);
			return (expired);
		}

		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		if (IPFW_STATE_ISDEAD(s) ||
		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
		     IPFW_STATE_TCPCLOSED(s))) {
			ipfw_state_del(ctx, s);
			if (++expired >= expire_max) {
				ipfw_state_expire_more(ctx);
				return (expired);
			}
			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
			    (expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max) {
				ipfw_state_expire_more(ctx);
				return (expired);
			}
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_state_expire_done(ctx);
	return (expired);
}
1940 static void
1941 ipfw_state_expire_more_dispatch(netmsg_t nm)
1943 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1944 struct ipfw_state *anchor;
1946 ASSERT_NETISR_NCPUS(mycpuid);
1947 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1948 ("statexp is not in progress"));
1950 /* Reply ASAP */
1951 netisr_replymsg(&nm->base, 0);
1953 anchor = &ctx->ipfw_stateexp_anch;
1954 if (ctx->ipfw_state_cnt == 0) {
1955 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1956 ipfw_state_expire_done(ctx);
1957 return;
1959 ipfw_state_expire_loop(ctx, anchor,
1960 ipfw_state_scan_max, ipfw_state_expire_max);
1963 static int
1964 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1966 struct ipfw_state *anchor;
1968 KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1969 ("stateexp is in progress"));
1970 ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1972 if (ctx->ipfw_state_cnt == 0) {
1973 ipfw_state_expire_done(ctx);
1974 return (0);
1978 /* Do not expire more than once per second, it is useless. */
1980 if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1981 ctx->ipfw_state_lastexp == time_uptime) {
1982 ipfw_state_expire_done(ctx);
1983 return (0);
1985 ctx->ipfw_state_lastexp = time_uptime;
1987 anchor = &ctx->ipfw_stateexp_anch;
1988 TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1989 return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1992 static void
1993 ipfw_state_expire_dispatch(netmsg_t nm)
1995 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1997 ASSERT_NETISR_NCPUS(mycpuid);
1999 /* Reply ASAP */
2000 crit_enter();
2001 netisr_replymsg(&nm->base, 0);
2002 crit_exit();
2004 if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
2005 /* Running; done. */
2006 return;
2008 ipfw_state_expire_start(ctx,
2009 ipfw_state_scan_max, ipfw_state_expire_max);
2012 static void
2013 ipfw_state_expire_ipifunc(void *dummy __unused)
2015 struct netmsg_base *msg;
2017 KKASSERT(mycpuid < netisr_ncpus);
2018 msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2020 crit_enter();
2021 if (msg->lmsg.ms_flags & MSGF_DONE)
2022 netisr_sendmsg_oncpu(msg);
2023 crit_exit();
2026 static boolean_t
2027 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
2029 uint32_t seq = ntohl(tcp->th_seq);
2030 uint32_t ack = ntohl(tcp->th_ack);
2032 if (tcp->th_flags & TH_RST)
2033 return (TRUE);
2035 if (dir == MATCH_FORWARD) {
2036 if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
2037 s->st_flags |= IPFW_STATE_F_SEQFWD;
2038 s->st_seq_fwd = seq;
2039 } else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
2040 s->st_seq_fwd = seq;
2041 } else {
2042 /* Out-of-sequence; done. */
2043 return (FALSE);
2045 if (tcp->th_flags & TH_ACK) {
2046 if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
2047 s->st_flags |= IPFW_STATE_F_ACKFWD;
2048 s->st_ack_fwd = ack;
2049 } else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
2050 s->st_ack_fwd = ack;
2051 } else {
2052 /* Out-of-sequence; done. */
2053 return (FALSE);
2056 if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
2057 (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
2058 s->st_state |= (TH_ACK << 8);
2060 } else {
2061 if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
2062 s->st_flags |= IPFW_STATE_F_SEQREV;
2063 s->st_seq_rev = seq;
2064 } else if (SEQ_GEQ(seq, s->st_seq_rev)) {
2065 s->st_seq_rev = seq;
2066 } else {
2067 /* Out-of-sequence; done. */
2068 return (FALSE);
2070 if (tcp->th_flags & TH_ACK) {
2071 if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
2072 s->st_flags |= IPFW_STATE_F_ACKREV;
2073 s->st_ack_rev = ack;
2074 } else if (SEQ_GEQ(ack, s->st_ack_rev)) {
2075 s->st_ack_rev = ack;
2076 } else {
2077 /* Out-of-sequence; done. */
2078 return (FALSE);
2081 if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
2082 s->st_ack_rev == s->st_seq_fwd + 1)
2083 s->st_state |= TH_ACK;
2086 return (TRUE);
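/*
 * Illustrative note, not part of the original source: SEQ_GEQ() from
 * <netinet/tcp_seq.h> compares sequence numbers in modular 32-bit
 * arithmetic, so the monotonicity checks above keep working when a
 * connection's sequence space wraps around 2^32:
 */
#if 0
/* tcp_seq.h defines, in essence: SEQ_GEQ(a, b) == ((int)((a) - (b)) >= 0) */
KKASSERT(SEQ_GEQ(0x00000005, 0xfffffff0));	/* 21 bytes ahead, wrapped */
KKASSERT(!SEQ_GEQ(0xfffffff0, 0x00000005));
#endif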
2089 static void
2090 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
2091 const struct tcphdr *tcp, struct ipfw_state *s)
2094 if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
2095 u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
2097 if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
2098 return;
2100 s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
2101 switch (s->st_state & IPFW_STATE_TCPSTATES) {
2102 case TH_SYN: /* opening */
2103 s->st_expire = time_uptime + dyn_syn_lifetime;
2104 break;
2106 case BOTH_SYN: /* move to established */
2107 case BOTH_SYN | TH_FIN: /* one side tries to close */
2108 case BOTH_SYN | (TH_FIN << 8):
2109 s->st_expire = time_uptime + dyn_ack_lifetime;
2110 break;
2112 case BOTH_SYN | BOTH_FIN: /* both sides closed */
2113 if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
2114 /* And both FINs were ACKed. */
2115 s->st_expire = time_uptime + dyn_fin_lifetime;
2116 } else {
2117 s->st_expire = time_uptime +
2118 dyn_finwait_lifetime;
2120 break;
2122 default:
2123 #if 0
2125 /* reset or some invalid combination, but can also
2126 * occur if we use keep-state the wrong way. */
2128 if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
2129 kprintf("invalid state: 0x%x\n", s->st_state);
2130 #endif
2131 s->st_expire = time_uptime + dyn_rst_lifetime;
2132 break;
2134 } else if (pkt->proto == IPPROTO_UDP) {
2135 s->st_expire = time_uptime + dyn_udp_lifetime;
2136 } else {
2137 /* other protocols */
2138 s->st_expire = time_uptime + dyn_short_lifetime;
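/*
 * Illustrative summary, not part of the original source, of the
 * lifetime selection above (all knobs are sysctl-tunable):
 *
 *	TCP, only one SYN seen			dyn_syn_lifetime
 *	TCP, SYNs both ways (established,
 *	    possibly one side closing)		dyn_ack_lifetime
 *	TCP, FINs both ways, both ACKed		dyn_fin_lifetime
 *	TCP, FINs both ways, not both ACKed	dyn_finwait_lifetime
 *	TCP, RST or invalid combination		dyn_rst_lifetime
 *	UDP					dyn_udp_lifetime
 *	other protocols				dyn_short_lifetime
 */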
2143 /* Lookup a state. */
2145 static struct ipfw_state *
2146 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2147 int *match_direction, const struct tcphdr *tcp)
2149 struct ipfw_state *key, *s;
2150 int dir = MATCH_NONE;
2152 key = &ctx->ipfw_state_tmpkey;
2153 ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2154 pkt->dst_ip, pkt->dst_port, pkt->proto);
2155 s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2156 if (s == NULL)
2157 goto done; /* not found. */
2158 if (IPFW_STATE_ISDEAD(s)) {
2159 ipfw_state_remove(ctx, s);
2160 s = NULL;
2161 goto done;
2163 if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2164 /* TCP port recycling is too fast. */
2165 ctx->ipfw_sts_tcprecycled++;
2166 ipfw_state_remove(ctx, s);
2167 s = NULL;
2168 goto done;
2171 if (s->st_swap == key->st_swap) {
2172 dir = MATCH_FORWARD;
2173 } else {
2174 KASSERT((s->st_swap & key->st_swap) == 0,
2175 ("found mismatch state"));
2176 dir = MATCH_REVERSE;
2179 /* Update this state. */
2180 ipfw_state_update(pkt, dir, tcp, s);
2182 if (s->st_track != NULL) {
2183 /* This track has been used. */
2184 s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2186 done:
2187 if (match_direction)
2188 *match_direction = dir;
2189 return (s);
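/*
 * Illustrative note, not part of the original source: a single RB
 * node can match both directions of a flow because ipfw_key_build()
 * apparently stores the two (address, port) endpoints in a canonical
 * order, with st_swap recording whether the caller's (src, dst) had
 * to be swapped to reach it.  Equal swap bits therefore mean the
 * packet runs in the direction the state was installed
 * (MATCH_FORWARD); differing bits mean the reply direction
 * (MATCH_REVERSE), as checked above.
 */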
2192 static struct ipfw_state *
2193 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2194 uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2196 struct ipfw_state *s;
2197 size_t sz;
2199 KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2200 ("invalid state type %u", type));
2202 sz = sizeof(struct ipfw_state);
2203 if (IPFW_ISXLAT(type))
2204 sz = sizeof(struct ipfw_xlat);
2206 s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2207 if (s == NULL) {
2208 ctx->ipfw_sts_nomem++;
2209 return (NULL);
2212 ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2213 id->dst_ip, id->dst_port, id->proto);
2215 s->st_rule = rule;
2216 s->st_type = type;
2217 if (IPFW_ISXLAT(type)) {
2218 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2220 x->xlat_dir = MATCH_NONE;
2221 x->xlat_pcpu = -1;
2225 /* Update this state:
2226 * Set st_expire and st_state. */
2228 ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2230 return (s);
2233 static struct ipfw_state *
2234 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2235 uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
2236 const struct tcphdr *tcp)
2238 struct ipfw_state *s, *dup;
2240 s = ipfw_state_alloc(ctx, id, type, rule, tcp);
2241 if (s == NULL)
2242 return (NULL);
2244 ctx->ipfw_state_cnt++;
2245 ctx->ipfw_state_loosecnt++;
2246 if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
2247 ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
2248 ctx->ipfw_state_loosecnt = 0;
2251 dup = ipfw_state_link(ctx, s);
2252 if (dup != NULL)
2253 panic("ipfw: %u state exists %p", type, dup);
2255 if (t != NULL) {
2256 /* Keep the track referenced. */
2257 LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
2258 s->st_track = t;
2260 return (s);
2263 static boolean_t
2264 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
2266 struct ipfw_trkcnt *trk;
2267 boolean_t trk_freed = FALSE;
2269 KASSERT(t->t_count != NULL, ("track anchor"));
2270 KASSERT(LIST_EMPTY(&t->t_state_list),
2271 ("invalid track is still referenced"));
2273 trk = t->t_trkcnt;
2274 KASSERT(trk != NULL, ("track has no trkcnt"));
2276 RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2277 TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
2278 kfree(t, M_IPFW);
2281 /* fdrop() style reference counting.
2282 * See kern/kern_descrip.c fdrop(). */
2284 for (;;) {
2285 int refs = trk->tc_refs;
2287 cpu_ccfence();
2288 KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
2289 if (refs == 1) {
2290 IPFW_TRKCNT_TOKGET;
2291 if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
2292 KASSERT(trk->tc_count == 0,
2293 ("%d states reference this trkcnt",
2294 trk->tc_count));
2295 RB_REMOVE(ipfw_trkcnt_tree,
2296 &ipfw_gd.ipfw_trkcnt_tree, trk);
2298 KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
2299 ("invalid trkcnt cnt %d",
2300 ipfw_gd.ipfw_trkcnt_cnt));
2301 ipfw_gd.ipfw_trkcnt_cnt--;
2302 IPFW_TRKCNT_TOKREL;
2304 if (ctx->ipfw_trkcnt_spare == NULL)
2305 ctx->ipfw_trkcnt_spare = trk;
2306 else
2307 kfree(trk, M_IPFW);
2308 trk_freed = TRUE;
2309 break; /* done! */
2311 IPFW_TRKCNT_TOKREL;
2312 /* retry */
2313 } else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2314 break; /* done! */
2316 /* retry */
2318 return (trk_freed);
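/*
 * Illustrative sketch, not part of the original source, of the
 * fdrop()-style release loop used above: the common case (refs > 1)
 * is a lock-free atomic decrement; only the final reference takes
 * the token, revalidates the count under it and tears the object
 * down.  The cmpset retries absorb races with concurrent holders
 * (lock()/unlock()/destroy() are hypothetical stand-ins):
 */
#if 0
for (;;) {
	int refs = obj->refs;

	cpu_ccfence();		/* force refetch of obj->refs each spin */
	if (refs == 1) {
		lock();
		if (atomic_cmpset_int(&obj->refs, 1, 0)) {
			destroy(obj);	/* last reference went away */
			unlock();
			break;
		}
		unlock();	/* lost a race; retry */
	} else if (atomic_cmpset_int(&obj->refs, refs, refs - 1)) {
		break;		/* fast path: not the last reference */
	}
}
#endif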
2321 static void
2322 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2324 struct ipfw_track *t, *tn;
2326 TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2327 if (t->t_count == NULL) /* anchor */
2328 continue;
2329 if (rule != NULL && t->t_rule != rule)
2330 continue;
2331 ipfw_track_free(ctx, t);
2335 static boolean_t
2336 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2337 boolean_t reap)
2339 struct ipfw_state *s, *sn;
2340 boolean_t ret = FALSE;
2342 KASSERT(t->t_count != NULL, ("track anchor"));
2344 if (LIST_EMPTY(&t->t_state_list))
2345 return (FALSE);
2348 /* Do not expire more than once per second, it is useless. */
2350 if (t->t_lastexp == time_uptime)
2351 return (FALSE);
2352 t->t_lastexp = time_uptime;
2354 LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2355 if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2356 KASSERT(s->st_track == t,
2357 ("state track %p does not match %p",
2358 s->st_track, t));
2359 ipfw_state_del(ctx, s);
2360 ret = TRUE;
2363 return (ret);
2366 static __inline struct ipfw_trkcnt *
2367 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2369 struct ipfw_trkcnt *trk;
2371 if (ctx->ipfw_trkcnt_spare != NULL) {
2372 trk = ctx->ipfw_trkcnt_spare;
2373 ctx->ipfw_trkcnt_spare = NULL;
2374 } else {
2375 trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2376 M_INTWAIT | M_NULLOK);
2378 return (trk);
2381 static void
2382 ipfw_track_expire_done(struct ipfw_context *ctx)
2385 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2386 ("trackexp is not in progress"));
2387 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2388 callout_reset(&ctx->ipfw_trackto_ch, hz,
2389 ipfw_track_expire_ipifunc, NULL);
2392 static void
2393 ipfw_track_expire_more(struct ipfw_context *ctx)
2395 struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2397 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2398 ("trackexp is not in progress"));
2399 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2400 ("trackexp more did not finish"));
2401 netisr_sendmsg_oncpu(nm);
2404 static int
2405 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2406 int scan_max, int expire_max)
2408 struct ipfw_track *t;
2409 int scanned = 0, expired = 0;
2410 boolean_t reap = FALSE;
2412 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2413 ("trackexp is not in progress"));
2415 if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2416 reap = TRUE;
2418 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2419 if (scanned++ >= scan_max) {
2420 ipfw_track_expire_more(ctx);
2421 return (expired);
2424 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2425 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2427 if (t->t_count == NULL) /* anchor */
2428 continue;
2430 ipfw_track_state_expire(ctx, t, reap);
2431 if (!LIST_EMPTY(&t->t_state_list)) {
2432 /* There are states referencing this track. */
2433 continue;
2436 if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2437 /* Expired. */
2438 if (ipfw_track_free(ctx, t)) {
2439 if (++expired >= expire_max) {
2440 ipfw_track_expire_more(ctx);
2441 return (expired);
2446 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2447 ipfw_track_expire_done(ctx);
2448 return (expired);
2451 static int
2452 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2454 struct ipfw_track *anchor;
2456 KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2457 ("trackexp is in progress"));
2458 ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2460 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2461 ipfw_track_expire_done(ctx);
2462 return (0);
2466 /* Do not expire more than once per second, it is useless. */
2468 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2469 ctx->ipfw_track_lastexp == time_uptime) {
2470 ipfw_track_expire_done(ctx);
2471 return (0);
2473 ctx->ipfw_track_lastexp = time_uptime;
2475 anchor = &ctx->ipfw_trackexp_anch;
2476 TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2477 return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2480 static void
2481 ipfw_track_expire_more_dispatch(netmsg_t nm)
2483 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2484 struct ipfw_track *anchor;
2486 ASSERT_NETISR_NCPUS(mycpuid);
2487 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2488 ("trackexp is not in progress"));
2490 /* Reply ASAP */
2491 netisr_replymsg(&nm->base, 0);
2493 anchor = &ctx->ipfw_trackexp_anch;
2494 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2495 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2496 ipfw_track_expire_done(ctx);
2497 return;
2499 ipfw_track_expire_loop(ctx, anchor,
2500 ipfw_track_scan_max, ipfw_track_expire_max);
2503 static void
2504 ipfw_track_expire_dispatch(netmsg_t nm)
2506 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2508 ASSERT_NETISR_NCPUS(mycpuid);
2510 /* Reply ASAP */
2511 crit_enter();
2512 netisr_replymsg(&nm->base, 0);
2513 crit_exit();
2515 if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2516 /* Running; done. */
2517 return;
2519 ipfw_track_expire_start(ctx,
2520 ipfw_track_scan_max, ipfw_track_expire_max);
2523 static void
2524 ipfw_track_expire_ipifunc(void *dummy __unused)
2526 struct netmsg_base *msg;
2528 KKASSERT(mycpuid < netisr_ncpus);
2529 msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2531 crit_enter();
2532 if (msg->lmsg.ms_flags & MSGF_DONE)
2533 netisr_sendmsg_oncpu(msg);
2534 crit_exit();
2537 static int
2538 ipfw_track_reap(struct ipfw_context *ctx)
2540 struct ipfw_track *t, *anchor;
2541 int expired;
2543 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2545 /* Kick start track expiring. Ignore scan limit,
2546 * we are short of tracks. */
2548 ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2549 expired = ipfw_track_expire_start(ctx, INT_MAX,
2550 ipfw_track_reap_max);
2551 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2552 return (expired);
2556 /* Tracks are being expired. */
2559 if (RB_EMPTY(&ctx->ipfw_track_tree))
2560 return (0);
2562 expired = 0;
2563 anchor = &ctx->ipfw_trackexp_anch;
2564 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2566 /* Ignore scan limit; we are short of tracks. */
2569 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2570 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2572 if (t->t_count == NULL) /* anchor */
2573 continue;
2575 ipfw_track_state_expire(ctx, t, TRUE);
2576 if (!LIST_EMPTY(&t->t_state_list)) {
2577 /* There are states referencing this track. */
2578 continue;
2581 if (ipfw_track_free(ctx, t)) {
2582 if (++expired >= ipfw_track_reap_max) {
2583 ipfw_track_expire_more(ctx);
2584 break;
2589 /* NOTE:
2590 * Leave the anchor on the list, even if the end of the list has
2591 * been reached. ipfw_track_expire_more_dispatch() will handle
2592 * the removal. */
2594 return (expired);
2597 static struct ipfw_track *
2598 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2599 uint16_t limit_mask, struct ip_fw *rule)
2601 struct ipfw_track *key, *t, *dup;
2602 struct ipfw_trkcnt *trk, *ret;
2603 boolean_t do_expire = FALSE;
2605 KASSERT(rule->track_ruleid != 0,
2606 ("rule %u has no track ruleid", rule->rulenum));
2608 key = &ctx->ipfw_track_tmpkey;
2609 key->t_proto = id->proto;
2610 key->t_addrs = 0;
2611 key->t_ports = 0;
2612 key->t_rule = rule;
2613 if (limit_mask & DYN_SRC_ADDR)
2614 key->t_saddr = id->src_ip;
2615 if (limit_mask & DYN_DST_ADDR)
2616 key->t_daddr = id->dst_ip;
2617 if (limit_mask & DYN_SRC_PORT)
2618 key->t_sport = id->src_port;
2619 if (limit_mask & DYN_DST_PORT)
2620 key->t_dport = id->dst_port;
2622 t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2623 if (t != NULL)
2624 goto done;
2626 t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2627 if (t == NULL) {
2628 ctx->ipfw_tks_nomem++;
2629 return (NULL);
2632 t->t_key = key->t_key;
2633 t->t_rule = rule;
2634 t->t_lastexp = 0;
2635 LIST_INIT(&t->t_state_list);
2637 if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2638 time_t globexp, uptime;
2640 trk = NULL;
2641 do_expire = TRUE;
2644 /* Do not expire globally more than once per second,
2645 * it is useless. */
2647 uptime = time_uptime;
2648 globexp = ipfw_gd.ipfw_track_globexp;
2649 if (globexp != uptime &&
2650 atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2651 globexp, uptime)) {
2652 int cpu;
2654 /* Expire tracks on other CPUs. */
2655 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2656 if (cpu == mycpuid)
2657 continue;
2658 lwkt_send_ipiq(globaldata_find(cpu),
2659 ipfw_track_expire_ipifunc, NULL);
2662 } else {
2663 trk = ipfw_trkcnt_alloc(ctx);
2665 if (trk == NULL) {
2666 struct ipfw_trkcnt *tkey;
2668 tkey = &ctx->ipfw_trkcnt_tmpkey;
2669 key = NULL; /* tkey overlaps key */
2671 tkey->tc_key = t->t_key;
2672 tkey->tc_ruleid = rule->track_ruleid;
2674 IPFW_TRKCNT_TOKGET;
2675 trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2676 tkey);
2677 if (trk == NULL) {
2678 IPFW_TRKCNT_TOKREL;
2679 if (do_expire) {
2680 ctx->ipfw_tks_reap++;
2681 if (ipfw_track_reap(ctx) > 0) {
2682 if (ipfw_gd.ipfw_trkcnt_cnt <
2683 ipfw_track_max) {
2684 trk = ipfw_trkcnt_alloc(ctx);
2685 if (trk != NULL)
2686 goto install;
2687 ctx->ipfw_tks_cntnomem++;
2688 } else {
2689 ctx->ipfw_tks_overflow++;
2691 } else {
2692 ctx->ipfw_tks_reapfailed++;
2693 ctx->ipfw_tks_overflow++;
2695 } else {
2696 ctx->ipfw_tks_cntnomem++;
2698 kfree(t, M_IPFW);
2699 return (NULL);
2701 KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2702 ("invalid trkcnt refs %d", trk->tc_refs));
2703 atomic_add_int(&trk->tc_refs, 1);
2704 IPFW_TRKCNT_TOKREL;
2705 } else {
2706 install:
2707 trk->tc_key = t->t_key;
2708 trk->tc_ruleid = rule->track_ruleid;
2709 trk->tc_refs = 0;
2710 trk->tc_count = 0;
2711 trk->tc_expire = 0;
2712 trk->tc_rulenum = rule->rulenum;
2714 IPFW_TRKCNT_TOKGET;
2715 ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2716 trk);
2717 if (ret != NULL) {
2718 KASSERT(ret->tc_refs > 0 &&
2719 ret->tc_refs < netisr_ncpus,
2720 ("invalid trkcnt refs %d", ret->tc_refs));
2721 KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2722 ("trkcnt spare was installed"));
2723 ctx->ipfw_trkcnt_spare = trk;
2724 trk = ret;
2725 } else {
2726 ipfw_gd.ipfw_trkcnt_cnt++;
2728 atomic_add_int(&trk->tc_refs, 1);
2729 IPFW_TRKCNT_TOKREL;
2731 t->t_count = &trk->tc_count;
2732 t->t_trkcnt = trk;
2734 dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2735 if (dup != NULL)
2736 panic("ipfw: track exists");
2737 TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2738 done:
2739 t->t_expire = time_uptime + dyn_short_lifetime;
2740 return (t);
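/*
 * Illustrative note, not part of the original source: tracks live in
 * per-CPU trees (ctx->ipfw_track_tree), but the connection counter
 * they point at (struct ipfw_trkcnt) is global and shared by the
 * sibling tracks of the same flow on every CPU.  That is why
 * t->t_count is a pointer, why trkcnt refcounting uses atomic ops,
 * and why the global ipfw_trkcnt_tree is only touched under
 * IPFW_TRKCNT_TOKGET/IPFW_TRKCNT_TOKREL.
 */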
2744 /* Install state for rule type cmd->o.opcode.
2746 * Returns NULL if the state is not installed because of errors or
2747 * because state limits are enforced. */
2749 static struct ipfw_state *
2750 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2751 ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2753 struct ipfw_state *s;
2754 struct ipfw_track *t;
2755 int count, diff;
2757 if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2758 (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2759 boolean_t overflow = TRUE;
2761 ctx->ipfw_sts_reap++;
2762 if (ipfw_state_reap(ctx, diff) == 0)
2763 ctx->ipfw_sts_reapfailed++;
2764 if (ipfw_state_cntsync() < ipfw_state_max)
2765 overflow = FALSE;
2767 if (overflow) {
2768 time_t globexp, uptime;
2769 int cpu;
2772 /* Do not expire globally more than once per second,
2773 * it is useless. */
2775 uptime = time_uptime;
2776 globexp = ipfw_gd.ipfw_state_globexp;
2777 if (globexp == uptime ||
2778 !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2779 globexp, uptime)) {
2780 ctx->ipfw_sts_overflow++;
2781 return (NULL);
2784 /* Expire states on other CPUs. */
2785 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2786 if (cpu == mycpuid)
2787 continue;
2788 lwkt_send_ipiq(globaldata_find(cpu),
2789 ipfw_state_expire_ipifunc, NULL);
2791 ctx->ipfw_sts_overflow++;
2792 return (NULL);
2796 switch (cmd->o.opcode) {
2797 case O_KEEP_STATE: /* bidir rule */
2798 case O_REDIRECT:
2799 s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
2800 tcp);
2801 if (s == NULL)
2802 return (NULL);
2803 break;
2805 case O_LIMIT: /* limit number of sessions */
2806 t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2807 if (t == NULL)
2808 return (NULL);
2810 if (*t->t_count >= cmd->conn_limit) {
2811 if (!ipfw_track_state_expire(ctx, t, TRUE))
2812 return (NULL);
2814 for (;;) {
2815 count = *t->t_count;
2816 if (count >= cmd->conn_limit)
2817 return (NULL);
2818 if (atomic_cmpset_int(t->t_count, count, count + 1))
2819 break;
2822 s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2823 if (s == NULL) {
2824 /* Undo damage. */
2825 atomic_subtract_int(t->t_count, 1);
2826 return (NULL);
2828 break;
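/*
 * Illustrative note, not part of the original source: the loop above
 * is the classic compare-and-swap idiom, atomically performing
 * "increment *t->t_count unless it already reached conn_limit"
 * without holding a lock across the check and the update.
 */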
2830 default:
2831 panic("unknown state type %u\n", cmd->o.opcode);
2834 if (s->st_type == O_REDIRECT) {
2835 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2836 ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;
2838 x->xlat_addr = r->addr.s_addr;
2839 x->xlat_port = r->port;
2840 x->xlat_ifp = args->m->m_pkthdr.rcvif;
2841 x->xlat_dir = MATCH_FORWARD;
2842 KKASSERT(x->xlat_ifp != NULL);
2844 return (s);
2847 static int
2848 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2849 const struct in_addr *in)
2851 struct radix_node_head *rnh;
2852 struct sockaddr_in sin;
2853 struct ipfw_tblent *te;
2855 KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2856 rnh = ctx->ipfw_tables[tableid];
2857 if (rnh == NULL)
2858 return (0); /* no match */
2860 memset(&sin, 0, sizeof(sin));
2861 sin.sin_family = AF_INET;
2862 sin.sin_len = sizeof(sin);
2863 sin.sin_addr = *in;
2865 te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2866 if (te == NULL)
2867 return (0); /* no match */
2869 te->te_use++;
2870 te->te_lastuse = time_second;
2871 return (1); /* match */
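/*
 * Illustrative note, not part of the original source: a lookup table
 * is a radix tree keyed by struct sockaddr_in, the same machinery the
 * routing code uses, so rnh_matchaddr() yields a longest-prefix
 * match.  With both 10.0.0.0/8 and 10.1.0.0/16 in a table, looking
 * up 10.1.2.3 hits the /16 entry while 10.2.3.4 falls back to the /8.
 */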
2875 /* Transmit a TCP packet, containing either a RST or a keepalive.
2876 * When flags & TH_RST, we are sending a RST packet, because a
2877 * "reset" action matched the packet.
2878 * Otherwise we are sending a keepalive, and flags & TH_SYN selects its direction (see below).
2880 * Only {src,dst}_{ip,port} of "id" are used. */
2882 static void
2883 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2885 struct mbuf *m;
2886 struct ip *ip;
2887 struct tcphdr *tcp;
2888 struct route sro; /* fake route */
2890 MGETHDR(m, M_NOWAIT, MT_HEADER);
2891 if (m == NULL)
2892 return;
2893 m->m_pkthdr.rcvif = NULL;
2894 m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2895 m->m_data += max_linkhdr;
2897 ip = mtod(m, struct ip *);
2898 bzero(ip, m->m_len);
2899 tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2900 ip->ip_p = IPPROTO_TCP;
2901 tcp->th_off = 5;
2904 /* Assume we are sending a RST (or a keepalive in the reverse
2905 * direction); swap source and destination addresses and ports. */
2907 ip->ip_src.s_addr = htonl(id->dst_ip);
2908 ip->ip_dst.s_addr = htonl(id->src_ip);
2909 tcp->th_sport = htons(id->dst_port);
2910 tcp->th_dport = htons(id->src_port);
2911 if (flags & TH_RST) { /* we are sending a RST */
2912 if (flags & TH_ACK) {
2913 tcp->th_seq = htonl(ack);
2914 tcp->th_ack = htonl(0);
2915 tcp->th_flags = TH_RST;
2916 } else {
2917 if (flags & TH_SYN)
2918 seq++;
2919 tcp->th_seq = htonl(0);
2920 tcp->th_ack = htonl(seq);
2921 tcp->th_flags = TH_RST | TH_ACK;
2923 } else {
2925 /* We are sending a keepalive. flags & TH_SYN determines
2926 * the direction, forward if set, reverse if clear.
2927 * NOTE: seq and ack are always assumed to be correct
2928 * as set by the caller. This may be confusing... */
2930 if (flags & TH_SYN) {
2932 /* we have to rewrite the correct addresses! */
2934 ip->ip_dst.s_addr = htonl(id->dst_ip);
2935 ip->ip_src.s_addr = htonl(id->src_ip);
2936 tcp->th_dport = htons(id->dst_port);
2937 tcp->th_sport = htons(id->src_port);
2939 tcp->th_seq = htonl(seq);
2940 tcp->th_ack = htonl(ack);
2941 tcp->th_flags = TH_ACK;
2945 /* set ip_len to the payload size so we can compute
2946 * the tcp checksum on the pseudoheader
2947 * XXX check this, could save a couple of words ? */
2949 ip->ip_len = htons(sizeof(struct tcphdr));
2950 tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2953 /* now fill fields left out earlier */
2955 ip->ip_ttl = ip_defttl;
2956 ip->ip_len = m->m_pkthdr.len;
2958 bzero(&sro, sizeof(sro));
2959 ip_rtaddr(ip->ip_dst, &sro);
2961 m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2962 ip_output(m, NULL, &sro, 0, NULL, NULL);
2963 if (sro.ro_rt)
2964 RTFREE(sro.ro_rt);
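/*
 * Illustrative note, not part of the original source: ip_len is
 * assigned twice above on purpose.  in_cksum() over the TCP
 * pseudo-header wants ip_len to hold just the TCP length (as set via
 * htons(sizeof(struct tcphdr))), while ip_output() wants the real
 * total length, so the field is rewritten once the checksum is done.
 */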
2968 /* Send a reject message, consuming the mbuf passed as an argument. */
2970 static void
2971 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2973 if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2974 /* We need the IP header in host order for icmp_error(). */
2975 if (args->eh != NULL) {
2976 struct ip *ip = mtod(args->m, struct ip *);
2978 ip->ip_len = ntohs(ip->ip_len);
2979 ip->ip_off = ntohs(ip->ip_off);
2981 icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2982 } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2983 struct tcphdr *const tcp =
2984 L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2986 if ((tcp->th_flags & TH_RST) == 0) {
2987 send_pkt(&args->f_id, ntohl(tcp->th_seq),
2988 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2990 m_freem(args->m);
2991 } else {
2992 m_freem(args->m);
2994 args->m = NULL;
2998 /* Given an ip_fw *, lookup_next_rule will return a pointer
2999 * to the next rule, which can be either the jump
3000 * target (for skipto instructions) or the next one in the list (in
3001 * all other cases including a missing jump target).
3002 * The result is also written in the "next_rule" field of the rule.
3003 * Backward jumps are not allowed, so start looking from the next
3004 * rule...
3006 * This never returns NULL -- in case we do not have an exact match,
3007 * the next rule is returned. When the ruleset is changed,
3008 * pointers are flushed so we are always correct. */
3010 static struct ip_fw *
3011 lookup_next_rule(struct ip_fw *me)
3013 struct ip_fw *rule = NULL;
3014 ipfw_insn *cmd;
3016 /* look for action, in case it is a skipto */
3017 cmd = ACTION_PTR(me);
3018 if (cmd->opcode == O_LOG)
3019 cmd += F_LEN(cmd);
3020 if (cmd->opcode == O_SKIPTO) {
3021 for (rule = me->next; rule; rule = rule->next) {
3022 if (rule->rulenum >= cmd->arg1)
3023 break;
3026 if (rule == NULL) /* failure or not a skipto */
3027 rule = me->next;
3028 me->next_rule = rule;
3029 return rule;
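/*
 * Illustrative example, not part of the original source: with rules
 * 100 "skipto 300", 200, 300 and 400, the first packet processed at
 * rule 100 walks forward from rule 200 until rulenum >= 300 and
 * caches that pointer in next_rule, so later packets jump straight
 * to rule 300.  If rule 300 is then deleted, the cached pointers are
 * flushed and the next lookup resolves to rule 400, the first rule
 * numbered >= 300.
 */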
3032 static int
3033 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3034 enum ipfw_opcodes opcode, uid_t uid)
3036 struct in_addr src_ip, dst_ip;
3037 struct inpcbinfo *pi;
3038 boolean_t wildcard;
3039 struct inpcb *pcb;
3041 if (fid->proto == IPPROTO_TCP) {
3042 wildcard = FALSE;
3043 pi = &tcbinfo[mycpuid];
3044 } else if (fid->proto == IPPROTO_UDP) {
3045 wildcard = TRUE;
3046 pi = &udbinfo[mycpuid];
3047 } else {
3048 return 0;
3052 /* Values in 'fid' are in host byte order. */
3054 dst_ip.s_addr = htonl(fid->dst_ip);
3055 src_ip.s_addr = htonl(fid->src_ip);
3056 if (oif) {
3057 pcb = in_pcblookup_hash(pi,
3058 dst_ip, htons(fid->dst_port),
3059 src_ip, htons(fid->src_port),
3060 wildcard, oif);
3061 } else {
3062 pcb = in_pcblookup_hash(pi,
3063 src_ip, htons(fid->src_port),
3064 dst_ip, htons(fid->dst_port),
3065 wildcard, NULL);
3067 if (pcb == NULL || pcb->inp_socket == NULL)
3068 return 0;
3070 if (opcode == O_UID) {
3071 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
3072 return !socheckuid(pcb->inp_socket, uid);
3073 #undef socheckuid
3074 } else {
3075 return groupmember(uid, pcb->inp_socket->so_cred);
3079 static int
3080 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
3083 if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
3084 struct ifaddr_container *ifac;
3085 struct ifnet *ifp;
3087 ifp = ifunit_netisr(cmd->ifname);
3088 if (ifp == NULL)
3089 return (0);
3091 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
3092 struct ifaddr *ia = ifac->ifa;
3094 if (ia->ifa_addr == NULL)
3095 continue;
3096 if (ia->ifa_addr->sa_family != AF_INET)
3097 continue;
3099 cmd->mask.s_addr = INADDR_ANY;
3100 if (cmd->o.arg1 & IPFW_IFIP_NET) {
3101 cmd->mask = ((struct sockaddr_in *)
3102 ia->ifa_netmask)->sin_addr;
3104 if (cmd->mask.s_addr == INADDR_ANY)
3105 cmd->mask.s_addr = INADDR_BROADCAST;
3107 cmd->addr =
3108 ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
3109 cmd->addr.s_addr &= cmd->mask.s_addr;
3111 cmd->o.arg1 |= IPFW_IFIP_VALID;
3112 break;
3114 if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
3115 return (0);
3117 return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
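/*
 * Illustrative example, not part of the original source: with
 * IPFW_IFIP_NET set and an interface address of 192.168.1.5/24, the
 * cached pair becomes addr = 192.168.1.0, mask = 255.255.255.0, so
 * any 192.168.1.x address matches.  Without IPFW_IFIP_NET the mask
 * defaults to INADDR_BROADCAST and only 192.168.1.5 itself matches.
 */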
3120 static void
3121 ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
3122 struct in_addr *old_addr, uint16_t *old_port)
3124 struct ip *ip = mtod(m, struct ip *);
3125 struct in_addr *addr;
3126 uint16_t *port, *csum, dlen = 0;
3127 uint8_t udp = 0;
3128 boolean_t pseudo = FALSE;
3130 if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
3131 addr = &ip->ip_src;
3132 switch (ip->ip_p) {
3133 case IPPROTO_TCP:
3134 port = &L3HDR(struct tcphdr, ip)->th_sport;
3135 csum = &L3HDR(struct tcphdr, ip)->th_sum;
3136 break;
3137 case IPPROTO_UDP:
3138 port = &L3HDR(struct udphdr, ip)->uh_sport;
3139 csum = &L3HDR(struct udphdr, ip)->uh_sum;
3140 udp = 1;
3141 break;
3142 default:
3143 panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
3145 } else {
3146 addr = &ip->ip_dst;
3147 switch (ip->ip_p) {
3148 case IPPROTO_TCP:
3149 port = &L3HDR(struct tcphdr, ip)->th_dport;
3150 csum = &L3HDR(struct tcphdr, ip)->th_sum;
3151 break;
3152 case IPPROTO_UDP:
3153 port = &L3HDR(struct udphdr, ip)->uh_dport;
3154 csum = &L3HDR(struct udphdr, ip)->uh_sum;
3155 udp = 1;
3156 break;
3157 default:
3158 panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
3161 if (old_addr != NULL)
3162 *old_addr = *addr;
3163 if (old_port != NULL) {
3164 if (x->xlat_port != 0)
3165 *old_port = *port;
3166 else
3167 *old_port = 0;
3170 if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
3171 if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
3172 dlen = ip->ip_len - (ip->ip_hl << 2);
3173 pseudo = TRUE;
3176 if (!pseudo) {
3177 const uint16_t *oaddr, *naddr;
3179 oaddr = (const uint16_t *)&addr->s_addr;
3180 naddr = (const uint16_t *)&x->xlat_addr;
3182 ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
3183 oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
3184 *csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
3185 oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
3187 addr->s_addr = x->xlat_addr;
3189 if (x->xlat_port != 0) {
3190 if (!pseudo) {
3191 *csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
3192 udp);
3194 *port = x->xlat_port;
3197 if (pseudo) {
3198 *csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
3199 htons(dlen + ip->ip_p));
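/*
 * Illustrative note, not part of the original source: when the NIC is
 * not computing the checksum (pseudo == FALSE), the rewrite relies on
 * RFC 1624 style incremental updates, HC' = ~(~HC + ~m + m'), folding
 * in only the 16-bit words that actually changed instead of summing
 * the whole packet again:
 */
#if 0
/* Rewriting a single port, e.g. 80 -> 8080, costs one fixup call: */
*csum = pfil_cksum_fixup(*csum, htons(80), htons(8080), udp);
/* A 32-bit address is fixed up as its two 16-bit halves, as above. */
#endif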
3203 static void
3204 ipfw_ip_xlate_dispatch(netmsg_t nmsg)
3206 struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
3207 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3208 struct mbuf *m = nm->m;
3209 struct ipfw_xlat *x = nm->arg1;
3210 struct ip_fw *rule = x->xlat_rule;
3212 ASSERT_NETISR_NCPUS(mycpuid);
3213 KASSERT(rule->cpuid == mycpuid,
3214 ("rule does not belong to cpu%d", mycpuid));
3215 KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
3216 ("mbuf does not have ipfw continue rule"));
3218 KASSERT(ctx->ipfw_cont_rule == NULL,
3219 ("pending ipfw continue rule"));
3220 KASSERT(ctx->ipfw_cont_xlat == NULL,
3221 ("pending ipfw continue xlat"));
3222 ctx->ipfw_cont_rule = rule;
3223 ctx->ipfw_cont_xlat = x;
3225 if (nm->arg2 == 0)
3226 ip_input(m);
3227 else
3228 ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
3230 /* May not be cleared if ipfw was unloaded/disabled. */
3231 ctx->ipfw_cont_rule = NULL;
3232 ctx->ipfw_cont_xlat = NULL;
3235 /* This state is no longer used; decrement its xlat_crefs,
3236 * so this state can be deleted. */
3238 x->xlat_crefs--;
3240 /* This rule is no longer used; decrement its cross_refs,
3241 * so this rule can be deleted.
3243 * NOTE:
3244 * Decrement cross_refs as the last step of this function,
3245 * so that the module can be unloaded safely. */
3247 rule->cross_refs--;
3250 static void
3251 ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
3252 uint32_t flags)
3254 struct netmsg_genpkt *nm;
3256 KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
3257 x->xlat_pcpu, cpuid));
3260 /* Bump cross_refs to prevent this rule and its siblings
3261 * from being deleted while this mbuf is in flight. The
3262 * cross_refs of the sibling rule on the target cpu will
3263 * be decremented once this mbuf is filtered
3264 * on the target cpu. */
3266 x->xlat_rule->cross_refs++;
3268 /* Bump xlat_crefs to prevent this state and its paired
3269 * state from being deleted while this mbuf is in flight.
3270 * The xlat_crefs of the paired state on the target cpu
3271 * will be decremented once this mbuf is
3272 * filtered on the target cpu. */
3274 x->xlat_crefs++;
3276 m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
3277 if (flags & IPFW_XLATE_INSERT)
3278 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
3279 if (flags & IPFW_XLATE_FORWARD)
3280 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;
3282 if ((flags & IPFW_XLATE_OUTPUT) == 0) {
3283 struct ip *ip = mtod(m, struct ip *);
3286 /* NOTE:
3287 * ip_input() expects ip_len/ip_off to be in network
3288 * byte order. */
3290 ip->ip_len = htons(ip->ip_len);
3291 ip->ip_off = htons(ip->ip_off);
3294 nm = &m->m_hdr.mh_genmsg;
3295 netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
3296 ipfw_ip_xlate_dispatch);
3297 nm->m = m;
3298 nm->arg1 = x->xlat_pair;
3299 nm->arg2 = 0;
3300 if (flags & IPFW_XLATE_OUTPUT)
3301 nm->arg2 = 1;
3302 netisr_sendmsg(&nm->base, cpuid);
3305 static struct mbuf *
3306 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3307 struct ip_fw_local *local, struct ip **ip0)
3309 struct ip *ip = mtod(m, struct ip *);
3310 struct tcphdr *tcp;
3311 struct udphdr *udp;
3314 /* Collect parameters into local variables for faster matching. */
3316 if (hlen == 0) { /* do not grab addresses for non-ip pkts */
3317 local->proto = args->f_id.proto = 0; /* mark f_id invalid */
3318 goto done;
3321 local->proto = args->f_id.proto = ip->ip_p;
3322 local->src_ip = ip->ip_src;
3323 local->dst_ip = ip->ip_dst;
3324 if (args->eh != NULL) { /* layer 2 packets are as on the wire */
3325 local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
3326 local->ip_len = ntohs(ip->ip_len);
3327 } else {
3328 local->offset = ip->ip_off & IP_OFFMASK;
3329 local->ip_len = ip->ip_len;
3332 #define PULLUP_TO(len) \
3333 do { \
3334 if (m->m_len < (len)) { \
3335 args->m = m = m_pullup(m, (len)); \
3336 if (m == NULL) { \
3337 ip = NULL; \
3338 goto done; \
3339 } \
3340 ip = mtod(m, struct ip *); \
3341 } \
3342 } while (0)
3344 if (local->offset == 0) {
3345 switch (local->proto) {
3346 case IPPROTO_TCP:
3347 PULLUP_TO(hlen + sizeof(struct tcphdr));
3348 local->tcp = tcp = L3HDR(struct tcphdr, ip);
3349 local->dst_port = tcp->th_dport;
3350 local->src_port = tcp->th_sport;
3351 args->f_id.flags = tcp->th_flags;
3352 break;
3354 case IPPROTO_UDP:
3355 PULLUP_TO(hlen + sizeof(struct udphdr));
3356 udp = L3HDR(struct udphdr, ip);
3357 local->dst_port = udp->uh_dport;
3358 local->src_port = udp->uh_sport;
3359 break;
3361 case IPPROTO_ICMP:
3362 PULLUP_TO(hlen + 4); /* type, code and checksum. */
3363 args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
3364 break;
3366 default:
3367 break;
3371 #undef PULLUP_TO
3373 args->f_id.src_ip = ntohl(local->src_ip.s_addr);
3374 args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
3375 args->f_id.src_port = local->src_port = ntohs(local->src_port);
3376 args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
3377 done:
3378 *ip0 = ip;
3379 return (m);
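/*
 * Illustrative note, not part of the original source: PULLUP_TO()
 * reloads 'ip' after every m_pullup() because m_pullup() may return a
 * different mbuf, or NULL on failure after freeing the chain, which
 * is also why args->m is refreshed.  Typical use inside the switch
 * above:
 */
#if 0
PULLUP_TO(hlen + sizeof(struct tcphdr));  /* make the TCP header contiguous */
tcp = L3HDR(struct tcphdr, ip);           /* 'ip' is valid again here */
#endif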
3382 static struct mbuf *
3383 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3384 struct ip_fw_local *local, struct ip **ip0)
3386 struct ip *ip = mtod(m, struct ip *);
3388 ip->ip_len = htons(ip->ip_len);
3389 ip->ip_off = htons(ip->ip_off);
3391 m->m_flags &= ~M_HASH;
3392 ip_hashfn(&m, 0);
3393 args->m = m;
3394 if (m == NULL) {
3395 *ip0 = NULL;
3396 return (NULL);
3398 KASSERT(m->m_flags & M_HASH, ("no hash"));
3400 /* 'm' might be changed by ip_hashfn(). */
3401 ip = mtod(m, struct ip *);
3402 ip->ip_len = ntohs(ip->ip_len);
3403 ip->ip_off = ntohs(ip->ip_off);
3405 return (ipfw_setup_local(m, hlen, args, local, ip0));
3409 /* The main check routine for the firewall.
3411 * All arguments are in args so we can modify them and return them
3412 * back to the caller.
3414 * Parameters:
3416 * args->m (in/out) The packet; we set to NULL when/if we nuke it.
3417 * Starts with the IP header.
3418 * args->eh (in) Mac header if present, or NULL for layer3 packet.
3419 * args->oif (in) Outgoing interface, or NULL if packet is incoming.
3420 * The incoming interface is in the mbuf.
3422 * args->rule (in/out) Pointer to the last matching rule.
3423 * args->f_id (out) Addresses grabbed from the packet.
3425 * Return value:
3427 * If the packet was denied/rejected and has been dropped, *m is equal
3428 * to NULL upon return.
3430 * IP_FW_DENY The packet must be dropped.
3431 * IP_FW_PASS The packet is to be accepted and routed normally.
3432 * IP_FW_DIVERT Divert the packet to port (args->cookie).
3433 * IP_FW_TEE Tee the packet to port (args->cookie).
3434 * IP_FW_DUMMYNET Send the packet to pipe/queue (args->cookie).
3435 * IP_FW_CONTINUE Continue processing on another cpu. */
3437 static int
3438 ipfw_chk(struct ip_fw_args *args)
3441 /* Local variables hold state during the processing of a packet.
3443 * IMPORTANT NOTE: to speed up the processing of rules, there
3444 * are some assumptions on the values of the variables, which
3445 * are documented here. Should you change them, please check
3446 * the implementation of the various instructions to make sure
3447 * that they still work.
3449 * args->eh The MAC header. It is non-NULL for a layer-2
3450 * packet, and NULL for a layer-3 packet.
3452 * m | args->m Pointer to the mbuf, as received from the caller.
3453 * It may change if ipfw_chk() does an m_pullup, or if it
3454 * consumes the packet because it calls send_reject().
3455 * XXX This has to change, so that ipfw_chk() never modifies
3456 * or consumes the buffer.
3457 * ip is simply an alias of the value of m, and it is kept
3458 * in sync with it (the packet is supposed to start with
3459 * the ip header). */
3461 struct mbuf *m = args->m;
3462 struct ip *ip = mtod(m, struct ip *);
3465 /* oif | args->oif If NULL, ipfw_chk has been called on the
3466 * inbound path (ether_input, ip_input).
3467 * If non-NULL, ipfw_chk has been called on the outbound path
3468 * (ether_output, ip_output). */
3470 struct ifnet *oif = args->oif;
3472 struct ip_fw *f = NULL; /* matching rule */
3473 int retval = IP_FW_PASS;
3474 struct m_tag *mtag;
3475 struct divert_info *divinfo;
3476 struct ipfw_state *s;
3479 /* hlen The length of the IPv4 header.
3480 * hlen >0 means we have an IPv4 packet. */
3482 u_int hlen = 0; /* hlen >0 means we have an IP pkt */
3484 struct ip_fw_local lc;
3487 /* dyn_dir = MATCH_UNKNOWN when rules unchecked,
3488 * MATCH_NONE when checked and not matched (dyn_f = NULL),
3489 * MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL) */
3491 int dyn_dir = MATCH_UNKNOWN;
3492 struct ip_fw *dyn_f = NULL;
3493 int cpuid = mycpuid;
3494 struct ipfw_context *ctx;
3496 ASSERT_NETISR_NCPUS(cpuid);
3497 ctx = ipfw_ctx[cpuid];
3499 if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3500 return IP_FW_PASS; /* accept */
3502 if (args->eh == NULL || /* layer 3 packet */
3503 (m->m_pkthdr.len >= sizeof(struct ip) &&
3504 ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3505 hlen = ip->ip_hl << 2;
3507 memset(&lc, 0, sizeof(lc));
3509 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3510 if (m == NULL)
3511 goto pullup_failed;
3513 if (args->rule) {
3515 /* Packet has already been tagged. Look for the next rule
3516 * to restart processing.
3518 * If fw_one_pass != 0 then just accept it.
3519 * XXX should not happen here, but optimized out in
3520 * the caller. */
3522 if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3523 return IP_FW_PASS;
3524 args->flags &= ~IP_FWARG_F_CONT;
3526 /* This rule is being/has been flushed */
3527 if (ipfw_flushing)
3528 return IP_FW_DENY;
3530 KASSERT(args->rule->cpuid == cpuid,
3531 ("rule used on cpu%d", cpuid));
3533 /* This rule was deleted */
3534 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3535 return IP_FW_DENY;
3537 if (args->xlat != NULL) {
3538 struct ipfw_xlat *x = args->xlat;
3540 /* This xlat is being deleted. */
3541 if (x->xlat_invalid)
3542 return IP_FW_DENY;
3544 f = args->rule;
3546 dyn_f = f;
3547 dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3548 MATCH_FORWARD : MATCH_REVERSE;
3550 if (args->flags & IP_FWARG_F_XLATINS) {
3551 KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3552 ("not slave %u state", x->xlat_type));
3553 s = ipfw_state_link(ctx, &x->xlat_st);
3554 if (s != NULL) {
3555 ctx->ipfw_xlate_conflicts++;
3556 if (IPFW_STATE_ISDEAD(s)) {
3557 ipfw_state_remove(ctx, s);
3558 s = ipfw_state_link(ctx,
3559 &x->xlat_st);
3561 if (s != NULL) {
3562 if (bootverbose) {
3563 kprintf("ipfw: "
3564 "slave %u state "
3565 "conflicts %u state\n",
3566 x->xlat_type,
3567 s->st_type);
3569 ipfw_xlat_invalidate(x);
3570 return IP_FW_DENY;
3572 ctx->ipfw_xlate_cresolved++;
3574 } else {
3575 ipfw_state_update(&args->f_id, dyn_dir,
3576 lc.tcp, &x->xlat_st);
3578 } else {
3579 /* TODO: setup dyn_f, dyn_dir */
3581 f = args->rule->next_rule;
3582 if (f == NULL)
3583 f = lookup_next_rule(args->rule);
3585 } else {
3587 /* Find the starting rule. It can be either the first
3588 * one, or the one after divert_rule if so requested. */
3590 int skipto;
3592 KKASSERT((args->flags &
3593 (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3594 KKASSERT(args->xlat == NULL);
3596 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3597 if (mtag != NULL) {
3598 divinfo = m_tag_data(mtag);
3599 skipto = divinfo->skipto;
3600 } else {
3601 skipto = 0;
3604 f = ctx->ipfw_layer3_chain;
3605 if (args->eh == NULL && skipto != 0) {
3606 /* No skipto during rule flushing */
3607 if (ipfw_flushing)
3608 return IP_FW_DENY;
3610 if (skipto >= IPFW_DEFAULT_RULE)
3611 return IP_FW_DENY; /* invalid */
3613 while (f && f->rulenum <= skipto)
3614 f = f->next;
3615 if (f == NULL) /* drop packet */
3616 return IP_FW_DENY;
3617 } else if (ipfw_flushing) {
3618 /* Rules are being flushed; skip to default rule */
3619 f = ctx->ipfw_default_rule;
3622 if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3623 m_tag_delete(m, mtag);
3626 /* Now scan the rules, and parse microinstructions for each rule. */
3628 for (; f; f = f->next) {
3629 int l, cmdlen;
3630 ipfw_insn *cmd;
3631 int skip_or; /* skip rest of OR block */
3633 again:
3634 if (ctx->ipfw_set_disable & (1 << f->set)) {
3635 args->xlat = NULL;
3636 continue;
3639 if (args->xlat != NULL) {
3640 args->xlat = NULL;
3641 l = f->cmd_len - f->act_ofs;
3642 cmd = ACTION_PTR(f);
3643 } else {
3644 l = f->cmd_len;
3645 cmd = f->cmd;
3648 skip_or = 0;
3649 for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3650 int match;
3653 /* check_body is a jump target used when we find a
3654 * CHECK_STATE, and need to jump to the body of
3655 * the target rule. */
3657 check_body:
3658 cmdlen = F_LEN(cmd);
3660 /* An OR block (insn_1 || .. || insn_n) has the
3661 * F_OR bit set in all but the last instruction.
3662 * The first match will set "skip_or", and cause
3663 * the following instructions to be skipped until
3664 * past the one with the F_OR bit clear. */
3666 if (skip_or) { /* skip this instruction */
3667 if ((cmd->len & F_OR) == 0)
3668 skip_or = 0; /* next one is good */
3669 continue;
3671 match = 0; /* set to 1 if we succeed */
3673 switch (cmd->opcode) {
3675 /* The first set of opcodes compares the packet's
3676 * fields with some pattern, setting 'match' if a
3677 * match is found. At the end of the loop there is
3678 * logic to deal with F_NOT and F_OR flags associated
3679 * with the opcode. */
3681 case O_NOP:
3682 match = 1;
3683 break;
3685 case O_FORWARD_MAC:
3686 kprintf("ipfw: opcode %d unimplemented\n",
3687 cmd->opcode);
3688 break;
3690 case O_GID:
3691 case O_UID:
3693 /* We only check offset == 0 && proto != 0,
3694 * as this ensures that we have an IPv4
3695 * packet with the ports info. */
3697 if (lc.offset != 0)
3698 break;
3700 match = ipfw_match_uid(&args->f_id, oif,
3701 cmd->opcode,
3702 (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3703 break;
3705 case O_RECV:
3706 match = iface_match(m->m_pkthdr.rcvif,
3707 (ipfw_insn_if *)cmd);
3708 break;
3710 case O_XMIT:
3711 match = iface_match(oif, (ipfw_insn_if *)cmd);
3712 break;
3714 case O_VIA:
3715 match = iface_match(oif ? oif :
3716 m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3717 break;
3719 case O_MACADDR2:
3720 if (args->eh != NULL) { /* have MAC header */
3721 uint32_t *want = (uint32_t *)
3722 ((ipfw_insn_mac *)cmd)->addr;
3723 uint32_t *mask = (uint32_t *)
3724 ((ipfw_insn_mac *)cmd)->mask;
3725 uint32_t *hdr = (uint32_t *)args->eh;
3727 match =
3728 (want[0] == (hdr[0] & mask[0]) &&
3729 want[1] == (hdr[1] & mask[1]) &&
3730 want[2] == (hdr[2] & mask[2]));
3732 break;
3734 case O_MAC_TYPE:
3735 if (args->eh != NULL) {
3736 uint16_t t =
3737 ntohs(args->eh->ether_type);
3738 uint16_t *p =
3739 ((ipfw_insn_u16 *)cmd)->ports;
3740 int i;
3742 /* Special vlan handling */
3743 if (m->m_flags & M_VLANTAG)
3744 t = ETHERTYPE_VLAN;
3746 for (i = cmdlen - 1; !match && i > 0;
3747 i--, p += 2) {
3748 match =
3749 (t >= p[0] && t <= p[1]);
3752 break;
3754 case O_FRAG:
3755 match = (hlen > 0 && lc.offset != 0);
3756 break;
3758 case O_IPFRAG:
3759 if (hlen > 0) {
3760 uint16_t off;
3762 if (args->eh != NULL)
3763 off = ntohs(ip->ip_off);
3764 else
3765 off = ip->ip_off;
3766 if (off & (IP_MF | IP_OFFMASK))
3767 match = 1;
3769 break;
3771 case O_IN: /* "out" is "not in" */
3772 match = (oif == NULL);
3773 break;
3775 case O_LAYER2:
3776 match = (args->eh != NULL);
3777 break;
3779 case O_PROTO:
3781 /* We do not allow an arg of 0, so checking
3782 * "proto" alone suffices. */
3784 match = (lc.proto == cmd->arg1);
3785 break;
3787 case O_IP_SRC:
3788 match = (hlen > 0 &&
3789 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3790 lc.src_ip.s_addr);
3791 break;
3793 case O_IP_SRC_MASK:
3794 match = (hlen > 0 &&
3795 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3796 (lc.src_ip.s_addr &
3797 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3798 break;
3800 case O_IP_SRC_ME:
3801 if (hlen > 0) {
3802 struct ifnet *tif;
3804 tif = INADDR_TO_IFP(&lc.src_ip);
3805 match = (tif != NULL);
3807 break;
3809 case O_IP_SRC_TABLE:
3810 match = ipfw_table_lookup(ctx, cmd->arg1,
3811 &lc.src_ip);
3812 break;
3814 case O_IP_SRC_IFIP:
3815 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3816 &lc.src_ip);
3817 break;
3819 case O_IP_DST_SET:
3820 case O_IP_SRC_SET:
3821 if (hlen > 0) {
3822 uint32_t *d = (uint32_t *)(cmd + 1);
3823 uint32_t addr =
3824 cmd->opcode == O_IP_DST_SET ?
3825 args->f_id.dst_ip :
3826 args->f_id.src_ip;
3828 if (addr < d[0])
3829 break;
3830 addr -= d[0]; /* subtract base */
3831 match =
3832 (addr < cmd->arg1) &&
3833 (d[1 + (addr >> 5)] &
3834 (1 << (addr & 0x1f)));
3836 break;
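/*
 * Illustrative example, not part of the original source, for the
 * O_IP_SRC_SET/O_IP_DST_SET bitmap above: d[0] holds the base address
 * and arg1 the set width, one bit per host.  For base 10.0.0.0 with
 * arg1 = 256 (i.e. 10.0.0.0/24), address 10.0.0.70 gives addr = 70,
 * so the word tested is d[1 + (70 >> 5)] = d[3] and the bit is
 * 1 << (70 & 0x1f) = 1 << 6.
 */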
3838 case O_IP_DST:
3839 match = (hlen > 0 &&
3840 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3841 lc.dst_ip.s_addr);
3842 break;
3844 case O_IP_DST_MASK:
3845 match = (hlen > 0) &&
3846 (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3847 (lc.dst_ip.s_addr &
3848 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3849 break;
3851 case O_IP_DST_ME:
3852 if (hlen > 0) {
3853 struct ifnet *tif;
3855 tif = INADDR_TO_IFP(&lc.dst_ip);
3856 match = (tif != NULL);
3858 break;
3860 case O_IP_DST_TABLE:
3861 match = ipfw_table_lookup(ctx, cmd->arg1,
3862 &lc.dst_ip);
3863 break;
3865 case O_IP_DST_IFIP:
3866 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3867 &lc.dst_ip);
3868 break;
3870 case O_IP_SRCPORT:
3871 case O_IP_DSTPORT:
3873 /* offset == 0 && proto != 0 is enough
3874 * to guarantee that we have an IPv4
3875 * packet with port info. */
3877 if ((lc.proto==IPPROTO_UDP ||
3878 lc.proto==IPPROTO_TCP)
3879 && lc.offset == 0) {
3880 uint16_t x =
3881 (cmd->opcode == O_IP_SRCPORT) ?
3882 lc.src_port : lc.dst_port;
3883 uint16_t *p =
3884 ((ipfw_insn_u16 *)cmd)->ports;
3885 int i;
3887 for (i = cmdlen - 1; !match && i > 0;
3888 i--, p += 2) {
3889 match =
3890 (x >= p[0] && x <= p[1]);
3893 break;
3895 case O_ICMPCODE:
3896 match = (lc.offset == 0 &&
3897 lc.proto==IPPROTO_ICMP &&
3898 icmpcode_match(ip, (ipfw_insn_u32 *)cmd));
3899 break;
3901 case O_ICMPTYPE:
3902 match = (lc.offset == 0 &&
3903 lc.proto==IPPROTO_ICMP &&
3904 icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3905 break;
3907 case O_IPOPT:
3908 match = (hlen > 0 && ipopts_match(ip, cmd));
3909 break;
3911 case O_IPVER:
3912 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3913 break;
3915 case O_IPTTL:
3916 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3917 break;
3919 case O_IPID:
3920 match = (hlen > 0 &&
3921 cmd->arg1 == ntohs(ip->ip_id));
3922 break;
3924 case O_IPLEN:
3925 match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3926 break;
3928 case O_IPPRECEDENCE:
3929 match = (hlen > 0 &&
3930 (cmd->arg1 == (ip->ip_tos & 0xe0)));
3931 break;
3933 case O_IPTOS:
3934 match = (hlen > 0 &&
3935 flags_match(cmd, ip->ip_tos));
3936 break;
3938 case O_TCPFLAGS:
3939 match = (lc.proto == IPPROTO_TCP &&
3940 lc.offset == 0 &&
3941 flags_match(cmd,
3942 L3HDR(struct tcphdr,ip)->th_flags));
3943 break;
3945 case O_TCPOPTS:
3946 match = (lc.proto == IPPROTO_TCP &&
3947 lc.offset == 0 && tcpopts_match(ip, cmd));
3948 break;
3950 case O_TCPSEQ:
3951 match = (lc.proto == IPPROTO_TCP &&
3952 lc.offset == 0 &&
3953 ((ipfw_insn_u32 *)cmd)->d[0] ==
3954 L3HDR(struct tcphdr,ip)->th_seq);
3955 break;
3957 case O_TCPACK:
3958 match = (lc.proto == IPPROTO_TCP &&
3959 lc.offset == 0 &&
3960 ((ipfw_insn_u32 *)cmd)->d[0] ==
3961 L3HDR(struct tcphdr,ip)->th_ack);
3962 break;
3964 case O_TCPWIN:
3965 match = (lc.proto == IPPROTO_TCP &&
3966 lc.offset == 0 &&
3967 cmd->arg1 ==
3968 L3HDR(struct tcphdr,ip)->th_win);
3969 break;
3971 case O_ESTAB:
3972 /* reject packets which have SYN only */
3973 /* XXX should i also check for TH_ACK ? */
3974 match = (lc.proto == IPPROTO_TCP &&
3975 lc.offset == 0 &&
3976 (L3HDR(struct tcphdr,ip)->th_flags &
3977 (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3978 break;
3980 case O_LOG:
3981 if (fw_verbose) {
3982 ipfw_log(ctx, f, hlen, args->eh, m,
3983 oif);
3985 match = 1;
3986 break;
3988 case O_PROB:
3989 match = (krandom() <
3990 ((ipfw_insn_u32 *)cmd)->d[0]);
3991 break;
3994 /* The second set of opcodes represents 'actions',
3995 * i.e. the terminal part of a rule once the packet
3996 * matches all previous patterns.
3997 * Typically there is only one action for each rule,
3998 * and the opcode is stored at the end of the rule
3999 * (but there are exceptions -- see below).
4001 * In general, here we set retval and terminate the
4002 * outer loop (would be a 'break 3' in some language,
4003 * but we need to do a 'goto done').
4005 * Exceptions:
4006 * O_COUNT and O_SKIPTO actions:
4007 * instead of terminating, we jump to the next rule
4008 * ('goto next_rule', equivalent to a 'break 2'),
4009 * or to the SKIPTO target ('goto again' after
4010 * having set f, cmd and l), respectively.
4012 * O_LIMIT, O_KEEP_STATE and O_REDIRECT: these opcodes
4013 * are not real 'actions', and are stored right
4014 * before the 'action' part of the rule.
4015 * These opcodes try to install an entry in the
4016 * state tables; if successful, we continue with
4017 * the next opcode (match=1; break;), otherwise
4018 * the packet must be dropped ('goto done' after
4019 * setting retval). If static rules are changed
4020 * during the state installation, the packet will
4021 * be dropped and the rule's stats will not be updated
4022 * ('return IP_FW_DENY').
4024 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
4025 * cause a lookup of the state table, and a jump
4026 * to the 'action' part of the parent rule
4027 * ('goto check_body') if an entry is found, or
4028 * (CHECK_STATE only) a jump to the next rule if
4029 * the entry is not found ('goto next_rule').
4030 * The result of the lookup is cached so that
4031 * further instances of these opcodes are
4032 * effectively NOPs. If static rules are changed
4033 * during the state lookup, the packet will
4034 * be dropped and the rule's stats will not be updated
4035 * ('return IP_FW_DENY'). */
4037 case O_REDIRECT:
4038 if (f->cross_rules == NULL) {
4040 /* This rule was not completely set up;
4041 * move on to the next rule. */
4043 goto next_rule;
4046 /* Apply redirect only on input path and
4047 * only to non-fragment TCP segments or
4048 * UDP datagrams.
4050 * Does _not_ work with layer2 filtering. */
4052 if (oif != NULL || args->eh != NULL ||
4053 (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4054 (lc.proto != IPPROTO_TCP &&
4055 lc.proto != IPPROTO_UDP))
4056 break;
4057 /* FALL THROUGH */
4058 case O_LIMIT:
4059 case O_KEEP_STATE:
4060 if (hlen == 0)
4061 break;
4062 s = ipfw_state_install(ctx, f,
4063 (ipfw_insn_limit *)cmd, args, lc.tcp);
4064 if (s == NULL) {
4065 retval = IP_FW_DENY;
4066 goto done; /* error/limit violation */
4068 s->st_pcnt++;
4069 s->st_bcnt += lc.ip_len;
4071 if (s->st_type == O_REDIRECT) {
4072 struct in_addr oaddr;
4073 uint16_t oport;
4074 struct ipfw_xlat *slave_x, *x;
4075 struct ipfw_state *dup;
4077 x = (struct ipfw_xlat *)s;
4078 ipfw_xlate(x, m, &oaddr, &oport);
4079 m = ipfw_rehashm(m, hlen, args, &lc,
4080 &ip);
4081 if (m == NULL) {
4082 ipfw_state_del(ctx, s);
4083 goto pullup_failed;
4086 cpuid = netisr_hashcpu(
4087 m->m_pkthdr.hash);
4089 slave_x = (struct ipfw_xlat *)
4090 ipfw_state_alloc(ctx, &args->f_id,
4091 O_REDIRECT, f->cross_rules[cpuid],
4092 lc.tcp);
4093 if (slave_x == NULL) {
4094 ipfw_state_del(ctx, s);
4095 retval = IP_FW_DENY;
4096 goto done;
4098 slave_x->xlat_addr = oaddr.s_addr;
4099 slave_x->xlat_port = oport;
4100 slave_x->xlat_dir = MATCH_REVERSE;
4101 slave_x->xlat_flags |=
4102 IPFW_STATE_F_XLATSRC |
4103 IPFW_STATE_F_XLATSLAVE;
4105 slave_x->xlat_pair = x;
4106 slave_x->xlat_pcpu = mycpuid;
4107 x->xlat_pair = slave_x;
4108 x->xlat_pcpu = cpuid;
4110 ctx->ipfw_xlated++;
4111 if (cpuid != mycpuid) {
4112 ctx->ipfw_xlate_split++;
4113 ipfw_xlate_redispatch(
4114 m, cpuid, x,
4115 IPFW_XLATE_INSERT |
4116 IPFW_XLATE_FORWARD);
4117 args->m = NULL;
4118 return (IP_FW_REDISPATCH);
4121 dup = ipfw_state_link(ctx,
4122 &slave_x->xlat_st);
4123 if (dup != NULL) {
4124 ctx->ipfw_xlate_conflicts++;
4125 if (IPFW_STATE_ISDEAD(dup)) {
4126 ipfw_state_remove(ctx,
4127 dup);
4128 dup = ipfw_state_link(
4129 ctx, &slave_x->xlat_st);
4131 if (dup != NULL) {
4132 if (bootverbose) {
4133 kprintf("ipfw: "
4134 "slave %u state "
4135 "conflicts "
4136 "%u state\n",
4137 x->xlat_type,
4138 s->st_type);
4140 ipfw_state_del(ctx, s);
4141 return (IP_FW_DENY);
4143 ctx->ipfw_xlate_cresolved++;
4146 match = 1;
4147 break;
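/*
 * Illustrative note, not part of the original source: a redirect
 * installs two states, the master created above on this CPU for the
 * original flow, plus a slave keyed to the translated flow, which
 * may hash to a different CPU.  xlat_pair/xlat_pcpu tie the two
 * together, and when the translated flow belongs elsewhere the mbuf
 * is handed over via ipfw_xlate_redispatch() and IP_FW_REDISPATCH
 * instead of being filtered locally.
 */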
4149 case O_PROBE_STATE:
4150 case O_CHECK_STATE:
4152 /* States are checked at the first keep-state
4153 * or check-state occurrence, with the result
4154 * being stored in dyn_dir. The compiler
4155 * introduces a PROBE_STATE instruction for
4156 * us when we have a KEEP_STATE/LIMIT/RDR
4157 * (because PROBE_STATE needs to be run first). */
4159 s = NULL;
4160 if (dyn_dir == MATCH_UNKNOWN) {
4161 s = ipfw_state_lookup(ctx,
4162 &args->f_id, &dyn_dir, lc.tcp);
4164 if (s == NULL ||
4165 (s->st_type == O_REDIRECT &&
4166 (args->eh != NULL ||
4167 (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4168 (lc.proto != IPPROTO_TCP &&
4169 lc.proto != IPPROTO_UDP)))) {
4171 /* State not found. If CHECK_STATE,
4172 * skip to the next rule; if PROBE_STATE,
4173 * just ignore and continue with the next
4174 * opcode. */
4176 if (cmd->opcode == O_CHECK_STATE)
4177 goto next_rule;
4178 match = 1;
4179 break;
4182 s->st_pcnt++;
4183 s->st_bcnt += lc.ip_len;
4185 if (s->st_type == O_REDIRECT) {
4186 struct ipfw_xlat *x =
4187 (struct ipfw_xlat *)s;
4189 if (oif != NULL &&
4190 x->xlat_ifp == NULL) {
4191 KASSERT(x->xlat_flags &
4192 IPFW_STATE_F_XLATSLAVE,
4193 ("master rdr state "
4194 "missing ifp"));
4195 x->xlat_ifp = oif;
4196 } else if (
4197 (oif != NULL && x->xlat_ifp!=oif) ||
4198 (oif == NULL &&
4199 x->xlat_ifp!=m->m_pkthdr.rcvif)) {
4200 retval = IP_FW_DENY;
4201 goto done;
4203 if (x->xlat_dir != dyn_dir)
4204 goto skip_xlate;
4206 ipfw_xlate(x, m, NULL, NULL);
4207 m = ipfw_rehashm(m, hlen, args, &lc,
4208 &ip);
4209 if (m == NULL)
4210 goto pullup_failed;
4212 cpuid = netisr_hashcpu(
4213 m->m_pkthdr.hash);
4214 if (cpuid != mycpuid) {
4215 uint32_t xlate = 0;
4217 if (oif != NULL) {
4218 xlate |=
4219 IPFW_XLATE_OUTPUT;
4221 if (dyn_dir == MATCH_FORWARD) {
4222 xlate |=
4223 IPFW_XLATE_FORWARD;
4225 ipfw_xlate_redispatch(m, cpuid,
4226 x, xlate);
4227 args->m = NULL;
4228 return (IP_FW_REDISPATCH);
4231 KKASSERT(x->xlat_pcpu == mycpuid);
4232 ipfw_state_update(&args->f_id, dyn_dir,
4233 lc.tcp, &x->xlat_pair->xlat_st);
4235 skip_xlate:
4237 * Found a rule from a state; jump to the
4238 * 'action' part of the rule.
4240 f = s->st_rule;
4241 KKASSERT(f->cpuid == mycpuid);
4243 cmd = ACTION_PTR(f);
4244 l = f->cmd_len - f->act_ofs;
4245 dyn_f = f;
4246 goto check_body;
4248 case O_ACCEPT:
4249 retval = IP_FW_PASS; /* accept */
4250 goto done;
        case O_DEFRAG:
            if (f->cross_rules == NULL) {
                /*
                 * This rule was not completely setup;
                 * move on to the next rule.
                 */
                goto next_rule;
            }

            /*
             * Don't defrag for l2 packets, output packets
             * or non-fragments.
             */
            if (oif != NULL || args->eh != NULL ||
                (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
                goto next_rule;

            ctx->ipfw_frags++;
            m = ip_reass(m);
            args->m = m;
            if (m == NULL) {
                retval = IP_FW_PASS;
                goto done;
            }
            ctx->ipfw_defraged++;
            KASSERT((m->m_flags & M_HASH) == 0,
                ("hash not cleared"));

            /* Update statistics */
            f->pcnt++;
            f->bcnt += lc.ip_len;
            f->timestamp = time_second;

            ip = mtod(m, struct ip *);
            hlen = ip->ip_hl << 2;
            ip->ip_len += hlen;

            ip->ip_len = htons(ip->ip_len);
            ip->ip_off = htons(ip->ip_off);

            ip_hashfn(&m, 0);
            args->m = m;
            if (m == NULL)
                goto pullup_failed;

            KASSERT(m->m_flags & M_HASH, ("no hash"));
            cpuid = netisr_hashcpu(m->m_pkthdr.hash);
            if (cpuid != mycpuid) {
                /*
                 * NOTE:
                 * ip_len/ip_off are in network byte
                 * order.
                 */
                ctx->ipfw_defrag_remote++;
                ipfw_defrag_redispatch(m, cpuid, f);
                args->m = NULL;
                return (IP_FW_REDISPATCH);
            }

            /* 'm' might be changed by ip_hashfn(). */
            ip = mtod(m, struct ip *);
            ip->ip_len = ntohs(ip->ip_len);
            ip->ip_off = ntohs(ip->ip_off);

            m = ipfw_setup_local(m, hlen, args, &lc, &ip);
            if (m == NULL)
                goto pullup_failed;

            /* Move on. */
            goto next_rule;
        case O_PIPE:
        case O_QUEUE:
            args->rule = f; /* report matching rule */
            args->cookie = cmd->arg1;
            retval = IP_FW_DUMMYNET;
            goto done;

        case O_DIVERT:
        case O_TEE:
            if (args->eh) /* not on layer 2 */
                break;

            mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
                sizeof(*divinfo), M_INTWAIT | M_NULLOK);
            if (mtag == NULL) {
                retval = IP_FW_DENY;
                goto done;
            }
            divinfo = m_tag_data(mtag);

            divinfo->skipto = f->rulenum;
            divinfo->port = cmd->arg1;
            divinfo->tee = (cmd->opcode == O_TEE);
            m_tag_prepend(m, mtag);

            args->cookie = cmd->arg1;
            retval = (cmd->opcode == O_DIVERT) ?
                IP_FW_DIVERT : IP_FW_TEE;
            goto done;

        case O_COUNT:
        case O_SKIPTO:
            f->pcnt++;  /* update stats */
            f->bcnt += lc.ip_len;
            f->timestamp = time_second;
            if (cmd->opcode == O_COUNT)
                goto next_rule;
            /* handle skipto */
            if (f->next_rule == NULL)
                lookup_next_rule(f);
            f = f->next_rule;
            goto again;
        case O_REJECT:
            /*
             * Drop the packet and send a reject notice
             * if the packet is not ICMP (or is an ICMP
             * query), and it is not multicast/broadcast.
             */
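            /*
             * For illustration: an ICMP echo request is a
             * query, so it still triggers send_reject()
             * below, while an ICMP error (e.g. a
             * destination-unreachable) is silently dropped,
             * which avoids generating ICMP errors in
             * response to ICMP errors.
             */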
            if (hlen > 0 &&
                (lc.proto != IPPROTO_ICMP ||
                 is_icmp_query(ip)) &&
                !(m->m_flags & (M_BCAST | M_MCAST)) &&
                !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
                send_reject(args, cmd->arg1,
                    lc.offset, lc.ip_len);
                retval = IP_FW_DENY;
                goto done;
            }
            /* FALLTHROUGH */
        case O_DENY:
            retval = IP_FW_DENY;
            goto done;

        case O_FORWARD_IP:
            if (args->eh) /* not valid on layer2 pkts */
                break;
            if (!dyn_f || dyn_dir == MATCH_FORWARD) {
                struct sockaddr_in *sin;

                mtag = m_tag_get(PACKET_TAG_IPFORWARD,
                    sizeof(*sin), M_INTWAIT | M_NULLOK);
                if (mtag == NULL) {
                    retval = IP_FW_DENY;
                    goto done;
                }
                sin = m_tag_data(mtag);

                /* Structure copy */
                *sin = ((ipfw_insn_sa *)cmd)->sa;

                m_tag_prepend(m, mtag);
                m->m_pkthdr.fw_flags |=
                    IPFORWARD_MBUF_TAGGED;
                m->m_pkthdr.fw_flags &=
                    ~BRIDGE_MBUF_TAGGED;
            }
            retval = IP_FW_PASS;
            goto done;

        default:
            panic("-- unknown opcode %d", cmd->opcode);
        } /* end of switch() on opcodes */

        if (cmd->len & F_NOT)
            match = !match;

        if (match) {
            if (cmd->len & F_OR)
                skip_or = 1;
        } else {
            if (!(cmd->len & F_OR)) /* not an OR block, */
                break;          /* try next rule */
        }
    } /* end of inner for, scan opcodes */

next_rule:;     /* try next rule */

    } /* end of outer for, scan rules */
    kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
    return IP_FW_DENY;

done:
    /* Update statistics */
    f->pcnt++;
    f->bcnt += lc.ip_len;
    f->timestamp = time_second;
    return retval;

pullup_failed:
    if (fw_verbose)
        kprintf("pullup failed\n");
    return IP_FW_DENY;
}
static struct mbuf *
ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
{
    struct m_tag *mtag;
    struct dn_pkt *pkt;
    ipfw_insn *cmd;
    const struct ipfw_flow_id *id;
    struct dn_flow_id *fid;

    M_ASSERTPKTHDR(m);

    mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
        M_INTWAIT | M_NULLOK);
    if (mtag == NULL) {
        m_freem(m);
        return (NULL);
    }
    m_tag_prepend(m, mtag);

    pkt = m_tag_data(mtag);
    bzero(pkt, sizeof(*pkt));

    cmd = fwa->rule->cmd + fwa->rule->act_ofs;
    if (cmd->opcode == O_LOG)
        cmd += F_LEN(cmd);
    KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
        ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));

    pkt->dn_m = m;
    pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
    pkt->ifp = fwa->oif;
    pkt->pipe_nr = pipe_nr;

    pkt->cpuid = mycpuid;
    pkt->msgport = netisr_curport();

    id = &fwa->f_id;
    fid = &pkt->id;
    fid->fid_dst_ip = id->dst_ip;
    fid->fid_src_ip = id->src_ip;
    fid->fid_dst_port = id->dst_port;
    fid->fid_src_port = id->src_port;
    fid->fid_proto = id->proto;
    fid->fid_flags = id->flags;

    ipfw_ref_rule(fwa->rule);
    pkt->dn_priv = fwa->rule;
    pkt->dn_unref_priv = ipfw_unref_rule;

    if (cmd->opcode == O_PIPE)
        pkt->dn_flags |= DN_FLAGS_IS_PIPE;

    m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
    return (m);
}
/*
 * When a rule is added/deleted, clear the next_rule pointers in all rules.
 * These will be reconstructed on the fly as packets are matched.
 */
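/*
 * For example (illustrative): O_SKIPTO caches its resolved target in
 * f->next_rule via lookup_next_rule(); once a rule is inserted or removed,
 * that cached pointer could reference a freed or out-of-order rule, so it
 * is cleared here and lazily recomputed on the next match.
 */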
static void
ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
{
    struct ip_fw *rule;

    for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
        rule->next_rule = NULL;
}

static void
ipfw_inc_static_count(struct ip_fw *rule)
{
    /* Static rule's counts are updated only on CPU0 */
    KKASSERT(mycpuid == 0);

    static_count++;
    static_ioc_len += IOC_RULESIZE(rule);
}

static void
ipfw_dec_static_count(struct ip_fw *rule)
{
    int l = IOC_RULESIZE(rule);

    /* Static rule's counts are updated only on CPU0 */
    KKASSERT(mycpuid == 0);

    KASSERT(static_count > 0, ("invalid static count %u", static_count));
    static_count--;

    KASSERT(static_ioc_len >= l,
        ("invalid static len %u", static_ioc_len));
    static_ioc_len -= l;
}

static void
ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
{
    if (fwmsg->sibling != NULL) {
        KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
        fwmsg->sibling->sibling = rule;
    }
    fwmsg->sibling = rule;
}
static struct ip_fw *
ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
{
    struct ip_fw *rule;

    rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);

    rule->act_ofs = ioc_rule->act_ofs;
    rule->cmd_len = ioc_rule->cmd_len;
    rule->rulenum = ioc_rule->rulenum;
    rule->set = ioc_rule->set;
    rule->usr_flags = ioc_rule->usr_flags;

    bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);

    rule->refcnt = 1;
    rule->cpuid = mycpuid;
    rule->rule_flags = rule_flags;

    return rule;
}

static void
ipfw_add_rule_dispatch(netmsg_t nmsg)
{
    struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ip_fw *rule;

    ASSERT_NETISR_NCPUS(mycpuid);

    rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);

    /*
     * Insert rule into the pre-determined position
     */
    if (fwmsg->prev_rule != NULL) {
        struct ip_fw *prev, *next;

        prev = fwmsg->prev_rule;
        KKASSERT(prev->cpuid == mycpuid);

        next = fwmsg->next_rule;
        KKASSERT(next->cpuid == mycpuid);

        rule->next = next;
        prev->next = rule;

        /*
         * Move to the position on the next CPU
         * before the msg is forwarded.
         */
        fwmsg->prev_rule = prev->sibling;
        fwmsg->next_rule = next->sibling;
    } else {
        KKASSERT(fwmsg->next_rule == NULL);
        rule->next = ctx->ipfw_layer3_chain;
        ctx->ipfw_layer3_chain = rule;
    }

    /* Link rule CPU sibling */
    ipfw_link_sibling(fwmsg, rule);

    ipfw_flush_rule_ptrs(ctx);

    if (mycpuid == 0) {
        /* Statistics only need to be updated once */
        ipfw_inc_static_count(rule);

        /* Return the rule on CPU0 */
        nmsg->lmsg.u.ms_resultp = rule;
    }

    if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
        rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;

    if (fwmsg->cross_rules != NULL) {
        /* Save rules for later use. */
        fwmsg->cross_rules[mycpuid] = rule;
    }

    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static void
ipfw_crossref_rule_dispatch(netmsg_t nmsg)
{
    struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
    struct ip_fw *rule = fwmsg->sibling;
    int sz = sizeof(struct ip_fw *) * netisr_ncpus;

    ASSERT_NETISR_NCPUS(mycpuid);
    KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
        ("not crossref rule"));

    rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
    memcpy(rule->cross_rules, fwmsg->cross_rules, sz);

    fwmsg->sibling = rule->sibling;
    netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
}
/*
 * Add a new rule to the list.  Copy the rule into a malloc'ed area,
 * then possibly create a rule number and add the rule to the list.
 * Update the rule_number in the input struct so the caller knows
 * it as well.
 */
static void
ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct netmsg_ipfw fwmsg;
    struct ip_fw *f, *prev, *rule;

    ASSERT_NETISR0;

    /*
     * If rulenum is 0, find highest numbered rule before the
     * default rule, and add rule number incremental step.
     */
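    /*
     * Illustrative example: with autoinc_step = 100 and existing rules
     * numbered 100 and 200 (plus the default rule), a rule submitted
     * with rulenum 0 would be assigned number 300 by the loop below.
     */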
    if (ioc_rule->rulenum == 0) {
        int step = autoinc_step;

        KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
            step <= IPFW_AUTOINC_STEP_MAX);

        /*
         * Locate the highest numbered rule before default
         */
        for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
            if (f->rulenum == IPFW_DEFAULT_RULE)
                break;
            ioc_rule->rulenum = f->rulenum;
        }
        if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
            ioc_rule->rulenum += step;
    }
    KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
        ioc_rule->rulenum != 0,
        ("invalid rule num %d", ioc_rule->rulenum));

    /*
     * Now find the right place for the new rule in the sorted list.
     */
    for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
         prev = f, f = f->next) {
        if (f->rulenum > ioc_rule->rulenum) {
            /* Found the location */
            break;
        }
    }
    KASSERT(f != NULL, ("no default rule?!"));

    /*
     * Duplicate the rule onto each CPU.
     * The rule duplicated on CPU0 will be returned.
     */
    bzero(&fwmsg, sizeof(fwmsg));
    netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_add_rule_dispatch);
    fwmsg.ioc_rule = ioc_rule;
    fwmsg.prev_rule = prev;
    fwmsg.next_rule = prev == NULL ? NULL : f;
    fwmsg.rule_flags = rule_flags;
    if (rule_flags & IPFW_RULE_F_CROSSREF) {
        fwmsg.cross_rules = kmalloc(
            sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
            M_WAITOK | M_ZERO);
    }

    netisr_domsg_global(&fwmsg.base);
    KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);

    rule = fwmsg.base.lmsg.u.ms_resultp;
    KKASSERT(rule != NULL && rule->cpuid == mycpuid);

    if (fwmsg.cross_rules != NULL) {
        netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
            MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
        fwmsg.sibling = rule;
        netisr_domsg_global(&fwmsg.base);
        KKASSERT(fwmsg.sibling == NULL);

        kfree(fwmsg.cross_rules, M_TEMP);

#ifdef KLD_MODULE
        atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
    }

    DPRINTF("++ installed rule %d, static count now %d\n",
        rule->rulenum, static_count);
}
/*
 * Free storage associated with a static rule (including derived
 * states/tracks).
 * The caller is in charge of clearing rule pointers to avoid
 * dangling pointers.
 * @return a pointer to the next entry.
 * Arguments are not checked, so they better be correct.
 */
static struct ip_fw *
ipfw_delete_rule(struct ipfw_context *ctx,
    struct ip_fw *prev, struct ip_fw *rule)
{
    struct ip_fw *n;

    n = rule->next;
    if (prev == NULL)
        ctx->ipfw_layer3_chain = n;
    else
        prev->next = n;

    /* Mark the rule as invalid */
    rule->rule_flags |= IPFW_RULE_F_INVALID;
    rule->next_rule = NULL;
    rule->sibling = NULL;
#ifdef foo
    /* Don't reset cpuid here; keep various assertions working */
    rule->cpuid = -1;
#endif

    /* Statistics only need to be updated once */
    if (mycpuid == 0)
        ipfw_dec_static_count(rule);

    if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
        /* Try to free this rule */
        ipfw_free_rule(rule);
    } else {
        /* TODO: check staging area. */
        if (mycpuid == 0) {
            rule->next = ipfw_gd.ipfw_crossref_free;
            ipfw_gd.ipfw_crossref_free = rule;
        }
    }

    /* Return the next rule */
    return n;
}
static void
ipfw_flush_dispatch(netmsg_t nmsg)
{
    int kill_default = nmsg->lmsg.u.ms_result;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ip_fw *rule;

    ASSERT_NETISR_NCPUS(mycpuid);

    /*
     * Flush states.
     */
    ipfw_state_flush(ctx, NULL);
    KASSERT(ctx->ipfw_state_cnt == 0,
        ("%d pcpu states remain", ctx->ipfw_state_cnt));
    ctx->ipfw_state_loosecnt = 0;
    ctx->ipfw_state_lastexp = 0;

    /*
     * Flush tracks.
     */
    ipfw_track_flush(ctx, NULL);
    ctx->ipfw_track_lastexp = 0;
    if (ctx->ipfw_trkcnt_spare != NULL) {
        kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
        ctx->ipfw_trkcnt_spare = NULL;
    }

    ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */

    while ((rule = ctx->ipfw_layer3_chain) != NULL &&
        (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
        ipfw_delete_rule(ctx, NULL, rule);

    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Deletes all rules from a chain (including the default rule
 * if the second argument is set).
 */
static void
ipfw_flush(int kill_default)
{
    struct netmsg_base nmsg;
#ifdef INVARIANTS
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    int state_cnt;
#endif

    ASSERT_NETISR0;

    /*
     * If 'kill_default' then caller has done the necessary
     * msgport syncing; unnecessary to do it again.
     */
    if (!kill_default) {
        /*
         * Let ipfw_chk() know the rules are going to
         * be flushed, so it could jump directly to
         * the default rule.
         */
        ipfw_flushing = 1;
        /* XXX use priority sync */
        netmsg_service_sync();
    }

    /*
     * Press the 'flush' button
     */
    bzero(&nmsg, sizeof(nmsg));
    netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_flush_dispatch);
    nmsg.lmsg.u.ms_result = kill_default;
    netisr_domsg_global(&nmsg);
    ipfw_gd.ipfw_state_loosecnt = 0;
    ipfw_gd.ipfw_state_globexp = 0;
    ipfw_gd.ipfw_track_globexp = 0;

#ifdef INVARIANTS
    state_cnt = ipfw_state_cntcoll();
    KASSERT(state_cnt == 0, ("%d states remain", state_cnt));

    KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
        ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));

    if (kill_default) {
        KASSERT(static_count == 0,
            ("%u static rules remain", static_count));
        KASSERT(static_ioc_len == 0,
            ("%u bytes of static rules remain", static_ioc_len));
    } else {
        KASSERT(static_count == 1,
            ("%u static rules remain", static_count));
        KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
            ("%u bytes of static rules remain, should be %lu",
             static_ioc_len,
             (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
    }
#endif

    /* Flush is done */
    ipfw_flushing = 0;
}
static void
ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
{
    struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ip_fw *rule, *prev;

    ASSERT_NETISR_NCPUS(mycpuid);

    rule = dmsg->start_rule;
    KKASSERT(rule->cpuid == mycpuid);
    dmsg->start_rule = rule->sibling;

    prev = dmsg->prev_rule;
    if (prev != NULL) {
        KKASSERT(prev->cpuid == mycpuid);

        /*
         * Move to the position on the next CPU
         * before the msg is forwarded.
         */
        dmsg->prev_rule = prev->sibling;
    }

    /*
     * flush pointers outside the loop, then delete all matching
     * rules.  'prev' remains the same throughout the cycle.
     */
    ipfw_flush_rule_ptrs(ctx);
    while (rule && rule->rulenum == dmsg->rulenum) {
        if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
            /* Flush states generated by this rule. */
            ipfw_state_flush(ctx, rule);
        }
        if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
            /* Flush tracks generated by this rule. */
            ipfw_track_flush(ctx, rule);
        }
        rule = ipfw_delete_rule(ctx, prev, rule);
    }

    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static int
ipfw_alt_delete_rule(uint16_t rulenum)
{
    struct ip_fw *prev, *rule;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct netmsg_del dmsg;

    ASSERT_NETISR0;

    /*
     * Locate first rule to delete
     */
    for (prev = NULL, rule = ctx->ipfw_layer3_chain;
         rule && rule->rulenum < rulenum;
         prev = rule, rule = rule->next)
        ; /* EMPTY */
    if (rule->rulenum != rulenum)
        return EINVAL;

    /*
     * Get rid of the rule duplications on all CPUs
     */
    bzero(&dmsg, sizeof(dmsg));
    netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_alt_delete_rule_dispatch);
    dmsg.prev_rule = prev;
    dmsg.start_rule = rule;
    dmsg.rulenum = rulenum;

    netisr_domsg_global(&dmsg.base);
    KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
    return 0;
}
static void
ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
{
    struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ip_fw *prev, *rule;
#ifdef INVARIANTS
    int del = 0;
#endif

    ASSERT_NETISR_NCPUS(mycpuid);

    ipfw_flush_rule_ptrs(ctx);

    prev = NULL;
    rule = ctx->ipfw_layer3_chain;
    while (rule != NULL) {
        if (rule->set == dmsg->from_set) {
            if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
                /* Flush states generated by this rule. */
                ipfw_state_flush(ctx, rule);
            }
            if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
                /* Flush tracks generated by this rule. */
                ipfw_track_flush(ctx, rule);
            }
            rule = ipfw_delete_rule(ctx, prev, rule);
#ifdef INVARIANTS
            del = 1;
#endif
        } else {
            prev = rule;
            rule = rule->next;
        }
    }
    KASSERT(del, ("no match set?!"));

    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static int
ipfw_alt_delete_ruleset(uint8_t set)
{
    struct netmsg_del dmsg;
    int del;
    struct ip_fw *rule;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];

    ASSERT_NETISR0;

    /*
     * Check whether the 'set' exists.  If it exists,
     * then check whether any rules within the set will
     * try to create states.
     */
    del = 0;
    for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
        if (rule->set == set)
            del = 1;
    }
    if (!del)
        return 0; /* XXX EINVAL? */

    /*
     * Delete this set
     */
    bzero(&dmsg, sizeof(dmsg));
    netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_alt_delete_ruleset_dispatch);
    dmsg.from_set = set;
    netisr_domsg_global(&dmsg.base);

    return 0;
}
static void
ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
{
    struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
    struct ip_fw *rule;

    ASSERT_NETISR_NCPUS(mycpuid);

    rule = dmsg->start_rule;
    KKASSERT(rule->cpuid == mycpuid);

    /*
     * Move to the position on the next CPU
     * before the msg is forwarded.
     */
    dmsg->start_rule = rule->sibling;

    while (rule && rule->rulenum <= dmsg->rulenum) {
        if (rule->rulenum == dmsg->rulenum)
            rule->set = dmsg->to_set;
        rule = rule->next;
    }
    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static int
ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
{
    struct netmsg_del dmsg;
    struct netmsg_base *nmsg;
    struct ip_fw *rule;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];

    ASSERT_NETISR0;

    /*
     * Locate first rule to move
     */
    for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
         rule = rule->next) {
        if (rule->rulenum == rulenum && rule->set != set)
            break;
    }
    if (rule == NULL || rule->rulenum > rulenum)
        return 0; /* XXX error? */

    bzero(&dmsg, sizeof(dmsg));
    nmsg = &dmsg.base;
    netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_alt_move_rule_dispatch);
    dmsg.start_rule = rule;
    dmsg.rulenum = rulenum;
    dmsg.to_set = set;

    netisr_domsg_global(nmsg);
    KKASSERT(dmsg.start_rule == NULL);
    return 0;
}

static void
ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
{
    struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ip_fw *rule;

    ASSERT_NETISR_NCPUS(mycpuid);

    for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
        if (rule->set == dmsg->from_set)
            rule->set = dmsg->to_set;
    }
    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static int
ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
{
    struct netmsg_del dmsg;
    struct netmsg_base *nmsg;

    ASSERT_NETISR0;

    bzero(&dmsg, sizeof(dmsg));
    nmsg = &dmsg.base;
    netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_alt_move_ruleset_dispatch);
    dmsg.from_set = from_set;
    dmsg.to_set = to_set;

    netisr_domsg_global(nmsg);
    return 0;
}

static void
ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
{
    struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ip_fw *rule;

    ASSERT_NETISR_NCPUS(mycpuid);

    for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
        if (rule->set == dmsg->from_set)
            rule->set = dmsg->to_set;
        else if (rule->set == dmsg->to_set)
            rule->set = dmsg->from_set;
    }
    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static int
ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
{
    struct netmsg_del dmsg;
    struct netmsg_base *nmsg;

    ASSERT_NETISR0;

    bzero(&dmsg, sizeof(dmsg));
    nmsg = &dmsg.base;
    netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_alt_swap_ruleset_dispatch);
    dmsg.from_set = set1;
    dmsg.to_set = set2;

    netisr_domsg_global(nmsg);
    return 0;
}
/*
 * Remove all rules with given number, and also do set manipulation.
 *
 * The argument is a uint32_t.  The low 16 bits are the rule or set
 * number, the next 8 bits are the new set, and the top 8 bits are
 * the command:
 *
 *	0	delete rules with given number
 *	1	delete rules with given set number
 *	2	move rules with given number to new set
 *	3	move rules with given set number to new set
 *	4	swap sets with given numbers
 */
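/*
 * Worked example (illustrative only): to move rule 1000 to set 3,
 * userland would encode arg = (2 << 24) | (3 << 16) | 1000, which the
 * parsing below decodes as cmd = 2, new_set = 3, rulenum = 1000.
 */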
static int
ipfw_ctl_alter(uint32_t arg)
{
    uint16_t rulenum;
    uint8_t cmd, new_set;
    int error = 0;

    ASSERT_NETISR0;

    rulenum = arg & 0xffff;
    cmd = (arg >> 24) & 0xff;
    new_set = (arg >> 16) & 0xff;

    if (cmd > 4)
        return EINVAL;
    if (new_set >= IPFW_DEFAULT_SET)
        return EINVAL;
    if (cmd == 0 || cmd == 2) {
        if (rulenum == IPFW_DEFAULT_RULE)
            return EINVAL;
    } else {
        if (rulenum >= IPFW_DEFAULT_SET)
            return EINVAL;
    }

    switch (cmd) {
    case 0: /* delete rules with given number */
        error = ipfw_alt_delete_rule(rulenum);
        break;

    case 1: /* delete all rules with given set number */
        error = ipfw_alt_delete_ruleset(rulenum);
        break;

    case 2: /* move rules with given number to new set */
        error = ipfw_alt_move_rule(rulenum, new_set);
        break;

    case 3: /* move rules with given set number to new set */
        error = ipfw_alt_move_ruleset(rulenum, new_set);
        break;

    case 4: /* swap two sets */
        error = ipfw_alt_swap_ruleset(rulenum, new_set);
        break;
    }
    return error;
}
/*
 * Clear counters for a specific rule.
 */
static void
clear_counters(struct ip_fw *rule, int log_only)
{
    ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);

    if (log_only == 0) {
        rule->bcnt = rule->pcnt = 0;
        rule->timestamp = 0;
    }
    if (l->o.opcode == O_LOG)
        l->log_left = l->max_log;
}

static void
ipfw_zero_entry_dispatch(netmsg_t nmsg)
{
    struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ip_fw *rule;

    ASSERT_NETISR_NCPUS(mycpuid);

    if (zmsg->rulenum == 0) {
        KKASSERT(zmsg->start_rule == NULL);

        ctx->ipfw_norule_counter = 0;
        for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
            clear_counters(rule, zmsg->log_only);
    } else {
        struct ip_fw *start = zmsg->start_rule;

        KKASSERT(start->cpuid == mycpuid);
        KKASSERT(start->rulenum == zmsg->rulenum);

        /*
         * We can have multiple rules with the same number, so we
         * need to clear them all.
         */
        for (rule = start; rule && rule->rulenum == zmsg->rulenum;
             rule = rule->next)
            clear_counters(rule, zmsg->log_only);

        /*
         * Move to the position on the next CPU
         * before the msg is forwarded.
         */
        zmsg->start_rule = start->sibling;
    }
    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
/*
 * Reset some or all counters on firewall rules.
 * @arg rulenum is 0 to clear all entries, or contains a specific
 * rule number.
 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
 */
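/*
 * Usage sketch (illustrative): the IP_FW_ZERO and IP_FW_RESETLOG
 * sockopt handlers are expected to call this as
 *   ipfw_ctl_zero_entry(rulenum, 0);   -- clear packet/byte counters
 *   ipfw_ctl_zero_entry(rulenum, 1);   -- reset logging limits only
 * with rulenum == 0 meaning "every rule on every CPU".
 */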
static int
ipfw_ctl_zero_entry(int rulenum, int log_only)
{
    struct netmsg_zent zmsg;
    struct netmsg_base *nmsg;
    const char *msg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];

    ASSERT_NETISR0;

    bzero(&zmsg, sizeof(zmsg));
    nmsg = &zmsg.base;
    netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_zero_entry_dispatch);
    zmsg.log_only = log_only;

    if (rulenum == 0) {
        msg = log_only ? "ipfw: All logging counts reset.\n"
                       : "ipfw: Accounting cleared.\n";
    } else {
        struct ip_fw *rule;

        /*
         * Locate the first rule with 'rulenum'
         */
        for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
            if (rule->rulenum == rulenum)
                break;
        }
        if (rule == NULL) /* we did not find any matching rules */
            return (EINVAL);
        zmsg.start_rule = rule;
        zmsg.rulenum = rulenum;

        msg = log_only ? "ipfw: Entry %d logging count reset.\n"
                       : "ipfw: Entry %d cleared.\n";
    }
    netisr_domsg_global(nmsg);
    KKASSERT(zmsg.start_rule == NULL);

    if (fw_verbose)
        log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
    return (0);
}
/*
 * Check validity of the structure before insert.
 * Fortunately rules are simple, so this mostly needs to check rule sizes.
 */
static int
ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
{
    int l, cmdlen = 0;
    int have_action = 0;
    ipfw_insn *cmd;

    *rule_flags = 0;

    /* Check for valid size */
    if (size < sizeof(*rule)) {
        kprintf("ipfw: rule too short\n");
        return EINVAL;
    }
    l = IOC_RULESIZE(rule);
    if (l != size) {
        kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
        return EINVAL;
    }

    /* Check rule number */
    if (rule->rulenum == IPFW_DEFAULT_RULE) {
        kprintf("ipfw: invalid rule number\n");
        return EINVAL;
    }

    /*
     * Now go for the individual checks. Very simple ones, basically only
     * instruction sizes.
     */
    for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
         l -= cmdlen, cmd += cmdlen) {
        cmdlen = F_LEN(cmd);
        if (cmdlen > l) {
            kprintf("ipfw: opcode %d size truncated\n",
                cmd->opcode);
            return EINVAL;
        }

        DPRINTF("ipfw: opcode %d\n", cmd->opcode);

        if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
            IPFW_ISXLAT(cmd->opcode)) {
            /* This rule will generate states. */
            *rule_flags |= IPFW_RULE_F_GENSTATE;
            if (cmd->opcode == O_LIMIT)
                *rule_flags |= IPFW_RULE_F_GENTRACK;
        }
        if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
            *rule_flags |= IPFW_RULE_F_CROSSREF;
        if (cmd->opcode == O_IP_SRC_IFIP ||
            cmd->opcode == O_IP_DST_IFIP) {
            *rule_flags |= IPFW_RULE_F_DYNIFADDR;
            cmd->arg1 &= IPFW_IFIP_SETTINGS;
        }

        switch (cmd->opcode) {
        case O_NOP:
        case O_PROBE_STATE:
        case O_KEEP_STATE:
        case O_PROTO:
        case O_IP_SRC_ME:
        case O_IP_DST_ME:
        case O_LAYER2:
        case O_IN:
        case O_FRAG:
        case O_IPFRAG:
        case O_IPOPT:
        case O_IPLEN:
        case O_IPID:
        case O_IPTOS:
        case O_IPPRECEDENCE:
        case O_IPTTL:
        case O_IPVER:
        case O_TCPWIN:
        case O_TCPFLAGS:
        case O_TCPOPTS:
        case O_ESTAB:
            if (cmdlen != F_INSN_SIZE(ipfw_insn))
                goto bad_size;
            break;

        case O_IP_SRC_TABLE:
        case O_IP_DST_TABLE:
            if (cmdlen != F_INSN_SIZE(ipfw_insn))
                goto bad_size;
            if (cmd->arg1 >= ipfw_table_max) {
                kprintf("ipfw: invalid table id %u, max %d\n",
                    cmd->arg1, ipfw_table_max);
                return EINVAL;
            }
            break;

        case O_IP_SRC_IFIP:
        case O_IP_DST_IFIP:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
                goto bad_size;
            break;

        case O_ICMPCODE:
        case O_ICMPTYPE:
            if (cmdlen < F_INSN_SIZE(ipfw_insn_u32))
                goto bad_size;
            break;

        case O_UID:
        case O_GID:
        case O_IP_SRC:
        case O_IP_DST:
        case O_TCPSEQ:
        case O_TCPACK:
        case O_PROB:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
                goto bad_size;
            break;

        case O_LIMIT:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
                goto bad_size;
            break;
        case O_REDIRECT:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
                goto bad_size;
            break;

        case O_LOG:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
                goto bad_size;

            ((ipfw_insn_log *)cmd)->log_left =
                ((ipfw_insn_log *)cmd)->max_log;

            break;

        case O_IP_SRC_MASK:
        case O_IP_DST_MASK:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
                goto bad_size;
            if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
                kprintf("ipfw: opcode %d, useless rule\n",
                    cmd->opcode);
                return EINVAL;
            }
            break;

        case O_IP_SRC_SET:
        case O_IP_DST_SET:
            if (cmd->arg1 == 0 || cmd->arg1 > 256) {
                kprintf("ipfw: invalid set size %d\n",
                    cmd->arg1);
                return EINVAL;
            }
            if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
                (cmd->arg1 + 31) / 32)
                goto bad_size;
            break;

        case O_MACADDR2:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
                goto bad_size;
            break;

        case O_MAC_TYPE:
        case O_IP_SRCPORT:
        case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
            if (cmdlen < 2 || cmdlen > 31)
                goto bad_size;
            break;

        case O_RECV:
        case O_XMIT:
        case O_VIA:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
                goto bad_size;
            break;

        case O_PIPE:
        case O_QUEUE:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
                goto bad_size;
            goto check_action;

        case O_FORWARD_IP:
            if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
                goto bad_size;
            } else {
                in_addr_t fwd_addr;

                fwd_addr = ((ipfw_insn_sa *)cmd)->
                    sa.sin_addr.s_addr;
                if (IN_MULTICAST(ntohl(fwd_addr))) {
                    kprintf("ipfw: try forwarding to "
                        "multicast address\n");
                    return EINVAL;
                }
            }
            goto check_action;

        case O_FORWARD_MAC: /* XXX not implemented yet */
        case O_CHECK_STATE:
        case O_COUNT:
        case O_ACCEPT:
        case O_DENY:
        case O_REJECT:
        case O_SKIPTO:
        case O_DIVERT:
        case O_TEE:
        case O_DEFRAG:
            if (cmdlen != F_INSN_SIZE(ipfw_insn))
                goto bad_size;
check_action:
            if (have_action) {
                kprintf("ipfw: opcode %d, multiple actions"
                    " not allowed\n",
                    cmd->opcode);
                return EINVAL;
            }
            have_action = 1;
            if (l != cmdlen) {
                kprintf("ipfw: opcode %d, action must be"
                    " last opcode\n",
                    cmd->opcode);
                return EINVAL;
            }
            break;
        default:
            kprintf("ipfw: opcode %d, unknown opcode\n",
                cmd->opcode);
            return EINVAL;
        }
    }
    if (have_action == 0) {
        kprintf("ipfw: missing action\n");
        return EINVAL;
    }
    return 0;

bad_size:
    kprintf("ipfw: opcode %d size %d wrong\n",
        cmd->opcode, cmdlen);
    return EINVAL;
}
static int
ipfw_ctl_add_rule(struct sockopt *sopt)
{
    struct ipfw_ioc_rule *ioc_rule;
    size_t size;
    uint32_t rule_flags;
    int error;

    ASSERT_NETISR0;

    size = sopt->sopt_valsize;
    if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
        size < sizeof(*ioc_rule)) {
        return EINVAL;
    }
    if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
        sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
            IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
    }
    ioc_rule = sopt->sopt_val;

    error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
    if (error)
        return error;

    ipfw_add_rule(ioc_rule, rule_flags);

    if (sopt->sopt_dir == SOPT_GET)
        sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
    return 0;
}
static void *
ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
    struct ipfw_ioc_rule *ioc_rule)
{
    const struct ip_fw *sibling;
#ifdef INVARIANTS
    int i;
#endif

    ASSERT_NETISR0;
    KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));

    ioc_rule->act_ofs = rule->act_ofs;
    ioc_rule->cmd_len = rule->cmd_len;
    ioc_rule->rulenum = rule->rulenum;
    ioc_rule->set = rule->set;
    ioc_rule->usr_flags = rule->usr_flags;

    ioc_rule->set_disable = ctx->ipfw_set_disable;
    ioc_rule->static_count = static_count;
    ioc_rule->static_len = static_ioc_len;

    /*
     * Visit (read-only) all of the rule's duplications to get
     * the necessary statistics
     */
#ifdef INVARIANTS
    i = 0;
#endif
    ioc_rule->pcnt = 0;
    ioc_rule->bcnt = 0;
    ioc_rule->timestamp = 0;
    for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
        ioc_rule->pcnt += sibling->pcnt;
        ioc_rule->bcnt += sibling->bcnt;
        if (sibling->timestamp > ioc_rule->timestamp)
            ioc_rule->timestamp = sibling->timestamp;
#ifdef INVARIANTS
        ++i;
#endif
    }
    KASSERT(i == netisr_ncpus,
        ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));

    bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);

    return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
}
static boolean_t
ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
{
    struct ipfw_ioc_flowid *ioc_id;

    if (trk->tc_expire == 0) {
        /* Not a scanned one. */
        return (FALSE);
    }

    ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
        0 : trk->tc_expire - time_uptime;
    ioc_state->pcnt = 0;
    ioc_state->bcnt = 0;

    ioc_state->dyn_type = O_LIMIT_PARENT;
    ioc_state->count = trk->tc_count;

    ioc_state->rulenum = trk->tc_rulenum;

    ioc_id = &ioc_state->id;
    ioc_id->type = ETHERTYPE_IP;
    ioc_id->u.ip.proto = trk->tc_proto;
    ioc_id->u.ip.src_ip = trk->tc_saddr;
    ioc_id->u.ip.dst_ip = trk->tc_daddr;
    ioc_id->u.ip.src_port = trk->tc_sport;
    ioc_id->u.ip.dst_port = trk->tc_dport;

    return (TRUE);
}

static boolean_t
ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
{
    struct ipfw_ioc_flowid *ioc_id;

    if (IPFW_STATE_SCANSKIP(s))
        return (FALSE);

    ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
        0 : s->st_expire - time_uptime;
    ioc_state->pcnt = s->st_pcnt;
    ioc_state->bcnt = s->st_bcnt;

    ioc_state->dyn_type = s->st_type;
    ioc_state->count = 0;

    ioc_state->rulenum = s->st_rule->rulenum;

    ioc_id = &ioc_state->id;
    ioc_id->type = ETHERTYPE_IP;
    ioc_id->u.ip.proto = s->st_proto;
    ipfw_key_4tuple(&s->st_key,
        &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
        &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);

    if (IPFW_ISXLAT(s->st_type)) {
        const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;

        if (x->xlat_port == 0)
            ioc_state->xlat_port = ioc_id->u.ip.dst_port;
        else
            ioc_state->xlat_port = ntohs(x->xlat_port);
        ioc_state->xlat_addr = ntohl(x->xlat_addr);

        ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
        ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
    }

    return (TRUE);
}
static void
ipfw_state_copy_dispatch(netmsg_t nmsg)
{
    struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    const struct ipfw_state *s;
    const struct ipfw_track *t;

    ASSERT_NETISR_NCPUS(mycpuid);
    KASSERT(nm->state_cnt < nm->state_cntmax,
        ("invalid state count %d, max %d",
         nm->state_cnt, nm->state_cntmax));

    TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
        if (ipfw_state_copy(s, nm->ioc_state)) {
            nm->ioc_state++;
            nm->state_cnt++;
            if (nm->state_cnt == nm->state_cntmax)
                goto done;
        }
    }

    /*
     * Prepare tracks in the global track tree for userland.
     */
    TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
        struct ipfw_trkcnt *trk;

        if (t->t_count == NULL) /* anchor */
            continue;
        trk = t->t_trkcnt;

        /*
         * Only one netisr can run this function at
         * any time, and only this function accesses
         * trkcnt's tc_expire, so this is safe w/o
         * ipfw_gd.ipfw_trkcnt_token.
         */
        if (trk->tc_expire > t->t_expire)
            continue;
        trk->tc_expire = t->t_expire;
    }

    /*
     * Copy tracks in the global track tree to userland in
     * the last netisr.
     */
    if (mycpuid == netisr_ncpus - 1) {
        struct ipfw_trkcnt *trk;

        KASSERT(nm->state_cnt < nm->state_cntmax,
            ("invalid state count %d, max %d",
             nm->state_cnt, nm->state_cntmax));

        IPFW_TRKCNT_TOKGET;
        RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
            if (ipfw_track_copy(trk, nm->ioc_state)) {
                nm->ioc_state++;
                nm->state_cnt++;
                if (nm->state_cnt == nm->state_cntmax) {
                    IPFW_TRKCNT_TOKREL;
                    goto done;
                }
            }
        }
        IPFW_TRKCNT_TOKREL;
    }
done:
    if (nm->state_cnt == nm->state_cntmax) {
        /* No more space; done. */
        netisr_replymsg(&nm->base, 0);
    } else {
        netisr_forwardmsg(&nm->base, mycpuid + 1);
    }
}
static int
ipfw_ctl_get_rules(struct sockopt *sopt)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ip_fw *rule;
    void *bp;
    size_t size;
    int state_cnt;

    ASSERT_NETISR0;

    /*
     * Pass up a copy of the current rules.  Static rules
     * come first (the last of which has number IPFW_DEFAULT_RULE),
     * followed by a possibly empty list of states.
     */
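    /*
     * Resulting buffer layout (sketch):
     *
     *   +--------------------+-----------------------------------+
     *   | static rules       | states/tracks                     |
     *   | (static_ioc_len    | (ipfw_ioc_state array, sized      |
     *   |  bytes, in rule-   |  with 25% headroom below)         |
     *   |  number order)     |                                   |
     *   +--------------------+-----------------------------------+
     */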
    size = static_ioc_len;	/* size of static rules */

    /*
     * Size of the states.
     * XXX take tracks as state for userland compat.
     */
    state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
    state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
    size += state_cnt * sizeof(struct ipfw_ioc_state);

    if (sopt->sopt_valsize < size) {
        /* short length, no need to return incomplete rules */
        /* XXX: if superuser, no need to zero buffer */
        bzero(sopt->sopt_val, sopt->sopt_valsize);
        return 0;
    }

    bp = sopt->sopt_val;
    for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
        bp = ipfw_copy_rule(ctx, rule, bp);

    if (state_cnt) {
        struct netmsg_cpstate nm;
#ifdef INVARIANTS
        size_t old_size = size;
#endif

        netmsg_init(&nm.base, NULL, &curthread->td_msgport,
            MSGF_PRIORITY, ipfw_state_copy_dispatch);
        nm.ioc_state = bp;
        nm.state_cntmax = state_cnt;
        nm.state_cnt = 0;
        netisr_domsg_global(&nm.base);

        /*
         * The # of states may have shrunk after the snapshot
         * of the state count was taken.  To give the user a
         * correct state count, nm->state_cnt is used to
         * recalculate the actual size.
         */
        size = static_ioc_len +
            (nm.state_cnt * sizeof(struct ipfw_ioc_state));
        KKASSERT(size <= old_size);
    }

    sopt->sopt_valsize = size;
    return 0;
}
static void
ipfw_set_disable_dispatch(netmsg_t nmsg)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];

    ASSERT_NETISR_NCPUS(mycpuid);

    ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static void
ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
{
    struct netmsg_base nmsg;
    uint32_t set_disable;

    ASSERT_NETISR0;

    /* IPFW_DEFAULT_SET is always enabled */
    enable |= (1 << IPFW_DEFAULT_SET);
    set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;

    bzero(&nmsg, sizeof(nmsg));
    netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_set_disable_dispatch);
    nmsg.lmsg.u.ms_result32 = set_disable;

    netisr_domsg_global(&nmsg);
}
static void
ipfw_table_create_dispatch(netmsg_t nm)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    int tblid = nm->lmsg.u.ms_result;

    ASSERT_NETISR_NCPUS(mycpuid);

    if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
        rn_cpumaskhead(mycpuid), 32))
        panic("ipfw: create table%d failed", tblid);

    netisr_forwardmsg(&nm->base, mycpuid + 1);
}

static int
ipfw_table_create(struct sockopt *sopt)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ipfw_ioc_table *tbl;
    struct netmsg_base nm;

    ASSERT_NETISR0;

    if (sopt->sopt_valsize != sizeof(*tbl))
        return (EINVAL);

    tbl = sopt->sopt_val;
    if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
        return (EINVAL);

    if (ctx->ipfw_tables[tbl->tableid] != NULL)
        return (EEXIST);

    netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_table_create_dispatch);
    nm.lmsg.u.ms_result = tbl->tableid;
    netisr_domsg_global(&nm);

    return (0);
}
static void
ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
{
    struct radix_node *ret;

    ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
    if (ret != rn)
        panic("deleted other table entry");
    kfree(ret, M_IPFW);
}

static int
ipfw_table_killent(struct radix_node *rn, void *xrnh)
{
    ipfw_table_killrn(xrnh, rn);
    return (0);
}

static void
ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
    int destroy)
{
    struct radix_node_head *rnh;

    ASSERT_NETISR_NCPUS(mycpuid);

    rnh = ctx->ipfw_tables[tableid];
    rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
    if (destroy) {
        Free(rnh);
        ctx->ipfw_tables[tableid] = NULL;
    }
}

static void
ipfw_table_flush_dispatch(netmsg_t nmsg)
{
    struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];

    ASSERT_NETISR_NCPUS(mycpuid);

    ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
    netisr_forwardmsg(&nm->base, mycpuid + 1);
}

static void
ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
{
    int i;

    ASSERT_NETISR_NCPUS(mycpuid);

    for (i = 0; i < ipfw_table_max; ++i) {
        if (ctx->ipfw_tables[i] != NULL)
            ipfw_table_flush_oncpu(ctx, i, destroy);
    }
}

static void
ipfw_table_flushall_dispatch(netmsg_t nmsg)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];

    ASSERT_NETISR_NCPUS(mycpuid);

    ipfw_table_flushall_oncpu(ctx, 0);
    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static int
ipfw_table_flush(struct sockopt *sopt)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ipfw_ioc_table *tbl;
    struct netmsg_tblflush nm;

    ASSERT_NETISR0;

    if (sopt->sopt_valsize != sizeof(*tbl))
        return (EINVAL);

    tbl = sopt->sopt_val;
    if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
        netmsg_init(&nm.base, NULL, &curthread->td_msgport,
            MSGF_PRIORITY, ipfw_table_flushall_dispatch);
        netisr_domsg_global(&nm.base);
        return (0);
    }

    if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
        return (EINVAL);

    if (ctx->ipfw_tables[tbl->tableid] == NULL)
        return (ENOENT);

    netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_table_flush_dispatch);
    nm.tableid = tbl->tableid;
    nm.destroy = 0;
    if (sopt->sopt_name == IP_FW_TBL_DESTROY)
        nm.destroy = 1;
    netisr_domsg_global(&nm.base);

    return (0);
}
static int
ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
{
    int *cnt = xcnt;

    (*cnt)++;
    return (0);
}

static int
ipfw_table_cpent(struct radix_node *rn, void *xcp)
{
    struct ipfw_table_cp *cp = xcp;
    struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
    struct ipfw_ioc_tblent *ioc_te;
#ifdef INVARIANTS
    int cnt;
#endif

    KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
        cp->te_idx, cp->te_cnt));
    ioc_te = &cp->te[cp->te_idx];

    if (te->te_nodes->rn_mask != NULL) {
        memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
            *te->te_nodes->rn_mask);
    } else {
        ioc_te->netmask.sin_len = 0;
    }
    memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));

    ioc_te->use = te->te_use;
    ioc_te->last_used = te->te_lastuse;
#ifdef INVARIANTS
    cnt = 1;
#endif

    while ((te = te->te_sibling) != NULL) {
#ifdef INVARIANTS
        ++cnt;
#endif
        ioc_te->use += te->te_use;
        if (te->te_lastuse > ioc_te->last_used)
            ioc_te->last_used = te->te_lastuse;
    }
    KASSERT(cnt == netisr_ncpus,
        ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));

    cp->te_idx++;

    return (0);
}
static int
ipfw_table_get(struct sockopt *sopt)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct radix_node_head *rnh;
    struct ipfw_ioc_table *tbl;
    struct ipfw_ioc_tblcont *cont;
    struct ipfw_table_cp cp;
    int cnt = 0, sz;

    ASSERT_NETISR0;

    if (sopt->sopt_valsize < sizeof(*tbl))
        return (EINVAL);

    tbl = sopt->sopt_val;
    if (tbl->tableid < 0) {
        struct ipfw_ioc_tbllist *list;
        int i;

        /*
         * List available table ids.
         */
        for (i = 0; i < ipfw_table_max; ++i) {
            if (ctx->ipfw_tables[i] != NULL)
                ++cnt;
        }

        sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
        if (sopt->sopt_valsize < sz) {
            bzero(sopt->sopt_val, sopt->sopt_valsize);
            return (E2BIG);
        }
        list = sopt->sopt_val;
        list->tablecnt = cnt;

        cnt = 0;
        for (i = 0; i < ipfw_table_max; ++i) {
            if (ctx->ipfw_tables[i] != NULL) {
                KASSERT(cnt < list->tablecnt,
                    ("invalid idx %d, cnt %d",
                     cnt, list->tablecnt));
                list->tables[cnt++] = i;
            }
        }
        sopt->sopt_valsize = sz;
        return (0);
    } else if (tbl->tableid >= ipfw_table_max) {
        return (EINVAL);
    }

    rnh = ctx->ipfw_tables[tbl->tableid];
    if (rnh == NULL)
        return (ENOENT);
    rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);

    sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
    if (sopt->sopt_valsize < sz) {
        bzero(sopt->sopt_val, sopt->sopt_valsize);
        return (E2BIG);
    }
    cont = sopt->sopt_val;
    cont->entcnt = cnt;

    cp.te = cont->ent;
    cp.te_idx = 0;
    cp.te_cnt = cnt;
    rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);

    sopt->sopt_valsize = sz;
    return (0);
}
static void
ipfw_table_add_dispatch(netmsg_t nmsg)
{
    struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct radix_node_head *rnh;
    struct ipfw_tblent *te;

    ASSERT_NETISR_NCPUS(mycpuid);

    rnh = ctx->ipfw_tables[nm->tableid];

    te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
    te->te_nodes->rn_key = (char *)&te->te_key;
    memcpy(&te->te_key, nm->key, sizeof(te->te_key));

    if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
        te->te_nodes) == NULL) {
        if (mycpuid == 0) {
            kfree(te, M_IPFW);
            netisr_replymsg(&nm->base, EEXIST);
            return;
        }
        panic("rnh_addaddr failed");
    }

    /* Link siblings. */
    if (nm->sibling != NULL)
        nm->sibling->te_sibling = te;
    nm->sibling = te;

    netisr_forwardmsg(&nm->base, mycpuid + 1);
}

static void
ipfw_table_del_dispatch(netmsg_t nmsg)
{
    struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct radix_node_head *rnh;
    struct radix_node *rn;

    ASSERT_NETISR_NCPUS(mycpuid);

    rnh = ctx->ipfw_tables[nm->tableid];
    rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
    if (rn == NULL) {
        if (mycpuid == 0) {
            netisr_replymsg(&nm->base, ESRCH);
            return;
        }
        panic("rnh_deladdr failed");
    }
    kfree(rn, M_IPFW);

    netisr_forwardmsg(&nm->base, mycpuid + 1);
}

static int
ipfw_table_alt(struct sockopt *sopt)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct ipfw_ioc_tblcont *tbl;
    struct ipfw_ioc_tblent *te;
    struct sockaddr_in key0;
    struct sockaddr *netmask = NULL, *key;
    struct netmsg_tblent nm;

    ASSERT_NETISR0;

    if (sopt->sopt_valsize != sizeof(*tbl))
        return (EINVAL);
    tbl = sopt->sopt_val;

    if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
        return (EINVAL);
    if (tbl->entcnt != 1)
        return (EINVAL);

    if (ctx->ipfw_tables[tbl->tableid] == NULL)
        return (ENOENT);
    te = &tbl->ent[0];

    if (te->key.sin_family != AF_INET ||
        te->key.sin_port != 0 ||
        te->key.sin_len != sizeof(struct sockaddr_in))
        return (EINVAL);
    key = (struct sockaddr *)&te->key;

    if (te->netmask.sin_len != 0) {
        if (te->netmask.sin_port != 0 ||
            te->netmask.sin_len > sizeof(struct sockaddr_in))
            return (EINVAL);
        netmask = (struct sockaddr *)&te->netmask;
        sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
        key = (struct sockaddr *)&key0;
    }

    if (sopt->sopt_name == IP_FW_TBL_ADD) {
        netmsg_init(&nm.base, NULL, &curthread->td_msgport,
            MSGF_PRIORITY, ipfw_table_add_dispatch);
    } else {
        netmsg_init(&nm.base, NULL, &curthread->td_msgport,
            MSGF_PRIORITY, ipfw_table_del_dispatch);
    }
    nm.key = key;
    nm.netmask = netmask;
    nm.tableid = tbl->tableid;
    nm.sibling = NULL;
    return (netisr_domsg_global(&nm.base));
}
static int
ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
{
    struct ipfw_tblent *te = (struct ipfw_tblent *)rn;

    te->te_use = 0;
    te->te_lastuse = 0;
    return (0);
}

static void
ipfw_table_zero_dispatch(netmsg_t nmsg)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct radix_node_head *rnh;

    ASSERT_NETISR_NCPUS(mycpuid);

    rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
    rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);

    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static void
ipfw_table_zeroall_dispatch(netmsg_t nmsg)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    int i;

    ASSERT_NETISR_NCPUS(mycpuid);

    for (i = 0; i < ipfw_table_max; ++i) {
        struct radix_node_head *rnh = ctx->ipfw_tables[i];

        if (rnh != NULL)
            rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
    }
    netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static int
ipfw_table_zero(struct sockopt *sopt)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct netmsg_base nm;
    struct ipfw_ioc_table *tbl;

    ASSERT_NETISR0;

    if (sopt->sopt_valsize != sizeof(*tbl))
        return (EINVAL);
    tbl = sopt->sopt_val;

    if (tbl->tableid < 0) {
        netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
            ipfw_table_zeroall_dispatch);
        netisr_domsg_global(&nm);
        return (0);
    } else if (tbl->tableid >= ipfw_table_max) {
        return (EINVAL);
    } else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
        return (ENOENT);
    }

    netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_table_zero_dispatch);
    nm.lmsg.u.ms_result = tbl->tableid;
    netisr_domsg_global(&nm);

    return (0);
}
static int
ipfw_table_killexp(struct radix_node *rn, void *xnm)
{
    struct netmsg_tblexp *nm = xnm;
    struct ipfw_tblent *te = (struct ipfw_tblent *)rn;

    if (te->te_expired) {
        ipfw_table_killrn(nm->rnh, rn);
        nm->expcnt++;
    }
    return (0);
}

static void
ipfw_table_expire_dispatch(netmsg_t nmsg)
{
    struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct radix_node_head *rnh;

    ASSERT_NETISR_NCPUS(mycpuid);

    rnh = ctx->ipfw_tables[nm->tableid];
    nm->rnh = rnh;
    rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);

    KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
        ("not all expired addresses (%d) were deleted (%d)",
         nm->cnt * (mycpuid + 1), nm->expcnt));

    netisr_forwardmsg(&nm->base, mycpuid + 1);
}

static void
ipfw_table_expireall_dispatch(netmsg_t nmsg)
{
    struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    int i;

    ASSERT_NETISR_NCPUS(mycpuid);

    for (i = 0; i < ipfw_table_max; ++i) {
        struct radix_node_head *rnh = ctx->ipfw_tables[i];

        if (rnh == NULL)
            continue;
        nm->rnh = rnh;
        rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
    }

    KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
        ("not all expired addresses (%d) were deleted (%d)",
         nm->cnt * (mycpuid + 1), nm->expcnt));

    netisr_forwardmsg(&nm->base, mycpuid + 1);
}

static int
ipfw_table_markexp(struct radix_node *rn, void *xnm)
{
    struct netmsg_tblexp *nm = xnm;
    struct ipfw_tblent *te;
    time_t lastuse;

    te = (struct ipfw_tblent *)rn;
    lastuse = te->te_lastuse;

    while ((te = te->te_sibling) != NULL) {
        if (te->te_lastuse > lastuse)
            lastuse = te->te_lastuse;
    }
    if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
        /* Not expired */
        return (0);
    }

    te = (struct ipfw_tblent *)rn;
    te->te_expired = 1;
    while ((te = te->te_sibling) != NULL)
        te->te_expired = 1;
    nm->cnt++;

    return (0);
}

static int
ipfw_table_expire(struct sockopt *sopt)
{
    struct ipfw_context *ctx = ipfw_ctx[mycpuid];
    struct netmsg_tblexp nm;
    struct ipfw_ioc_tblexp *tbl;
    struct radix_node_head *rnh;

    ASSERT_NETISR0;

    if (sopt->sopt_valsize != sizeof(*tbl))
        return (EINVAL);
    tbl = sopt->sopt_val;
    tbl->expcnt = 0;

    nm.expcnt = 0;
    nm.cnt = 0;
    nm.expire = tbl->expire;

    if (tbl->tableid < 0) {
        int i;

        for (i = 0; i < ipfw_table_max; ++i) {
            rnh = ctx->ipfw_tables[i];
            if (rnh == NULL)
                continue;
            rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
        }
        if (nm.cnt == 0) {
            /* No addresses can be expired. */
            return (0);
        }
        tbl->expcnt = nm.cnt;

        netmsg_init(&nm.base, NULL, &curthread->td_msgport,
            MSGF_PRIORITY, ipfw_table_expireall_dispatch);
        nm.tableid = -1;
        netisr_domsg_global(&nm.base);
        KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
            ("not all expired addresses (%d) were deleted (%d)",
             nm.cnt * netisr_ncpus, nm.expcnt));

        return (0);
    } else if (tbl->tableid >= ipfw_table_max) {
        return (EINVAL);
    }

    rnh = ctx->ipfw_tables[tbl->tableid];
    if (rnh == NULL)
        return (ENOENT);
    rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
    if (nm.cnt == 0) {
        /* No addresses can be expired. */
        return (0);
    }
    tbl->expcnt = nm.cnt;

    netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
        ipfw_table_expire_dispatch);
    nm.tableid = tbl->tableid;
    netisr_domsg_global(&nm.base);
    KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
        ("not all expired addresses (%d) were deleted (%d)",
         nm.cnt * netisr_ncpus, nm.expcnt));
    return (0);
}
static void
ipfw_crossref_free_dispatch(netmsg_t nmsg)
{
	struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;

	KKASSERT((rule->rule_flags &
	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
	ipfw_free_rule(rule);

	netisr_replymsg(&nmsg->base, 0);
}

static void
ipfw_crossref_reap(void)
{
	struct ip_fw *rule, *prev = NULL;

	ASSERT_NETISR0;

	rule = ipfw_gd.ipfw_crossref_free;
	while (rule != NULL) {
		uint64_t inflight = 0;
		int i;

		for (i = 0; i < netisr_ncpus; ++i)
			inflight += rule->cross_rules[i]->cross_refs;
		if (inflight == 0) {
			struct ip_fw *f = rule;

			/*
			 * Unlink.
			 */
			rule = rule->next;
			if (prev != NULL)
				prev->next = rule;
			else
				ipfw_gd.ipfw_crossref_free = rule;

			/*
			 * Free.
			 */
			for (i = 1; i < netisr_ncpus; ++i) {
				struct netmsg_base nm;

				netmsg_init(&nm, NULL, &curthread->td_msgport,
				    MSGF_PRIORITY, ipfw_crossref_free_dispatch);
				nm.lmsg.u.ms_resultp = f->cross_rules[i];
				netisr_domsg(&nm, i);
			}
			KKASSERT((f->rule_flags &
			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
			ipfw_unref_rule(f);
		} else {
			prev = rule;
			rule = rule->next;
		}
	}

	if (ipfw_gd.ipfw_crossref_free != NULL) {
		callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
		    ipfw_crossref_timeo, NULL);
	}
}

/*
 * {set|get}sockopt parser.
 */
static int
ipfw_ctl(struct sockopt *sopt)
{
	int error, rulenum;
	uint32_t *masks;
	size_t size;

	ASSERT_NETISR0;

	error = 0;

	switch (sopt->sopt_name) {
	case IP_FW_GET:
		error = ipfw_ctl_get_rules(sopt);
		break;

	case IP_FW_FLUSH:
		ipfw_flush(0 /* keep default rule */);
		break;

	case IP_FW_ADD:
		error = ipfw_ctl_add_rule(sopt);
		break;

	case IP_FW_DEL:
		/*
		 * IP_FW_DEL is used for deleting single rules or sets,
		 * and (ab)used to atomically manipulate sets.
		 * Argument size is used to distinguish between the two:
		 *   sizeof(uint32_t)
		 *	delete single rule or set of rules,
		 *	or reassign rules (or sets) to a different set.
		 *   2 * sizeof(uint32_t)
		 *	atomic disable/enable sets.
		 *	first uint32_t contains sets to be disabled,
		 *	second uint32_t contains sets to be enabled.
		 */
		masks = sopt->sopt_val;
		size = sopt->sopt_valsize;
		if (size == sizeof(*masks)) {
			/*
			 * Delete or reassign static rule
			 */
			error = ipfw_ctl_alter(masks[0]);
		} else if (size == (2 * sizeof(*masks))) {
			/*
			 * Set enable/disable
			 */
			ipfw_ctl_set_disable(masks[0], masks[1]);
		} else {
			error = EINVAL;
		}
		break;

	case IP_FW_ZERO:
	case IP_FW_RESETLOG: /* argument is an int, the rule number */
		rulenum = 0;

		if (sopt->sopt_val != 0) {
			error = soopt_to_kbuf(sopt, &rulenum,
			    sizeof(int), sizeof(int));
			if (error)
				break;
		}
		error = ipfw_ctl_zero_entry(rulenum,
		    sopt->sopt_name == IP_FW_RESETLOG);
		break;

	case IP_FW_TBL_CREATE:
		error = ipfw_table_create(sopt);
		break;

	case IP_FW_TBL_ADD:
	case IP_FW_TBL_DEL:
		error = ipfw_table_alt(sopt);
		break;

	case IP_FW_TBL_FLUSH:
	case IP_FW_TBL_DESTROY:
		error = ipfw_table_flush(sopt);
		break;

	case IP_FW_TBL_GET:
		error = ipfw_table_get(sopt);
		break;

	case IP_FW_TBL_ZERO:
		error = ipfw_table_zero(sopt);
		break;

	case IP_FW_TBL_EXPIRE:
		error = ipfw_table_expire(sopt);
		break;

	default:
		kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
		error = EINVAL;
	}

	ipfw_crossref_reap();
	return error;
}
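
/*
 * ipfw_ctl() is reached via {set,get}sockopt(2) on a raw IP socket, the
 * conventional ipfw(8) control channel.  The sketch below shows the two
 * IP_FW_DEL payload shapes decoded above; it is illustrative userland
 * code (error handling omitted), not an excerpt of ipfw(8).
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

static void
example_ipfw_del(void)
{
	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
	uint32_t rulenum = 1000;
	uint32_t masks[2] = { 0x1, 0x2 };  /* disable set 0, enable set 1 */

	/* sizeof(uint32_t) payload: delete rule (or set) 1000. */
	setsockopt(s, IPPROTO_IP, IP_FW_DEL, &rulenum, sizeof(rulenum));

	/* 2 * sizeof(uint32_t) payload: atomic set disable/enable. */
	setsockopt(s, IPPROTO_IP, IP_FW_DEL, masks, sizeof(masks));

	close(s);
}
#endif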
static void
ipfw_keepalive_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
	callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
	    ipfw_keepalive, NULL);
}

static void
ipfw_keepalive_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_keepalive_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("keepalive more did not finish"));
	netisr_sendmsg_oncpu(nm);
}

static void
ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0, kept = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		uint32_t ack_rev, ack_fwd;
		struct ipfw_flow_id id;
		uint8_t send_dir;

		if (scanned++ >= ipfw_state_scan_max) {
			ipfw_keepalive_more(ctx);
			return;
		}

		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		/*
		 * NOTE:
		 * Don't use IPFW_STATE_SCANSKIP; need to perform keepalive
		 * on slave xlat.
		 */
		if (s->st_type == O_ANCHOR)
			continue;

		if (IPFW_STATE_ISDEAD(s)) {
			ipfw_state_remove(ctx, s);
			if (++expired >= ipfw_state_expire_max) {
				ipfw_keepalive_more(ctx);
				return;
			}
			continue;
		}

		/*
		 * Keep alive processing
		 */

		if (s->st_proto != IPPROTO_TCP)
			continue;
		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
			continue;
		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
		    s->st_expire))
			continue;	/* too early */

		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
		    &id.dst_ip, &id.dst_port);
		ack_rev = s->st_ack_rev;
		ack_fwd = s->st_ack_fwd;

#define SEND_FWD	0x1
#define SEND_REV	0x2

		if (IPFW_ISXLAT(s->st_type)) {
			const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;

			if (x->xlat_dir == MATCH_FORWARD)
				send_dir = SEND_FWD;
			else
				send_dir = SEND_REV;
		} else {
			send_dir = SEND_FWD | SEND_REV;
		}

		if (send_dir & SEND_REV)
			send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
		if (send_dir & SEND_FWD)
			send_pkt(&id, ack_fwd - 1, ack_rev, 0);

#undef SEND_FWD
#undef SEND_REV

		if (++kept >= ipfw_keepalive_max) {
			ipfw_keepalive_more(ctx);
			return;
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_keepalive_done(ctx);
}
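
/*
 * The walk above uses the context's anchor state (st_type == O_ANCHOR) as
 * a movable cursor: after each element is examined the anchor is reinserted
 * behind it, so the scan can stop after a bounded amount of work (via
 * ipfw_keepalive_more()) and resume at exactly the same position on the
 * next message, even if states were added or removed in between.  A
 * stripped-down sketch of the idiom (not compiled):
 */
#if 0
static void
example_resumable_scan(struct ipfw_context *ctx, struct ipfw_state *anchor,
    int budget)
{
	struct ipfw_state *s;

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		if (budget-- <= 0)
			return;		/* resume later; anchor stays put */

		/* Slide the anchor past the element just examined. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (s->st_type == O_ANCHOR)
			continue;	/* another scan's cursor */

		/* ... examine state 's' here ... */
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
}
#endif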
static void
ipfw_keepalive_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_keepalive_anch;
	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		ipfw_keepalive_done(ctx);
		return;
	}
	ipfw_keepalive_loop(ctx, anchor);
}

/*
 * This procedure is only used to handle keepalives.  It is invoked
 * every dyn_keepalive_period seconds.
 */
static void
ipfw_keepalive_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
	    ("keepalive is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;

	/* Reply ASAP */
	crit_enter();
	netisr_replymsg(&nm->base, 0);
	crit_exit();

	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
		ipfw_keepalive_done(ctx);
		return;
	}

	anchor = &ctx->ipfw_keepalive_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_keepalive_loop(ctx, anchor);
}

/*
 * This procedure is only used to handle keepalives.  It is invoked
 * every dyn_keepalive_period seconds.
 */
static void
ipfw_keepalive(void *dummy __unused)
{
	struct netmsg_base *msg;

	KKASSERT(mycpuid < netisr_ncpus);
	msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;

	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}
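
/*
 * ipfw_keepalive() runs from a callout, while the real work runs in the
 * netisr.  The trampoline only re-sends the per-cpu, preallocated message
 * when the previous round has completed (MSGF_DONE); otherwise the tick is
 * simply skipped, so at most one keepalive message per cpu is ever in
 * flight no matter how far the netisr falls behind.  Sketch of the
 * trampoline shape (illustrative only, not compiled):
 */
#if 0
static void
example_timeo(void *arg)
{
	struct netmsg_base *msg = arg;	/* preallocated, MSGF_DROPABLE */

	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
	/* The dispatch side replies ASAP and rearms its callout. */
}
#endif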
static void
ipfw_ip_input_dispatch(netmsg_t nmsg)
{
	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct mbuf *m = nm->m;
	struct ip_fw *rule = nm->arg1;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(rule->cpuid == mycpuid,
	    ("rule does not belong to cpu%d", mycpuid));
	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
	    ("mbuf does not have ipfw continue rule"));

	KASSERT(ctx->ipfw_cont_rule == NULL,
	    ("pending ipfw continue rule"));
	ctx->ipfw_cont_rule = rule;
	ip_input(m);

	/* May not have been cleared, if ipfw was unloaded/disabled. */
	ctx->ipfw_cont_rule = NULL;

	/*
	 * This rule is no longer used; decrement its cross_refs,
	 * so this rule can be deleted.
	 */
	rule->cross_refs--;
}

static void
ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
{
	struct netmsg_genpkt *nm;

	KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));

	/*
	 * NOTE:
	 * Bump cross_refs to prevent this rule and its siblings
	 * from being deleted, while this mbuf is inflight.  The
	 * cross_refs of the sibling rule on the target cpu will
	 * be decremented once this mbuf has been filtered on the
	 * target cpu.
	 */
	rule->cross_refs++;
	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;

	nm = &m->m_hdr.mh_genmsg;
	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
	    ipfw_ip_input_dispatch);
	nm->m = m;
	nm->arg1 = rule->cross_rules[cpuid];
	netisr_sendmsg(&nm->base, cpuid);
}

static void
ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
{

	args->flags = 0;
	args->rule = NULL;
	args->xlat = NULL;

	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
		struct m_tag *mtag;

		/* Extract info from dummynet tag */
		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
		KKASSERT(mtag != NULL);
		args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
		KKASSERT(args->rule != NULL);

		m_tag_delete(m, mtag);
		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
	} else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
		struct ipfw_context *ctx = ipfw_ctx[mycpuid];

		KKASSERT(ctx->ipfw_cont_rule != NULL);
		args->rule = ctx->ipfw_cont_rule;
		ctx->ipfw_cont_rule = NULL;

		if (ctx->ipfw_cont_xlat != NULL) {
			args->xlat = ctx->ipfw_cont_xlat;
			ctx->ipfw_cont_xlat = NULL;
			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
				args->flags |= IP_FWARG_F_XLATINS;
				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
			}
			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
				args->flags |= IP_FWARG_F_XLATFWD;
				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
			}
		} else {
			KKASSERT((m->m_pkthdr.fw_flags &
			    (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);
		}

		args->flags |= IP_FWARG_F_CONT;
		m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
	}

	args->eh = NULL;
	args->oif = oif;
	args->m = m;
}
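
/*
 * The dummynet branch above shows the read side of the m_tag pattern:
 * find the tag by type, read its payload via m_tag_data(), then delete it
 * so the metadata makes exactly one trip with the mbuf.  The attach side
 * below is a generic sketch, not dummynet's actual enqueue path, and the
 * allocation wait flag is an assumption that may differ by kernel version.
 */
#if 0
static void
example_tag_attach(struct mbuf *m, void *priv)
{
	struct m_tag *mtag;

	mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(struct dn_pkt),
	    M_NOWAIT);			/* wait flag assumed */
	if (mtag == NULL)
		return;			/* caller must handle the miss */
	((struct dn_pkt *)m_tag_data(mtag))->dn_priv = priv;
	m_tag_prepend(m, mtag);
	m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
}
#endif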
static int
ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
{
	struct ip_fw_args args;
	struct mbuf *m = *m0;
	int tee = 0, error = 0, ret;

	ipfw_init_args(&args, m, NULL);

	ret = ipfw_chk(&args);
	m = args.m;
	if (m == NULL) {
		if (ret != IP_FW_REDISPATCH)
			error = EACCES;
		goto back;
	}

	switch (ret) {
	case IP_FW_PASS:
		break;

	case IP_FW_DENY:
		m_freem(m);
		m = NULL;
		error = EACCES;
		break;

	case IP_FW_DUMMYNET:
		/* Send packet to the appropriate pipe */
		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
		break;

	case IP_FW_TEE:
		tee = 1;
		/* FALL THROUGH */

	case IP_FW_DIVERT:
		/*
		 * Must clear the bridge tag when handing the packet
		 * over to divert(4).
		 */
		m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
		if (ip_divert_p != NULL) {
			m = ip_divert_p(m, tee, 1);
		} else {
			m_freem(m);
			m = NULL;
			/* not sure this is the right error msg */
			error = EACCES;
		}
		break;

	default:
		panic("unknown ipfw return value: %d", ret);
	}
back:
	*m0 = m;
	return error;
}

static int
ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
{
	struct ip_fw_args args;
	struct mbuf *m = *m0;
	int tee = 0, error = 0, ret;

	ipfw_init_args(&args, m, ifp);

	ret = ipfw_chk(&args);
	m = args.m;
	if (m == NULL) {
		if (ret != IP_FW_REDISPATCH)
			error = EACCES;
		goto back;
	}

	switch (ret) {
	case IP_FW_PASS:
		break;

	case IP_FW_DENY:
		m_freem(m);
		m = NULL;
		error = EACCES;
		break;

	case IP_FW_DUMMYNET:
		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
		break;

	case IP_FW_TEE:
		tee = 1;
		/* FALL THROUGH */

	case IP_FW_DIVERT:
		if (ip_divert_p != NULL) {
			m = ip_divert_p(m, tee, 0);
		} else {
			m_freem(m);
			m = NULL;
			/* not sure this is the right error msg */
			error = EACCES;
		}
		break;

	default:
		panic("unknown ipfw return value: %d", ret);
	}
back:
	*m0 = m;
	return error;
}
static void
ipfw_hook(void)
{
	struct pfil_head *pfh;

	ASSERT_NETISR0;

	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
	if (pfh == NULL)
		return;

	pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
	pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
}

static void
ipfw_dehook(void)
{
	struct pfil_head *pfh;

	ASSERT_NETISR0;

	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
	if (pfh == NULL)
		return;

	pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
	pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
}

static int
ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
{
	int dyn_cnt;

	dyn_cnt = ipfw_state_cntcoll();
	dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;

	return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
}

static int
ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
{
	int state_cnt;

	state_cnt = ipfw_state_cntcoll();
	return (sysctl_handle_int(oidp, &state_cnt, 0, req));
}

static int
ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
{
	int state_max, error;

	state_max = ipfw_state_max;
	error = sysctl_handle_int(oidp, &state_max, 0, req);
	if (error || req->newptr == NULL)
		return (error);

	if (state_max < 1)
		return (EINVAL);

	ipfw_state_max_set(state_max);
	return (0);
}

static int
ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
{
	int dyn_max, error;

	dyn_max = ipfw_state_max + ipfw_track_max;

	error = sysctl_handle_int(oidp, &dyn_max, 0, req);
	if (error || req->newptr == NULL)
		return (error);

	if (dyn_max < 2)
		return (EINVAL);

	ipfw_state_max_set(dyn_max / 2);
	ipfw_track_max = dyn_max / 2;
	return (0);
}
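
/*
 * The handlers above all follow the standard two-phase sysctl shape:
 * sysctl_handle_int() exports the current value and, on a write, copies
 * the new one in; the handler then validates and applies it.  The
 * declaration below sketches how such a handler is typically wired up.
 * The net.inet.ip.fw parent follows ipfw convention, but this line is
 * illustrative; the file's real SYSCTL_PROC declarations live elsewhere.
 */
#if 0
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
    "Max # of states");
#endif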
static void
ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
{
	int enable = nmsg->lmsg.u.ms_result;

	ASSERT_NETISR0;

	if (fw_enable == enable)
		goto reply;

	fw_enable = enable;
	if (fw_enable)
		ipfw_hook();
	else
		ipfw_dehook();
reply:
	netisr_replymsg(&nmsg->base, 0);
}

static int
ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
{
	struct netmsg_base nmsg;
	int enable, error;

	enable = fw_enable;
	error = sysctl_handle_int(oidp, &enable, 0, req);
	if (error || req->newptr == NULL)
		return error;

	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_sysctl_enable_dispatch);
	nmsg.lmsg.u.ms_result = enable;

	return netisr_domsg(&nmsg, 0);
}

static int
ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
{
	return sysctl_int_range(oidp, arg1, arg2, req,
	    IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
}

static int
ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
{

	return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
}

static int
ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
{
	u_long stat = 0;
	int cpu, error;

	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));

	error = sysctl_handle_long(oidp, &stat, 0, req);
	if (error || req->newptr == NULL)
		return (error);

	/* Zero out this stat. */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		*((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
	return (0);
}
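
/*
 * ipfw_sysctl_stat() is a single handler for every per-cpu counter:
 * arg2 carries the counter's byte offset inside struct ipfw_context, a
 * read sums the counter across all netisr cpus, and any write zeroes it
 * everywhere.  A sketch of a matching OID declaration follows; the
 * ipfw_example_cnt field name is hypothetical.
 */
#if 0
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, example_cnt,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_example_cnt),
    ipfw_sysctl_stat, "LU", "Example per-cpu counter");
#endif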
static void
ipfw_ctx_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx;
	struct ip_fw *def_rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	ctx = kmalloc(__offsetof(struct ipfw_context,
	    ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);

	RB_INIT(&ctx->ipfw_state_tree);
	TAILQ_INIT(&ctx->ipfw_state_list);

	RB_INIT(&ctx->ipfw_track_tree);
	TAILQ_INIT(&ctx->ipfw_track_list);

	callout_init_mp(&ctx->ipfw_stateto_ch);
	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);

	callout_init_mp(&ctx->ipfw_trackto_ch);
	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);

	callout_init_mp(&ctx->ipfw_keepalive_ch);
	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);

	callout_init_mp(&ctx->ipfw_xlatreap_ch);
	netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
	TAILQ_INIT(&ctx->ipfw_xlatreap);

	ipfw_ctx[mycpuid] = ctx;

	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);

	def_rule->act_ofs = 0;
	def_rule->rulenum = IPFW_DEFAULT_RULE;
	def_rule->cmd_len = 1;
	def_rule->set = IPFW_DEFAULT_SET;

	def_rule->cmd[0].len = 1;
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
	def_rule->cmd[0].opcode = O_ACCEPT;
#else
	if (filters_default_to_accept)
		def_rule->cmd[0].opcode = O_ACCEPT;
	else
		def_rule->cmd[0].opcode = O_DENY;
#endif

	def_rule->refcnt = 1;
	def_rule->cpuid = mycpuid;

	/* Install the default rule */
	ctx->ipfw_default_rule = def_rule;
	ctx->ipfw_layer3_chain = def_rule;

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, def_rule);

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_inc_static_count(def_rule);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
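
/*
 * The context above is allocated with its trailing ipfw_tables[] array
 * sized at module-load time: kmalloc'ing __offsetof(type, array[n]) gets
 * the fixed header plus exactly n slots in one allocation.  A minimal
 * sketch of the idiom with hypothetical example_* names (not compiled):
 */
#if 0
struct example_ctx {
	int	ec_count;
	void	*ec_slots[1];		/* really ec_count slots */
};

static struct example_ctx *
example_ctx_alloc(int n)
{
	struct example_ctx *ec;

	ec = kmalloc(__offsetof(struct example_ctx, ec_slots[n]),
	    M_IPFW, M_WAITOK | M_ZERO);
	ec->ec_count = n;
	return (ec);
}
#endif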
static void
ipfw_crossref_reap_dispatch(netmsg_t nmsg)
{

	crit_enter();
	/* Reply ASAP */
	netisr_replymsg(&nmsg->base, 0);
	crit_exit();
	ipfw_crossref_reap();
}

static void
ipfw_crossref_timeo(void *dummy __unused)
{
	struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;

	KKASSERT(mycpuid == 0);

	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}

static void
ipfw_ifaddr_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
	struct ip_fw *f;

	ASSERT_NETISR_NCPUS(mycpuid);

	for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
		int l, cmdlen;
		ipfw_insn *cmd;

		if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
			continue;

		for (l = f->cmd_len, cmd = f->cmd; l > 0;
		     l -= cmdlen, cmd += cmdlen) {
			cmdlen = F_LEN(cmd);
			if (cmd->opcode == O_IP_SRC_IFIP ||
			    cmd->opcode == O_IP_DST_IFIP) {
				if (strncmp(ifp->if_xname,
				    ((ipfw_insn_ifip *)cmd)->ifname,
				    IFNAMSIZ) == 0)
					cmd->arg1 &= ~IPFW_IFIP_VALID;
			}
		}
	}
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static void
ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
    enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
{
	struct netmsg_base nm;

	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ifaddr_dispatch);
	nm.lmsg.u.ms_resultp = ifp;
	netisr_domsg_global(&nm);
}
static void
ipfw_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw fwmsg;
	int error = 0, cpu;

	ASSERT_NETISR0;

	if (IPFW_LOADED) {
		kprintf("IP firewall already loaded\n");
		error = EEXIST;
		goto reply;
	}

	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
		ipfw_table_max = UINT16_MAX;

	/* Initialize global track tree. */
	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
	IPFW_TRKCNT_TOKINIT;

	/* GC for freed crossref rules. */
	callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
	netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
	    MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);

	ipfw_state_max_set(ipfw_state_max);
	ipfw_state_headroom = 8 * netisr_ncpus;

	bzero(&fwmsg, sizeof(fwmsg));
	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_init_dispatch);
	netisr_domsg_global(&fwmsg.base);

	ip_fw_chk_ptr = ipfw_chk;
	ip_fw_ctl_ptr = ipfw_ctl;
	ip_fw_dn_io_ptr = ipfw_dummynet_io;

	kprintf("ipfw2 initialized, default to %s, logging ",
	    ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
	    O_ACCEPT ? "accept" : "deny");

#ifdef IPFIREWALL_VERBOSE
	fw_verbose = 1;
#endif
#ifdef IPFIREWALL_VERBOSE_LIMIT
	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#endif
	if (fw_verbose == 0) {
		kprintf("disabled\n");
	} else if (verbose_limit == 0) {
		kprintf("unlimited\n");
	} else {
		kprintf("limited to %d packets/entry by default\n",
		    verbose_limit);
	}

	ip_fw_loaded = 1;
	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
		    ipfw_state_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
		    ipfw_track_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
		    ipfw_keepalive, NULL, cpu);
	}

	if (fw_enable)
		ipfw_hook();

	ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
	    NULL, EVENTHANDLER_PRI_ANY);
	if (ipfw_ifaddr_event == NULL)
		kprintf("ipfw: ifaddr_event register failed\n");

reply:
	netisr_replymsg(&nmsg->base, error);
}

static int
ipfw_init(void)
{
	struct netmsg_base smsg;

	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_init_dispatch);
	return netisr_domsg(&smsg, 0);
}
#ifdef KLD_MODULE

static void
ipfw_ctx_fini_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	callout_cancel(&ctx->ipfw_stateto_ch);
	callout_cancel(&ctx->ipfw_trackto_ch);
	callout_cancel(&ctx->ipfw_keepalive_ch);
	callout_cancel(&ctx->ipfw_xlatreap_ch);

	crit_enter();
	netisr_dropmsg(&ctx->ipfw_stateexp_more);
	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
	netisr_dropmsg(&ctx->ipfw_trackexp_more);
	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
	netisr_dropmsg(&ctx->ipfw_keepalive_more);
	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
	netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
	crit_exit();

	ipfw_table_flushall_oncpu(ctx, 1);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

static void
ipfw_fini_dispatch(netmsg_t nmsg)
{
	struct netmsg_base nm;
	int error = 0, cpu;

	ASSERT_NETISR0;

	ipfw_crossref_reap();

	if (ipfw_gd.ipfw_refcnt != 0) {
		error = EBUSY;
		goto reply;
	}

	ip_fw_loaded = 0;
	ipfw_dehook();

	/* Synchronize any inflight state/track expire IPIs. */
	lwkt_synchronize_ipiqs("ipfwfini");

	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_fini_dispatch);
	netisr_domsg_global(&nm);

	callout_cancel(&ipfw_gd.ipfw_crossref_ch);
	crit_enter();
	netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
	crit_exit();

	if (ipfw_ifaddr_event != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);

	ip_fw_chk_ptr = NULL;
	ip_fw_ctl_ptr = NULL;
	ip_fw_dn_io_ptr = NULL;
	ipfw_flush(1 /* kill default rule */);

	/* Free per-cpu context. */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		kfree(ipfw_ctx[cpu], M_IPFW);

	kprintf("IP firewall unloaded\n");
reply:
	netisr_replymsg(&nmsg->base, error);
}

static void
ipfw_fflush_dispatch(netmsg_t nmsg)
{

	ipfw_flush(0 /* keep default rule */);
	ipfw_crossref_reap();
	netisr_replymsg(&nmsg->base, 0);
}

static int
ipfw_fini(void)
{
	struct netmsg_base smsg;
	int i = 0;

	for (;;) {
		netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
		    ipfw_fflush_dispatch);
		netisr_domsg(&smsg, 0);

		if (ipfw_gd.ipfw_refcnt == 0)
			break;
		kprintf("ipfw: flush pending %d\n", ++i);
		tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
	}

	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_fini_dispatch);
	return netisr_domsg(&smsg, 0);
}

#endif	/* KLD_MODULE */
static int
ipfw_modevent(module_t mod, int type, void *unused)
{
	int err = 0;

	switch (type) {
	case MOD_LOAD:
		err = ipfw_init();
		break;

	case MOD_UNLOAD:
#ifndef KLD_MODULE
		kprintf("ipfw statically compiled, cannot unload\n");
		err = EBUSY;
#else
		err = ipfw_fini();
#endif
		break;
	default:
		break;
	}
	return err;
}

static moduledata_t ipfwmod = {
	"ipfw",
	ipfw_modevent,
	0
};
DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
MODULE_VERSION(ipfw, 1);