ipfw: Add interface network filter.
[dragonfly.git] / sys / net / ipfw / ip_fw2.c
1 /*
2 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
25 * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
29 * Implement IP packet firewall (new version)
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
59 #include <sys/thread2.h>
60 #include <sys/mplock2.h>
61 #include <net/netmsg2.h>
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/udp.h>
76 #include <netinet/udp_var.h>
77 #include <netinet/ip_divert.h>
78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
80 #include <net/ipfw/ip_fw2.h>
82 #ifdef IPFIREWALL_DEBUG
83 #define DPRINTF(fmt, ...) \
84 do { \
85 if (fw_debug > 0) \
86 kprintf(fmt, __VA_ARGS__); \
87 } while (0)
88 #else
89 #define DPRINTF(fmt, ...) ((void)0)
90 #endif
93 * Description of per-CPU rule duplication:
95 * Module loading/unloading and all ioctl operations are serialized
96 * by netisr0, so we don't have any ordering or locking problems.
98 * The following graph shows how operations on the per-CPU rule list
99 * are performed [2 CPU case]:
101 * CPU0 CPU1
103 * netisr0 <------------------------------------+
104 * domsg |
105 * : |
106 * :(delete/add...) |
107 * : |
108 * : netmsg | netmsg
109 * forwardmsg---------->netisr1 |
110 * : |
111 * :(delete/add...) |
112 * : |
113 * : |
114 * replymsg--------------+
118 * Rule structure [2 CPU case]
120 * CPU0 CPU1
122 * layer3_chain layer3_chain
123 * | |
124 * V V
125 * +-------+ sibling +-------+ sibling
126 * | rule1 |--------->| rule1 |--------->NULL
127 * +-------+ +-------+
128 * | |
129 * |next |next
130 * V V
131 * +-------+ sibling +-------+ sibling
132 * | rule2 |--------->| rule2 |--------->NULL
133 * +-------+ +-------+
135 * ip_fw.sibling:
136 * 1) Ease statistics calculation during IP_FW_GET. We only need to
137 * iterate layer3_chain in netisr0; the current rule's duplicates
138 * on the other CPUs can safely be accessed read-only through
139 * ip_fw.sibling.
140 * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
141 * a) In netisr0 rule3 is determined to be inserted between rule1
142 * and rule2. To make this decision we need to iterate the
143 * layer3_chain in netisr0. The netmsg, which is used to insert
144 * the rule, will contain rule1 in netisr0 as prev_rule and rule2
145 * in netisr0 as next_rule.
146 * b) After the insertion in netisr0 is done, we will move on to
147 * netisr1. But instead of relocating the rule3's position in
148 * netisr1 by iterating the layer3_chain in netisr1, we set the
149 * netmsg's prev_rule to rule1->sibling and next_rule to
150 * rule2->sibling before the netmsg is forwarded to netisr1 from
151 * netisr0.
155 * Description of states and tracks.
157 * Both states and tracks are stored in per-cpu RB trees instead of
158 * per-cpu hash tables to avoid the worst case hash degeneration.
160 * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161 * measured in seconds and depending on the flags.
163 * When a packet is received, its address fields are first masked with
164 * the mask defined for the rule, then matched against the entries in
165 * the per-cpu state RB tree. States are generated by 'keep-state'
166 * and 'limit' options.
168 * The max number of states is ipfw_state_max. When we reach the
169 * maximum number of states we do not create any more. This is done
170 * to avoid consuming too much memory, and also too much time when
171 * searching on each packet.
173 * Each state holds a pointer to the parent ipfw rule of the current
174 * CPU so we know what action to perform. States are removed when the
175 * parent rule is deleted. XXX we should make them survive.
177 * There are some limitations with states -- we do not obey the
178 * 'randomized match', and we do not do multiple passes through the
179 * firewall. XXX check the latter!!!
181 * States grow independently on each CPU, e.g. 2 CPU case:
183 * CPU0 CPU1
184 * ................... ...................
185 * : state RB tree : : state RB tree :
186 * : : : :
187 * : state1 state2 : : state3 :
188 * : | | : : | :
189 * :.....|....|......: :........|........:
190 * | | |
191 * | | |st_rule
192 * | | |
193 * V V V
194 * +-------+ +-------+
195 * | rule1 | | rule1 |
196 * +-------+ +-------+
198 * Tracks are used to enforce limits on the number of sessions. Tracks
199 * are generated by the 'limit' option.
201 * The max number of tracks is ipfw_track_max. When we reach the
202 * maximum number of tracks we do not create any more. This is done to
203 * avoid consuming too much memory.
205 * Tracks are organized into two layers: the track counter RB tree is
206 * shared between CPUs, while the track RB tree is per-cpu. States
207 * generated by the 'limit' option are linked to the track in addition
208 * to the per-cpu state RB tree, mainly to ease expiration. e.g. 2 CPU case:
210 * ..............................
211 * : track counter RB tree :
212 * : :
213 * : +-----------+ :
214 * : | trkcnt1 | :
215 * : | | :
216 * : +--->counter<----+ :
217 * : | | | | :
218 * : | +-----------+ | :
219 * :......|................|....:
220 * | |
221 * CPU0 | | CPU1
222 * ................. |t_count | .................
223 * : track RB tree : | | : track RB tree :
224 * : : | | : :
225 * : +-->track1-------+ +--------track2 :
226 * : | A : : :
227 * : | | : : :
228 * :.|.....|.......: :...............:
229 * | +----------------+
230 * | .................... |
231 * | : state RB tree : |st_track
232 * | : : |
233 * +---state1 state2---+
234 * : | | :
235 * :.....|.......|....:
236 * | |
237 * | |st_rule
238 * V V
239 * +----------+
240 * | rule1 |
241 * +----------+
244 #define IPFW_AUTOINC_STEP_MIN 1
245 #define IPFW_AUTOINC_STEP_MAX 1000
246 #define IPFW_AUTOINC_STEP_DEF 100
248 #define IPFW_TABLE_MAX_DEF 64
250 #define IPFW_DEFAULT_RULE 65535 /* rulenum for the default rule */
251 #define IPFW_DEFAULT_SET 31 /* set number for the default rule */
253 #define MATCH_REVERSE 0
254 #define MATCH_FORWARD 1
255 #define MATCH_NONE 2
256 #define MATCH_UNKNOWN 3
258 #define IPFW_STATE_TCPFLAGS (TH_SYN | TH_FIN | TH_RST)
259 #define IPFW_STATE_TCPSTATES (IPFW_STATE_TCPFLAGS | \
260 (IPFW_STATE_TCPFLAGS << 8))
262 #define BOTH_SYN (TH_SYN | (TH_SYN << 8))
263 #define BOTH_FIN (TH_FIN | (TH_FIN << 8))
264 #define BOTH_RST (TH_RST | (TH_RST << 8))
265 /* TH_ACK here means FIN was ACKed. */
266 #define BOTH_FINACK (TH_ACK | (TH_ACK << 8))
268 #define IPFW_STATE_TCPCLOSED(s) ((s)->st_proto == IPPROTO_TCP && \
269 (((s)->st_state & BOTH_RST) || \
270 ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
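/*
 * O_ANCHOR marks the dummy anchor states (e.g. ipfw_stateexp_anch and
 * ipfw_keepalive_anch) that are threaded onto the per-cpu state list
 * while an expiration or keepalive scan is in progress; real states
 * never use this type and the scan loops simply skip such entries.
 */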
272 #define O_ANCHOR O_NOP
274 struct netmsg_ipfw {
275 struct netmsg_base base;
276 const struct ipfw_ioc_rule *ioc_rule;
277 struct ip_fw *next_rule;
278 struct ip_fw *prev_rule;
279 struct ip_fw *sibling;
280 uint32_t rule_flags;
281 struct ip_fw **cross_rules;
284 struct netmsg_del {
285 struct netmsg_base base;
286 struct ip_fw *start_rule;
287 struct ip_fw *prev_rule;
288 uint16_t rulenum;
289 uint8_t from_set;
290 uint8_t to_set;
293 struct netmsg_zent {
294 struct netmsg_base base;
295 struct ip_fw *start_rule;
296 uint16_t rulenum;
297 uint16_t log_only;
300 struct netmsg_cpstate {
301 struct netmsg_base base;
302 struct ipfw_ioc_state *ioc_state;
303 int state_cntmax;
304 int state_cnt;
307 struct netmsg_tblent {
308 struct netmsg_base base;
309 struct sockaddr *key;
310 struct sockaddr *netmask;
311 struct ipfw_tblent *sibling;
312 int tableid;
315 struct netmsg_tblflush {
316 struct netmsg_base base;
317 int tableid;
318 int destroy;
321 struct netmsg_tblexp {
322 struct netmsg_base base;
323 time_t expire;
324 int tableid;
325 int cnt;
326 int expcnt;
327 struct radix_node_head *rnh;
330 struct ipfw_table_cp {
331 struct ipfw_ioc_tblent *te;
332 int te_idx;
333 int te_cnt;
336 struct ip_fw_local {
338 * offset The offset of a fragment. offset != 0 means that
339 * we have a fragment at this offset of an IPv4 packet.
340 * offset == 0 means that (if this is an IPv4 packet)
341 * this is the first or only fragment.
343 u_short offset;
346 * Local copies of addresses. They are only valid if we have
347 * an IP packet.
349 * proto The protocol. Set to 0 for non-ip packets,
350 * or to the protocol read from the packet otherwise.
351 * proto != 0 means that we have an IPv4 packet.
353 * src_port, dst_port port numbers, in HOST format. Only
354 * valid for TCP and UDP packets.
356 * src_ip, dst_ip ip addresses, in NETWORK format.
357 * Only valid for IPv4 packets.
359 uint8_t proto;
360 uint16_t src_port; /* NOTE: host format */
361 uint16_t dst_port; /* NOTE: host format */
362 struct in_addr src_ip; /* NOTE: network format */
363 struct in_addr dst_ip; /* NOTE: network format */
364 uint16_t ip_len;
367 struct ipfw_addrs {
368 uint32_t addr1;
369 uint32_t addr2;
372 struct ipfw_ports {
373 uint16_t port1;
374 uint16_t port2;
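/*
 * Canonical flow key: addresses and ports are stored in a fixed
 * (sorted) order so that both directions of a connection produce the
 * same key.  The 'swap' field records which fields were reordered
 * (IPFW_KEY_SWAP_*), allowing the original direction to be recovered;
 * see ipfw_key_build() and ipfw_key_4tuple().
 */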
377 struct ipfw_key {
378 union {
379 struct ipfw_addrs addrs;
380 uint64_t value;
381 } addr_u;
382 union {
383 struct ipfw_ports ports;
384 uint32_t value;
385 } port_u;
386 uint8_t proto;
387 uint8_t swap; /* IPFW_KEY_SWAP_ */
388 uint16_t rsvd2;
391 #define IPFW_KEY_SWAP_ADDRS 0x1
392 #define IPFW_KEY_SWAP_PORTS 0x2
393 #define IPFW_KEY_SWAP_ALL (IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
395 struct ipfw_trkcnt {
396 RB_ENTRY(ipfw_trkcnt) tc_rblink;
397 struct ipfw_key tc_key;
398 uintptr_t tc_ruleid;
399 int tc_refs;
400 int tc_count;
401 time_t tc_expire; /* userland get-only */
402 uint16_t tc_rulenum; /* userland get-only */
403 } __cachealign;
405 #define tc_addrs tc_key.addr_u.value
406 #define tc_ports tc_key.port_u.value
407 #define tc_proto tc_key.proto
408 #define tc_saddr tc_key.addr_u.addrs.addr1
409 #define tc_daddr tc_key.addr_u.addrs.addr2
410 #define tc_sport tc_key.port_u.ports.port1
411 #define tc_dport tc_key.port_u.ports.port2
413 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
415 struct ipfw_state;
417 struct ipfw_track {
418 RB_ENTRY(ipfw_track) t_rblink;
419 struct ipfw_key t_key;
420 struct ip_fw *t_rule;
421 time_t t_lastexp;
422 LIST_HEAD(, ipfw_state) t_state_list;
423 time_t t_expire;
424 volatile int *t_count;
425 struct ipfw_trkcnt *t_trkcnt;
426 TAILQ_ENTRY(ipfw_track) t_link;
429 #define t_addrs t_key.addr_u.value
430 #define t_ports t_key.port_u.value
431 #define t_proto t_key.proto
432 #define t_saddr t_key.addr_u.addrs.addr1
433 #define t_daddr t_key.addr_u.addrs.addr2
434 #define t_sport t_key.port_u.ports.port1
435 #define t_dport t_key.port_u.ports.port2
437 RB_HEAD(ipfw_track_tree, ipfw_track);
438 TAILQ_HEAD(ipfw_track_list, ipfw_track);
440 struct ipfw_state {
441 RB_ENTRY(ipfw_state) st_rblink;
442 struct ipfw_key st_key;
444 time_t st_expire; /* expire time */
445 struct ip_fw *st_rule;
447 uint64_t st_pcnt; /* packets */
448 uint64_t st_bcnt; /* bytes */
451 * st_state:
452 * State of this rule, typically a combination of TCP flags.
454 * st_ack_fwd/st_ack_rev:
455 * Most recent ACKs in forward and reverse direction. They
456 * are used to generate keepalives.
458 uint32_t st_state;
459 uint32_t st_ack_fwd;
460 uint32_t st_seq_fwd;
461 uint32_t st_ack_rev;
462 uint32_t st_seq_rev;
464 uint16_t st_flags; /* IPFW_STATE_F_ */
465 uint16_t st_type; /* O_KEEP_STATE/O_LIMIT */
466 struct ipfw_track *st_track;
468 LIST_ENTRY(ipfw_state) st_trklink;
469 TAILQ_ENTRY(ipfw_state) st_link;
472 #define st_addrs st_key.addr_u.value
473 #define st_ports st_key.port_u.value
474 #define st_proto st_key.proto
475 #define st_swap st_key.swap
477 #define IPFW_STATE_F_ACKFWD 0x0001
478 #define IPFW_STATE_F_SEQFWD 0x0002
479 #define IPFW_STATE_F_ACKREV 0x0004
480 #define IPFW_STATE_F_SEQREV 0x0008
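/*
 * The IPFW_STATE_F_* flags record whether st_ack_fwd, st_seq_fwd,
 * st_ack_rev and st_seq_rev have been initialized from a packet yet;
 * ipfw_state_update_tcp() only trusts these fields once the
 * corresponding flag has been set.
 */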
482 TAILQ_HEAD(ipfw_state_list, ipfw_state);
483 RB_HEAD(ipfw_state_tree, ipfw_state);
485 struct ipfw_tblent {
486 struct radix_node te_nodes[2];
487 struct sockaddr_in te_key;
488 u_long te_use;
489 time_t te_lastuse;
490 struct ipfw_tblent *te_sibling;
491 volatile int te_expired;
494 struct ipfw_context {
495 struct ip_fw *ipfw_layer3_chain; /* rules for layer3 */
496 struct ip_fw *ipfw_default_rule; /* default rule */
497 uint64_t ipfw_norule_counter; /* ipfw_log(NULL) stat*/
500 * ipfw_set_disable contains one bit per set value (0..31).
501 * If the bit is set, all rules with the corresponding set
502 * are disabled. Set IPFW_DEFAULT_SET is reserved for the
503 * default rule and CANNOT be disabled.
505 uint32_t ipfw_set_disable;
507 uint8_t ipfw_flags; /* IPFW_FLAG_ */
509 struct ip_fw *ipfw_cont_rule;
511 struct ipfw_state_tree ipfw_state_tree;
512 struct ipfw_state_list ipfw_state_list;
513 int ipfw_state_loosecnt;
514 int ipfw_state_cnt;
516 union {
517 struct ipfw_state state;
518 struct ipfw_track track;
519 struct ipfw_trkcnt trkcnt;
520 } ipfw_tmpkey;
522 struct ipfw_track_tree ipfw_track_tree;
523 struct ipfw_track_list ipfw_track_list;
524 struct ipfw_trkcnt *ipfw_trkcnt_spare;
526 struct callout ipfw_stateto_ch;
527 time_t ipfw_state_lastexp;
528 struct netmsg_base ipfw_stateexp_nm;
529 struct netmsg_base ipfw_stateexp_more;
530 struct ipfw_state ipfw_stateexp_anch;
532 struct callout ipfw_trackto_ch;
533 time_t ipfw_track_lastexp;
534 struct netmsg_base ipfw_trackexp_nm;
535 struct netmsg_base ipfw_trackexp_more;
536 struct ipfw_track ipfw_trackexp_anch;
538 struct callout ipfw_keepalive_ch;
539 struct netmsg_base ipfw_keepalive_nm;
540 struct netmsg_base ipfw_keepalive_more;
541 struct ipfw_state ipfw_keepalive_anch;
544 * Statistics
546 u_long ipfw_sts_reap;
547 u_long ipfw_sts_reapfailed;
548 u_long ipfw_sts_overflow;
549 u_long ipfw_sts_nomem;
550 u_long ipfw_sts_tcprecycled;
552 u_long ipfw_tks_nomem;
553 u_long ipfw_tks_reap;
554 u_long ipfw_tks_reapfailed;
555 u_long ipfw_tks_overflow;
556 u_long ipfw_tks_cntnomem;
558 u_long ipfw_frags;
559 u_long ipfw_defraged;
560 u_long ipfw_defrag_remote;
562 /* Last field */
563 struct radix_node_head *ipfw_tables[];
566 #define IPFW_FLAG_KEEPALIVE 0x01
567 #define IPFW_FLAG_STATEEXP 0x02
568 #define IPFW_FLAG_TRACKEXP 0x04
569 #define IPFW_FLAG_STATEREAP 0x08
570 #define IPFW_FLAG_TRACKREAP 0x10
572 #define ipfw_state_tmpkey ipfw_tmpkey.state
573 #define ipfw_track_tmpkey ipfw_tmpkey.track
574 #define ipfw_trkcnt_tmpkey ipfw_tmpkey.trkcnt
576 struct ipfw_global {
577 int ipfw_state_loosecnt; /* cache aligned */
578 time_t ipfw_state_globexp __cachealign;
580 struct lwkt_token ipfw_trkcnt_token __cachealign;
581 struct ipfw_trkcnt_tree ipfw_trkcnt_tree;
582 int ipfw_trkcnt_cnt;
583 time_t ipfw_track_globexp;
585 /* Accessed in netisr0. */
586 struct ip_fw *ipfw_crossref_free __cachealign;
587 struct callout ipfw_crossref_ch;
588 struct netmsg_base ipfw_crossref_nm;
590 #ifdef KLD_MODULE
592 * The module cannot be unloaded if there are references to
593 * certain rules of ipfw(4), e.g. from dummynet(4).
595 int ipfw_refcnt __cachealign;
596 #endif
597 } __cachealign;
599 static struct ipfw_context *ipfw_ctx[MAXCPU];
601 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
604 * The following two global variables are accessed and updated only
605 * in netisr0.
607 static uint32_t static_count; /* # of static rules */
608 static uint32_t static_ioc_len; /* bytes of static rules */
611 * If 1, ipfw static rules are being flushed and
612 * ipfw_chk() will skip to the default rule.
614 static int ipfw_flushing;
616 static int fw_verbose;
617 static int verbose_limit;
619 static int fw_debug;
620 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
622 static int ipfw_table_max = IPFW_TABLE_MAX_DEF;
624 static int ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
625 static int ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
627 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
629 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
630 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
631 "Firewall statistics");
633 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
634 &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
635 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
636 &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
637 "Rule number autincrement step");
638 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
639 &fw_one_pass, 0,
640 "Only do a single pass through ipfw when using dummynet(4)");
641 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
642 &fw_debug, 0, "Enable printing of debug ip_fw statements");
643 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
644 &fw_verbose, 0, "Log matches to ipfw rules");
645 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
646 &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
647 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
648 &ipfw_table_max, 0, "Max # of tables");
650 static int ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
651 static int ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
652 static int ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
653 static int ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
654 static int ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
655 static int ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
658 * Timeouts for various events in handling states.
660 * NOTE:
661 * 1 == 0~1 second.
662 * 2 == 1~2 second(s).
664 * We use 2 seconds for FIN lifetime, so that the states will not be
665 * ripped prematurely.
667 static uint32_t dyn_ack_lifetime = 300;
668 static uint32_t dyn_syn_lifetime = 20;
669 static uint32_t dyn_finwait_lifetime = 20;
670 static uint32_t dyn_fin_lifetime = 2;
671 static uint32_t dyn_rst_lifetime = 2;
672 static uint32_t dyn_udp_lifetime = 10;
673 static uint32_t dyn_short_lifetime = 5; /* used by tracks too */
676 * Keepalives are sent if dyn_keepalive is set. They are sent every
677 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
678 * seconds of lifetime of a rule.
680 static uint32_t dyn_keepalive_interval = 20;
681 static uint32_t dyn_keepalive_period = 5;
682 static uint32_t dyn_keepalive = 1; /* do send keepalives */
684 static struct ipfw_global ipfw_gd;
685 static int ipfw_state_loosecnt_updthr;
686 static int ipfw_state_max = 4096; /* max # of states */
687 static int ipfw_track_max = 4096; /* max # of tracks */
689 static int ipfw_state_headroom; /* setup at module load time */
690 static int ipfw_state_reap_min = 8;
691 static int ipfw_state_expire_max = 32;
692 static int ipfw_state_scan_max = 256;
693 static int ipfw_keepalive_max = 8;
694 static int ipfw_track_reap_max = 4;
695 static int ipfw_track_expire_max = 16;
696 static int ipfw_track_scan_max = 128;
698 static eventhandler_tag ipfw_ifaddr_event;
700 /* Compat */
701 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
702 CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
703 "Number of states and tracks");
704 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
705 CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
706 "Max number of states and tracks");
708 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
709 CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
710 "Number of states");
711 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
712 CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
713 "Max number of states");
714 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
715 &ipfw_state_headroom, 0, "headroom for state reap");
716 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
717 &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
718 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
719 &ipfw_track_max, 0, "Max number of tracks");
720 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
721 &static_count, 0, "Number of static rules");
722 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
723 &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
724 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
725 &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
726 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
727 &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
728 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
729 &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
730 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
731 &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
732 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
733 &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
734 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
735 &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
736 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
737 &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
738 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
739 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
740 "I", "# of states to scan for each expire iteration");
741 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
742 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
743 "I", "# of states to expire for each expire iteration");
744 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
745 CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
746 "I", "# of states to expire for each expire iteration");
747 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
748 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
749 "I", "# of states to reap for state shortage");
750 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
751 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
752 "I", "# of tracks to scan for each expire iteration");
753 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
754 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
755 "I", "# of tracks to expire for each expire iteration");
756 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
757 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
758 "I", "# of tracks to reap for track shortage");
760 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
761 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
762 __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
763 "LU", "# of state reaps due to states shortage");
764 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
765 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
766 __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
767 "LU", "# of state reap failure");
768 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
769 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
770 __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
771 "LU", "# of state overflow");
772 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
773 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
774 __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
775 "LU", "# of state allocation failure");
776 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
777 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
778 __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
779 "LU", "# of state deleted due to fast TCP port recycling");
781 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
782 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
783 __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
784 "LU", "# of track allocation failure");
785 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
786 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
787 __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
788 "LU", "# of track reap due to tracks shortage");
789 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
790 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
791 __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
792 "LU", "# of track reap failure");
793 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
794 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
795 __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
796 "LU", "# of track overflow");
797 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
798 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
799 __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
800 "LU", "# of track counter allocation failure");
801 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
802 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
803 __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
804 "LU", "# of IP fragements defraged");
805 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
806 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
807 __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
808 "LU", "# of IP packets after defrag");
809 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
810 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
811 __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
812 "LU", "# of IP packets after defrag dispatched to remote cpus");
814 static int ipfw_state_cmp(struct ipfw_state *,
815 struct ipfw_state *);
816 static int ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
817 struct ipfw_trkcnt *);
818 static int ipfw_track_cmp(struct ipfw_track *,
819 struct ipfw_track *);
821 RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
822 RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
824 RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
825 RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
827 RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
828 RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
830 static ip_fw_chk_t ipfw_chk;
831 static void ipfw_track_expire_ipifunc(void *);
832 static void ipfw_state_expire_ipifunc(void *);
833 static void ipfw_keepalive(void *);
834 static int ipfw_state_expire_start(struct ipfw_context *,
835 int, int);
836 static void ipfw_crossref_timeo(void *);
838 #define IPFW_TRKCNT_TOKGET lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
839 #define IPFW_TRKCNT_TOKREL lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
840 #define IPFW_TRKCNT_TOKINIT \
841 lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
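/*
 * Local helper: copy 'src' into 'dst' while applying 'netmask'
 * byte-by-byte.  The first two bytes (sa_len and sa_family) are
 * copied verbatim, and bytes beyond the mask length are zeroed.
 */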
843 static void
844 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
845 const struct sockaddr *netmask)
847 const u_char *cp1 = (const u_char *)src;
848 u_char *cp2 = (u_char *)dst;
849 const u_char *cp3 = (const u_char *)netmask;
850 u_char *cplim = cp2 + *cp3;
851 u_char *cplim2 = cp2 + *cp1;
853 *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
854 cp3 += 2;
855 if (cplim > cplim2)
856 cplim = cplim2;
857 while (cp2 < cplim)
858 *cp2++ = *cp1++ & *cp3++;
859 if (cp2 < cplim2)
860 bzero(cp2, cplim2 - cp2);
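/*
 * Build the canonical key for a flow: the larger address goes into
 * addr1 and the larger port into port1, with IPFW_KEY_SWAP_ADDRS and
 * IPFW_KEY_SWAP_PORTS recording any reordering.  The extra checks at
 * the end keep the swap flags consistent between the two directions
 * when the addresses or the ports happen to be equal.
 */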
863 static __inline void
864 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
865 in_addr_t daddr, uint16_t dport, uint8_t proto)
868 key->proto = proto;
869 key->swap = 0;
871 if (saddr < daddr) {
872 key->addr_u.addrs.addr1 = daddr;
873 key->addr_u.addrs.addr2 = saddr;
874 key->swap |= IPFW_KEY_SWAP_ADDRS;
875 } else {
876 key->addr_u.addrs.addr1 = saddr;
877 key->addr_u.addrs.addr2 = daddr;
880 if (sport < dport) {
881 key->port_u.ports.port1 = dport;
882 key->port_u.ports.port2 = sport;
883 key->swap |= IPFW_KEY_SWAP_PORTS;
884 } else {
885 key->port_u.ports.port1 = sport;
886 key->port_u.ports.port2 = dport;
889 if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
890 key->swap |= IPFW_KEY_SWAP_PORTS;
891 if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
892 key->swap |= IPFW_KEY_SWAP_ADDRS;
895 static __inline void
896 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
897 in_addr_t *daddr, uint16_t *dport)
900 if (key->swap & IPFW_KEY_SWAP_ADDRS) {
901 *saddr = key->addr_u.addrs.addr2;
902 *daddr = key->addr_u.addrs.addr1;
903 } else {
904 *saddr = key->addr_u.addrs.addr1;
905 *daddr = key->addr_u.addrs.addr2;
908 if (key->swap & IPFW_KEY_SWAP_PORTS) {
909 *sport = key->port_u.ports.port2;
910 *dport = key->port_u.ports.port1;
911 } else {
912 *sport = key->port_u.ports.port1;
913 *dport = key->port_u.ports.port2;
917 static int
918 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
921 if (s1->st_proto > s2->st_proto)
922 return (1);
923 if (s1->st_proto < s2->st_proto)
924 return (-1);
926 if (s1->st_addrs > s2->st_addrs)
927 return (1);
928 if (s1->st_addrs < s2->st_addrs)
929 return (-1);
931 if (s1->st_ports > s2->st_ports)
932 return (1);
933 if (s1->st_ports < s2->st_ports)
934 return (-1);
936 if (s1->st_swap == s2->st_swap ||
937 (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
938 return (0);
940 if (s1->st_swap > s2->st_swap)
941 return (1);
942 else
943 return (-1);
946 static int
947 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
950 if (t1->tc_proto > t2->tc_proto)
951 return (1);
952 if (t1->tc_proto < t2->tc_proto)
953 return (-1);
955 if (t1->tc_addrs > t2->tc_addrs)
956 return (1);
957 if (t1->tc_addrs < t2->tc_addrs)
958 return (-1);
960 if (t1->tc_ports > t2->tc_ports)
961 return (1);
962 if (t1->tc_ports < t2->tc_ports)
963 return (-1);
965 if (t1->tc_ruleid > t2->tc_ruleid)
966 return (1);
967 if (t1->tc_ruleid < t2->tc_ruleid)
968 return (-1);
970 return (0);
973 static int
974 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
977 if (t1->t_proto > t2->t_proto)
978 return (1);
979 if (t1->t_proto < t2->t_proto)
980 return (-1);
982 if (t1->t_addrs > t2->t_addrs)
983 return (1);
984 if (t1->t_addrs < t2->t_addrs)
985 return (-1);
987 if (t1->t_ports > t2->t_ports)
988 return (1);
989 if (t1->t_ports < t2->t_ports)
990 return (-1);
992 if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
993 return (1);
994 if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
995 return (-1);
997 return (0);
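/*
 * Update the global state limit and recompute the per-cpu threshold
 * at which a cpu's "loose" state count is folded into the global
 * counter (roughly 5% of the limit, split across the netisr cpus).
 */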
1000 static void
1001 ipfw_state_max_set(int state_max)
1004 ipfw_state_max = state_max;
1005 /* Allow 5% states over-allocation. */
1006 ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
1009 static __inline int
1010 ipfw_state_cntcoll(void)
1012 int cpu, state_cnt = 0;
1014 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1015 state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1016 return (state_cnt);
1019 static __inline int
1020 ipfw_state_cntsync(void)
1022 int state_cnt;
1024 state_cnt = ipfw_state_cntcoll();
1025 ipfw_gd.ipfw_state_loosecnt = state_cnt;
1026 return (state_cnt);
1029 static __inline int
1030 ipfw_free_rule(struct ip_fw *rule)
1032 KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1033 KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1034 rule->refcnt--;
1035 if (rule->refcnt == 0) {
1036 if (rule->cross_rules != NULL)
1037 kfree(rule->cross_rules, M_IPFW);
1038 kfree(rule, M_IPFW);
1039 return 1;
1041 return 0;
1044 static void
1045 ipfw_unref_rule(void *priv)
1047 ipfw_free_rule(priv);
1048 #ifdef KLD_MODULE
1049 KASSERT(ipfw_gd.ipfw_refcnt > 0,
1050 ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
1051 atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
1052 #endif
1055 static __inline void
1056 ipfw_ref_rule(struct ip_fw *rule)
1058 KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
1059 #ifdef KLD_MODULE
1060 atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
1061 #endif
1062 rule->refcnt++;
1066 * This macro maps an ip pointer into a layer3 header pointer of type T
1068 #define L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1070 static __inline int
1071 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1073 int type = L3HDR(struct icmp,ip)->icmp_type;
1075 return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1 << type)));
1078 #define TT ((1 << ICMP_ECHO) | \
1079 (1 << ICMP_ROUTERSOLICIT) | \
1080 (1 << ICMP_TSTAMP) | \
1081 (1 << ICMP_IREQ) | \
1082 (1 << ICMP_MASKREQ))
1084 static int
1085 is_icmp_query(struct ip *ip)
1087 int type = L3HDR(struct icmp, ip)->icmp_type;
1089 return (type <= ICMP_MAXTYPE && (TT & (1 << type)));
1092 #undef TT
1095 * The following checks use two arrays of 8 or 16 bits to store the
1096 * bits that we want set or clear, respectively. They are in the
1097 * low and high half of cmd->arg1 or cmd->d[0].
1099 * We scan options and store the bits we find set. We succeed if
1101 * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1103 * The code is sometimes optimized not to store additional variables.
1105 static int
1106 flags_match(ipfw_insn *cmd, uint8_t bits)
1108 u_char want_clear;
1109 bits = ~bits;
1111 if (((cmd->arg1 & 0xff) & bits) != 0)
1112 return 0; /* some bits we want set were clear */
1114 want_clear = (cmd->arg1 >> 8) & 0xff;
1115 if ((want_clear & bits) != want_clear)
1116 return 0; /* some bits we want clear were set */
1117 return 1;
1120 static int
1121 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1123 int optlen, bits = 0;
1124 u_char *cp = (u_char *)(ip + 1);
1125 int x = (ip->ip_hl << 2) - sizeof(struct ip);
1127 for (; x > 0; x -= optlen, cp += optlen) {
1128 int opt = cp[IPOPT_OPTVAL];
1130 if (opt == IPOPT_EOL)
1131 break;
1133 if (opt == IPOPT_NOP) {
1134 optlen = 1;
1135 } else {
1136 optlen = cp[IPOPT_OLEN];
1137 if (optlen <= 0 || optlen > x)
1138 return 0; /* invalid or truncated */
1141 switch (opt) {
1142 case IPOPT_LSRR:
1143 bits |= IP_FW_IPOPT_LSRR;
1144 break;
1146 case IPOPT_SSRR:
1147 bits |= IP_FW_IPOPT_SSRR;
1148 break;
1150 case IPOPT_RR:
1151 bits |= IP_FW_IPOPT_RR;
1152 break;
1154 case IPOPT_TS:
1155 bits |= IP_FW_IPOPT_TS;
1156 break;
1158 default:
1159 break;
1162 return (flags_match(cmd, bits));
1165 static int
1166 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1168 int optlen, bits = 0;
1169 struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1170 u_char *cp = (u_char *)(tcp + 1);
1171 int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1173 for (; x > 0; x -= optlen, cp += optlen) {
1174 int opt = cp[0];
1176 if (opt == TCPOPT_EOL)
1177 break;
1179 if (opt == TCPOPT_NOP) {
1180 optlen = 1;
1181 } else {
1182 optlen = cp[1];
1183 if (optlen <= 0)
1184 break;
1187 switch (opt) {
1188 case TCPOPT_MAXSEG:
1189 bits |= IP_FW_TCPOPT_MSS;
1190 break;
1192 case TCPOPT_WINDOW:
1193 bits |= IP_FW_TCPOPT_WINDOW;
1194 break;
1196 case TCPOPT_SACK_PERMITTED:
1197 case TCPOPT_SACK:
1198 bits |= IP_FW_TCPOPT_SACK;
1199 break;
1201 case TCPOPT_TIMESTAMP:
1202 bits |= IP_FW_TCPOPT_TS;
1203 break;
1205 case TCPOPT_CC:
1206 case TCPOPT_CCNEW:
1207 case TCPOPT_CCECHO:
1208 bits |= IP_FW_TCPOPT_CC;
1209 break;
1211 default:
1212 break;
1215 return (flags_match(cmd, bits));
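/*
 * Interface match: if the instruction carries a name, compare it
 * against if_xname (glob match when cmd->p.glob is set); otherwise
 * compare cmd->p.ip against every AF_INET address configured on the
 * interface.
 */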
1218 static int
1219 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1221 if (ifp == NULL) /* no iface with this packet, match fails */
1222 return 0;
1224 /* Check by name or by IP address */
1225 if (cmd->name[0] != '\0') { /* match by name */
1226 /* Check name */
1227 if (cmd->p.glob) {
1228 if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1229 return(1);
1230 } else {
1231 if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1232 return(1);
1234 } else {
1235 struct ifaddr_container *ifac;
1237 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1238 struct ifaddr *ia = ifac->ifa;
1240 if (ia->ifa_addr == NULL)
1241 continue;
1242 if (ia->ifa_addr->sa_family != AF_INET)
1243 continue;
1244 if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1245 (ia->ifa_addr))->sin_addr.s_addr)
1246 return(1); /* match */
1249 return(0); /* no match, fail ... */
1252 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
1255 * We enter here when we have a rule with O_LOG.
1256 * XXX this function alone takes about 2Kbytes of code!
1258 static void
1259 ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
1260 struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
1262 char *action;
1263 int limit_reached = 0;
1264 char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];
1266 fragment[0] = '\0';
1267 proto[0] = '\0';
1269 if (f == NULL) { /* bogus pkt */
1270 if (verbose_limit != 0 &&
1271 ctx->ipfw_norule_counter >= verbose_limit)
1272 return;
1273 ctx->ipfw_norule_counter++;
1274 if (ctx->ipfw_norule_counter == verbose_limit)
1275 limit_reached = verbose_limit;
1276 action = "Refuse";
1277 } else { /* O_LOG is the first action, find the real one */
1278 ipfw_insn *cmd = ACTION_PTR(f);
1279 ipfw_insn_log *l = (ipfw_insn_log *)cmd;
1281 if (l->max_log != 0 && l->log_left == 0)
1282 return;
1283 l->log_left--;
1284 if (l->log_left == 0)
1285 limit_reached = l->max_log;
1286 cmd += F_LEN(cmd); /* point to first action */
1287 if (cmd->opcode == O_PROB)
1288 cmd += F_LEN(cmd);
1290 action = action2;
1291 switch (cmd->opcode) {
1292 case O_DENY:
1293 action = "Deny";
1294 break;
1296 case O_REJECT:
1297 if (cmd->arg1==ICMP_REJECT_RST) {
1298 action = "Reset";
1299 } else if (cmd->arg1==ICMP_UNREACH_HOST) {
1300 action = "Reject";
1301 } else {
1302 ksnprintf(SNPARGS(action2, 0), "Unreach %d",
1303 cmd->arg1);
1305 break;
1307 case O_ACCEPT:
1308 action = "Accept";
1309 break;
1311 case O_COUNT:
1312 action = "Count";
1313 break;
1315 case O_DIVERT:
1316 ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
1317 break;
1319 case O_TEE:
1320 ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
1321 break;
1323 case O_SKIPTO:
1324 ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
1325 break;
1327 case O_PIPE:
1328 ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
1329 break;
1331 case O_QUEUE:
1332 ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
1333 break;
1335 case O_FORWARD_IP:
1337 ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
1338 int len;
1340 len = ksnprintf(SNPARGS(action2, 0),
1341 "Forward to %s",
1342 kinet_ntoa(sa->sa.sin_addr, abuf));
1343 if (sa->sa.sin_port) {
1344 ksnprintf(SNPARGS(action2, len), ":%d",
1345 sa->sa.sin_port);
1348 break;
1350 default:
1351 action = "UNKNOWN";
1352 break;
1356 if (hlen == 0) { /* non-ip */
1357 ksnprintf(SNPARGS(proto, 0), "MAC");
1358 } else {
1359 struct ip *ip = mtod(m, struct ip *);
1360 /* these three are all aliases to the same thing */
1361 struct icmp *const icmp = L3HDR(struct icmp, ip);
1362 struct tcphdr *const tcp = (struct tcphdr *)icmp;
1363 struct udphdr *const udp = (struct udphdr *)icmp;
1365 int ip_off, offset, ip_len;
1366 int len;
1368 if (eh != NULL) { /* layer 2 packets are as on the wire */
1369 ip_off = ntohs(ip->ip_off);
1370 ip_len = ntohs(ip->ip_len);
1371 } else {
1372 ip_off = ip->ip_off;
1373 ip_len = ip->ip_len;
1375 offset = ip_off & IP_OFFMASK;
1376 switch (ip->ip_p) {
1377 case IPPROTO_TCP:
1378 len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
1379 kinet_ntoa(ip->ip_src, abuf));
1380 if (offset == 0) {
1381 ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1382 ntohs(tcp->th_sport),
1383 kinet_ntoa(ip->ip_dst, abuf),
1384 ntohs(tcp->th_dport));
1385 } else {
1386 ksnprintf(SNPARGS(proto, len), " %s",
1387 kinet_ntoa(ip->ip_dst, abuf));
1389 break;
1391 case IPPROTO_UDP:
1392 len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
1393 kinet_ntoa(ip->ip_src, abuf));
1394 if (offset == 0) {
1395 ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1396 ntohs(udp->uh_sport),
1397 kinet_ntoa(ip->ip_dst, abuf),
1398 ntohs(udp->uh_dport));
1399 } else {
1400 ksnprintf(SNPARGS(proto, len), " %s",
1401 kinet_ntoa(ip->ip_dst, abuf));
1403 break;
1405 case IPPROTO_ICMP:
1406 if (offset == 0) {
1407 len = ksnprintf(SNPARGS(proto, 0),
1408 "ICMP:%u.%u ",
1409 icmp->icmp_type,
1410 icmp->icmp_code);
1411 } else {
1412 len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
1414 len += ksnprintf(SNPARGS(proto, len), "%s",
1415 kinet_ntoa(ip->ip_src, abuf));
1416 ksnprintf(SNPARGS(proto, len), " %s",
1417 kinet_ntoa(ip->ip_dst, abuf));
1418 break;
1420 default:
1421 len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
1422 kinet_ntoa(ip->ip_src, abuf));
1423 ksnprintf(SNPARGS(proto, len), " %s",
1424 kinet_ntoa(ip->ip_dst, abuf));
1425 break;
1428 if (ip_off & (IP_MF | IP_OFFMASK)) {
1429 ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
1430 ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
1431 offset << 3, (ip_off & IP_MF) ? "+" : "");
1435 if (oif || m->m_pkthdr.rcvif) {
1436 log(LOG_SECURITY | LOG_INFO,
1437 "ipfw: %d %s %s %s via %s%s\n",
1438 f ? f->rulenum : -1,
1439 action, proto, oif ? "out" : "in",
1440 oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
1441 fragment);
1442 } else {
1443 log(LOG_SECURITY | LOG_INFO,
1444 "ipfw: %d %s %s [no if info]%s\n",
1445 f ? f->rulenum : -1,
1446 action, proto, fragment);
1449 if (limit_reached) {
1450 log(LOG_SECURITY | LOG_NOTICE,
1451 "ipfw: limit %d reached on entry %d\n",
1452 limit_reached, f ? f->rulenum : -1);
1456 #undef SNPARGS
1458 #define TIME_LEQ(a, b) ((a) - (b) <= 0)
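/*
 * Unlink and free a state: drop it from its track's state list (and
 * decrement the shared track counter), remove it from the per-cpu
 * RB tree and list, then adjust the per-cpu counters.
 */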
1460 static void
1461 ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1464 KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT,
1465 ("invalid state type %u", s->st_type));
1466 KASSERT(ctx->ipfw_state_cnt > 0,
1467 ("invalid state count %d", ctx->ipfw_state_cnt));
1469 if (s->st_track != NULL) {
1470 struct ipfw_track *t = s->st_track;
1472 KASSERT(!LIST_EMPTY(&t->t_state_list),
1473 ("track state list is empty"));
1474 LIST_REMOVE(s, st_trklink);
1476 KASSERT(*t->t_count > 0,
1477 ("invalid track count %d", *t->t_count));
1478 atomic_subtract_int(t->t_count, 1);
1481 TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1482 RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1483 kfree(s, M_IPFW);
1485 ctx->ipfw_state_cnt--;
1486 if (ctx->ipfw_state_loosecnt > 0)
1487 ctx->ipfw_state_loosecnt--;
1490 static int
1491 ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
1493 struct ipfw_state *s, *anchor;
1494 int expired;
1496 if (reap_max < ipfw_state_reap_min)
1497 reap_max = ipfw_state_reap_min;
1499 if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1501 * Kick start state expiring. Ignore scan limit,
1502 * we are short of states.
1504 ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1505 expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1506 ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1507 return (expired);
1511 * States are being expired.
1514 if (ctx->ipfw_state_cnt == 0)
1515 return (0);
1517 expired = 0;
1518 anchor = &ctx->ipfw_stateexp_anch;
1519 while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1521 * Ignore scan limit; we are short of states.
1524 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1525 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1527 if (s->st_type == O_ANCHOR)
1528 continue;
1530 if (IPFW_STATE_TCPCLOSED(s) ||
1531 TIME_LEQ(s->st_expire, time_uptime)) {
1532 ipfw_state_del(ctx, s);
1533 if (++expired >= reap_max)
1534 break;
1535 if ((expired & 0xff) == 0 &&
1536 ipfw_state_cntcoll() + ipfw_state_headroom <=
1537 ipfw_state_max)
1538 break;
1542 * NOTE:
1543 * Leave the anchor on the list, even if the end of the list has
1544 * been reached. ipfw_state_expire_more_dispatch() will handle
1545 * the removal.
1547 return (expired);
1550 static void
1551 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1553 struct ipfw_state *s, *sn;
1555 TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1556 if (s->st_type == O_ANCHOR)
1557 continue;
1558 if (rule != NULL && s->st_rule != rule)
1559 continue;
1560 ipfw_state_del(ctx, s);
1564 static void
1565 ipfw_state_expire_done(struct ipfw_context *ctx)
1568 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1569 ("stateexp is not in progress"));
1570 ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1571 callout_reset(&ctx->ipfw_stateto_ch, hz,
1572 ipfw_state_expire_ipifunc, NULL);
1575 static void
1576 ipfw_state_expire_more(struct ipfw_context *ctx)
1578 struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1580 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1581 ("stateexp is not in progress"));
1582 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1583 ("stateexp more did not finish"));
1584 netisr_sendmsg_oncpu(nm);
1587 static int
1588 ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1589 int scan_max, int expire_max)
1591 struct ipfw_state *s;
1592 int scanned = 0, expired = 0;
1594 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1595 ("stateexp is not in progress"));
1597 while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1598 if (scanned++ >= scan_max) {
1599 ipfw_state_expire_more(ctx);
1600 return (expired);
1603 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1604 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1606 if (s->st_type == O_ANCHOR)
1607 continue;
1609 if (TIME_LEQ(s->st_expire, time_uptime) ||
1610 ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1611 IPFW_STATE_TCPCLOSED(s))) {
1612 ipfw_state_del(ctx, s);
1613 if (++expired >= expire_max) {
1614 ipfw_state_expire_more(ctx);
1615 return (expired);
1617 if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1618 (expired & 0xff) == 0 &&
1619 ipfw_state_cntcoll() + ipfw_state_headroom <=
1620 ipfw_state_max) {
1621 ipfw_state_expire_more(ctx);
1622 return (expired);
1626 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1627 ipfw_state_expire_done(ctx);
1628 return (expired);
1631 static void
1632 ipfw_state_expire_more_dispatch(netmsg_t nm)
1634 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1635 struct ipfw_state *anchor;
1637 ASSERT_NETISR_NCPUS(mycpuid);
1638 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1639 ("statexp is not in progress"));
1641 /* Reply ASAP */
1642 netisr_replymsg(&nm->base, 0);
1644 anchor = &ctx->ipfw_stateexp_anch;
1645 if (ctx->ipfw_state_cnt == 0) {
1646 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1647 ipfw_state_expire_done(ctx);
1648 return;
1650 ipfw_state_expire_loop(ctx, anchor,
1651 ipfw_state_scan_max, ipfw_state_expire_max);
1654 static int
1655 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1657 struct ipfw_state *anchor;
1659 KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1660 ("stateexp is in progress"));
1661 ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1663 if (ctx->ipfw_state_cnt == 0) {
1664 ipfw_state_expire_done(ctx);
1665 return (0);
1669 * Do not expire more than once per second, it is useless.
1671 if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1672 ctx->ipfw_state_lastexp == time_uptime) {
1673 ipfw_state_expire_done(ctx);
1674 return (0);
1676 ctx->ipfw_state_lastexp = time_uptime;
1678 anchor = &ctx->ipfw_stateexp_anch;
1679 TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1680 return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1683 static void
1684 ipfw_state_expire_dispatch(netmsg_t nm)
1686 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1688 ASSERT_NETISR_NCPUS(mycpuid);
1690 /* Reply ASAP */
1691 crit_enter();
1692 netisr_replymsg(&nm->base, 0);
1693 crit_exit();
1695 if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
1696 /* Running; done. */
1697 return;
1699 ipfw_state_expire_start(ctx,
1700 ipfw_state_scan_max, ipfw_state_expire_max);
1703 static void
1704 ipfw_state_expire_ipifunc(void *dummy __unused)
1706 struct netmsg_base *msg;
1708 KKASSERT(mycpuid < netisr_ncpus);
1709 msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
1711 crit_enter();
1712 if (msg->lmsg.ms_flags & MSGF_DONE)
1713 netisr_sendmsg_oncpu(msg);
1714 crit_exit();
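/*
 * Track TCP sequence/ack numbers per direction.  The first packet in
 * a direction seeds the st_seq and st_ack fields; later packets may
 * only move them forward, otherwise FALSE is returned and the caller
 * skips the state update.  An RST always returns TRUE.
 */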
1717 static boolean_t
1718 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
1720 uint32_t seq = ntohl(tcp->th_seq);
1721 uint32_t ack = ntohl(tcp->th_ack);
1723 if (tcp->th_flags & TH_RST)
1724 return (TRUE);
1726 if (dir == MATCH_FORWARD) {
1727 if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
1728 s->st_flags |= IPFW_STATE_F_SEQFWD;
1729 s->st_seq_fwd = seq;
1730 } else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
1731 s->st_seq_fwd = seq;
1732 } else {
1733 /* Out-of-sequence; done. */
1734 return (FALSE);
1736 if (tcp->th_flags & TH_ACK) {
1737 if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
1738 s->st_flags |= IPFW_STATE_F_ACKFWD;
1739 s->st_ack_fwd = ack;
1740 } else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
1741 s->st_ack_fwd = ack;
1742 } else {
1743 /* Out-of-sequence; done. */
1744 return (FALSE);
1747 if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
1748 (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
1749 s->st_state |= (TH_ACK << 8);
1751 } else {
1752 if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
1753 s->st_flags |= IPFW_STATE_F_SEQREV;
1754 s->st_seq_rev = seq;
1755 } else if (SEQ_GEQ(seq, s->st_seq_rev)) {
1756 s->st_seq_rev = seq;
1757 } else {
1758 /* Out-of-sequence; done. */
1759 return (FALSE);
1761 if (tcp->th_flags & TH_ACK) {
1762 if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
1763 s->st_flags |= IPFW_STATE_F_ACKREV;
1764 s->st_ack_rev = ack;
1765 } else if (SEQ_GEQ(ack, s->st_ack_rev)) {
1766 s->st_ack_rev = ack;
1767 } else {
1768 /* Out-of-sequence; done. */
1769 return (FALSE);
1772 if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
1773 s->st_ack_rev == s->st_seq_fwd + 1)
1774 s->st_state |= TH_ACK;
1777 return (TRUE);
1780 static void
1781 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
1782 const struct tcphdr *tcp, struct ipfw_state *s)
1785 if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
1786 u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
1788 if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
1789 return;
1791 s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
1792 switch (s->st_state & IPFW_STATE_TCPSTATES) {
1793 case TH_SYN: /* opening */
1794 s->st_expire = time_uptime + dyn_syn_lifetime;
1795 break;
1797 case BOTH_SYN: /* move to established */
1798 case BOTH_SYN | TH_FIN: /* one side tries to close */
1799 case BOTH_SYN | (TH_FIN << 8):
1800 s->st_expire = time_uptime + dyn_ack_lifetime;
1801 break;
1803 case BOTH_SYN | BOTH_FIN: /* both sides closed */
1804 if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
1805 /* And both FINs were ACKed. */
1806 s->st_expire = time_uptime + dyn_fin_lifetime;
1807 } else {
1808 s->st_expire = time_uptime +
1809 dyn_finwait_lifetime;
1811 break;
1813 default:
1814 #if 0
1816 * reset or some invalid combination, but can also
1817 * occur if we use keep-state the wrong way.
1819 if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
1820 kprintf("invalid state: 0x%x\n", s->st_state);
1821 #endif
1822 s->st_expire = time_uptime + dyn_rst_lifetime;
1823 break;
1825 } else if (pkt->proto == IPPROTO_UDP) {
1826 s->st_expire = time_uptime + dyn_udp_lifetime;
1827 } else {
1828 /* other protocols */
1829 s->st_expire = time_uptime + dyn_short_lifetime;
1834 * Lookup a state.
1836 static struct ipfw_state *
1837 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
1838 int *match_direction, const struct tcphdr *tcp)
1840 struct ipfw_state *key, *s;
1841 int dir = MATCH_NONE;
1843 key = &ctx->ipfw_state_tmpkey;
1844 ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
1845 pkt->dst_ip, pkt->dst_port, pkt->proto);
1846 s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
1847 if (s == NULL)
1848 goto done; /* not found. */
1849 if (TIME_LEQ(s->st_expire, time_uptime)) {
1850 /* Expired. */
1851 ipfw_state_del(ctx, s);
1852 s = NULL;
1853 goto done;
1855 if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
1856 /* TCP ports recycling is too fast. */
1857 ctx->ipfw_sts_tcprecycled++;
1858 ipfw_state_del(ctx, s);
1859 s = NULL;
1860 goto done;
1863 if (s->st_swap == key->st_swap) {
1864 dir = MATCH_FORWARD;
1865 } else {
1866 KASSERT((s->st_swap & key->st_swap) == 0,
1867 ("found mismatch state"));
1868 dir = MATCH_REVERSE;
1871 /* Update this state. */
1872 ipfw_state_update(pkt, dir, tcp, s);
1874 if (s->st_track != NULL) {
1875 /* This track has been used. */
1876 s->st_track->t_expire = time_uptime + dyn_short_lifetime;
1878 done:
1879 if (match_direction)
1880 *match_direction = dir;
1881 return (s);
1884 static __inline struct ip_fw *
1885 ipfw_state_lookup_rule(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
1886 int *match_direction, const struct tcphdr *tcp, uint16_t len)
1888 struct ipfw_state *s;
1890 s = ipfw_state_lookup(ctx, pkt, match_direction, tcp);
1891 if (s == NULL)
1892 return (NULL);
1894 KASSERT(s->st_rule->cpuid == mycpuid,
1895 ("rule %p (cpu%d) does not belong to the current cpu%d",
1896 s->st_rule, s->st_rule->cpuid, mycpuid));
1898 s->st_pcnt++;
1899 s->st_bcnt += len;
1901 return (s->st_rule);
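/*
 * Allocate and install a new state for this flow.  The allocation is
 * M_NULLOK, so failure only bumps the nomem statistic.  The state is
 * inserted into the per-cpu RB tree and list, its expire time is set
 * via ipfw_state_update(), and, for 'limit' states, it is linked onto
 * the owning track.
 */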
1904 static struct ipfw_state *
1905 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
1906 uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
1907 const struct tcphdr *tcp)
1909 struct ipfw_state *s, *dup;
1911 KASSERT(type == O_KEEP_STATE || type == O_LIMIT,
1912 ("invalid state type %u", type));
1914 s = kmalloc(sizeof(*s), M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
1915 if (s == NULL) {
1916 ctx->ipfw_sts_nomem++;
1917 return (NULL);
1920 ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
1921 id->dst_ip, id->dst_port, id->proto);
1923 s->st_rule = rule;
1924 s->st_type = type;
1926 ctx->ipfw_state_cnt++;
1927 ctx->ipfw_state_loosecnt++;
1928 if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
1929 ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
1930 ctx->ipfw_state_loosecnt = 0;
1933 dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1934 if (dup != NULL)
1935 panic("ipfw: state exists");
1936 TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1939 * Update this state:
1940 * Set st_expire and st_state.
1942 ipfw_state_update(id, MATCH_FORWARD, tcp, s);
1944 if (t != NULL) {
1945 /* Keep the track referenced. */
1946 LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
1947 s->st_track = t;
1949 return (s);
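/*
 * Free a per-cpu track and drop its reference on the shared track
 * counter.  The last reference removes the counter from the global
 * RB tree; the counter itself is cached as the per-cpu spare if that
 * slot is empty, otherwise it is freed.
 */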
1952 static boolean_t
1953 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
1955 struct ipfw_trkcnt *trk;
1956 boolean_t trk_freed = FALSE;
1958 KASSERT(t->t_count != NULL, ("track anchor"));
1959 KASSERT(LIST_EMPTY(&t->t_state_list),
1960 ("invalid track is still referenced"));
1962 trk = t->t_trkcnt;
1963 KASSERT(trk != NULL, ("track has no trkcnt"));
1965 RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
1966 TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
1967 kfree(t, M_IPFW);
1970 * fdrop() style reference counting.
1971 * See kern/kern_descrip.c fdrop().
1973 for (;;) {
1974 int refs = trk->tc_refs;
1976 cpu_ccfence();
1977 KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
1978 if (refs == 1) {
1979 IPFW_TRKCNT_TOKGET;
1980 if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
1981 KASSERT(trk->tc_count == 0,
1982 ("%d states reference this trkcnt",
1983 trk->tc_count));
1984 RB_REMOVE(ipfw_trkcnt_tree,
1985 &ipfw_gd.ipfw_trkcnt_tree, trk);
1987 KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
1988 ("invalid trkcnt cnt %d",
1989 ipfw_gd.ipfw_trkcnt_cnt));
1990 ipfw_gd.ipfw_trkcnt_cnt--;
1991 IPFW_TRKCNT_TOKREL;
1993 if (ctx->ipfw_trkcnt_spare == NULL)
1994 ctx->ipfw_trkcnt_spare = trk;
1995 else
1996 kfree(trk, M_IPFW);
1997 trk_freed = TRUE;
1998 break; /* done! */
2000 IPFW_TRKCNT_TOKREL;
2001 /* retry */
2002 } else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2003 break; /* done! */
2005 /* retry */
2007 return (trk_freed);
2010 static void
2011 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2013 struct ipfw_track *t, *tn;
2015 TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2016 if (t->t_count == NULL) /* anchor */
2017 continue;
2018 if (rule != NULL && t->t_rule != rule)
2019 continue;
2020 ipfw_track_free(ctx, t);
2024 static boolean_t
2025 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2026 boolean_t reap)
2028 struct ipfw_state *s, *sn;
2029 boolean_t ret = FALSE;
2031 KASSERT(t->t_count != NULL, ("track anchor"));
2033 if (LIST_EMPTY(&t->t_state_list))
2034 return (FALSE);
2037 * Do not expire more than once per second, it is useless.
2039 if (t->t_lastexp == time_uptime)
2040 return (FALSE);
2041 t->t_lastexp = time_uptime;
2043 LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2044 if (TIME_LEQ(s->st_expire, time_uptime) ||
2045 (reap && IPFW_STATE_TCPCLOSED(s))) {
2046 KASSERT(s->st_track == t,
2047 ("state track %p does not match %p",
2048 s->st_track, t));
2049 ipfw_state_del(ctx, s);
2050 ret = TRUE;
2053 return (ret);
2056 static __inline struct ipfw_trkcnt *
2057 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2059 struct ipfw_trkcnt *trk;
2061 if (ctx->ipfw_trkcnt_spare != NULL) {
2062 trk = ctx->ipfw_trkcnt_spare;
2063 ctx->ipfw_trkcnt_spare = NULL;
2064 } else {
2065 trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2066 M_INTWAIT | M_NULLOK);
2068 return (trk);
2071 static void
2072 ipfw_track_expire_done(struct ipfw_context *ctx)
2075 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2076 ("trackexp is not in progress"));
2077 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2078 callout_reset(&ctx->ipfw_trackto_ch, hz,
2079 ipfw_track_expire_ipifunc, NULL);
2082 static void
2083 ipfw_track_expire_more(struct ipfw_context *ctx)
2085 struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2087 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2088 ("trackexp is not in progress"));
2089 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2090 ("trackexp more did not finish"));
2091 netisr_sendmsg_oncpu(nm);
2094 static int
2095 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2096 int scan_max, int expire_max)
2098 struct ipfw_track *t;
2099 int scanned = 0, expired = 0;
2100 boolean_t reap = FALSE;
2102 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2103 ("trackexp is not in progress"));
2105 if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2106 reap = TRUE;
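/*
 * The anchor is a dummy list entry (t_count == NULL) that records the
 * scan position.  It is moved past every track we examine, so a scan
 * interrupted by scan_max/expire_max can be resumed later by
 * ipfw_track_expire_more_dispatch().
 */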
2108 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2109 if (scanned++ >= scan_max) {
2110 ipfw_track_expire_more(ctx);
2111 return (expired);
2114 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2115 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2117 if (t->t_count == NULL) /* anchor */
2118 continue;
2120 ipfw_track_state_expire(ctx, t, reap);
2121 if (!LIST_EMPTY(&t->t_state_list)) {
2122 /* There are states referencing this track. */
2123 continue;
2126 if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2127 /* Expired. */
2128 if (ipfw_track_free(ctx, t)) {
2129 if (++expired >= expire_max) {
2130 ipfw_track_expire_more(ctx);
2131 return (expired);
2136 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2137 ipfw_track_expire_done(ctx);
2138 return (expired);
2141 static int
2142 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2144 struct ipfw_track *anchor;
2146 KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2147 ("trackexp is in progress"));
2148 ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2150 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2151 ipfw_track_expire_done(ctx);
2152 return (0);
2156 * Do not expire more than once per second, it is useless.
2158 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2159 ctx->ipfw_track_lastexp == time_uptime) {
2160 ipfw_track_expire_done(ctx);
2161 return (0);
2163 ctx->ipfw_track_lastexp = time_uptime;
2165 anchor = &ctx->ipfw_trackexp_anch;
2166 TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2167 return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2170 static void
2171 ipfw_track_expire_more_dispatch(netmsg_t nm)
2173 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2174 struct ipfw_track *anchor;
2176 ASSERT_NETISR_NCPUS(mycpuid);
2177 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2178 ("trackexp is not in progress"));
2180 /* Reply ASAP */
2181 netisr_replymsg(&nm->base, 0);
2183 anchor = &ctx->ipfw_trackexp_anch;
2184 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2185 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2186 ipfw_track_expire_done(ctx);
2187 return;
2189 ipfw_track_expire_loop(ctx, anchor,
2190 ipfw_track_scan_max, ipfw_track_expire_max);
2193 static void
2194 ipfw_track_expire_dispatch(netmsg_t nm)
2196 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2198 ASSERT_NETISR_NCPUS(mycpuid);
2200 /* Reply ASAP */
2201 crit_enter();
2202 netisr_replymsg(&nm->base, 0);
2203 crit_exit();
2205 if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2206 /* Running; done. */
2207 return;
2209 ipfw_track_expire_start(ctx,
2210 ipfw_track_scan_max, ipfw_track_expire_max);
2213 static void
2214 ipfw_track_expire_ipifunc(void *dummy __unused)
2216 struct netmsg_base *msg;
2218 KKASSERT(mycpuid < netisr_ncpus);
2219 msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
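/*
 * Runs from the expire callout or an IPI sent by another CPU; only
 * (re)send the per-CPU track expire netmsg if the previous one has
 * completed (MSGF_DONE), so at most one is ever in flight.
 */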
2221 crit_enter();
2222 if (msg->lmsg.ms_flags & MSGF_DONE)
2223 netisr_sendmsg_oncpu(msg);
2224 crit_exit();
2227 static int
2228 ipfw_track_reap(struct ipfw_context *ctx)
2230 struct ipfw_track *t, *anchor;
2231 int expired;
2233 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2235 * Kick start track expiring. Ignore scan limit,
2236 * we are short of tracks.
2238 ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2239 expired = ipfw_track_expire_start(ctx, INT_MAX,
2240 ipfw_track_reap_max);
2241 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2242 return (expired);
2246 * Tracks are being expired.
2249 if (RB_EMPTY(&ctx->ipfw_track_tree))
2250 return (0);
2252 expired = 0;
2253 anchor = &ctx->ipfw_trackexp_anch;
2254 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2256 * Ignore scan limit; we are short of tracks.
2259 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2260 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2262 if (t->t_count == NULL) /* anchor */
2263 continue;
2265 ipfw_track_state_expire(ctx, t, TRUE);
2266 if (!LIST_EMPTY(&t->t_state_list)) {
2267 /* There are states referencing this track. */
2268 continue;
2271 if (ipfw_track_free(ctx, t)) {
2272 if (++expired >= ipfw_track_reap_max) {
2273 ipfw_track_expire_more(ctx);
2274 break;
2279 * NOTE:
2280 * Leave the anchor on the list, even if the end of the list has
2281 * been reached. ipfw_track_expire_more_dispatch() will handle
2282 * the removal.
2284 return (expired);
2287 static struct ipfw_track *
2288 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2289 uint16_t limit_mask, struct ip_fw *rule)
2291 struct ipfw_track *key, *t, *dup;
2292 struct ipfw_trkcnt *trk, *ret;
2293 boolean_t do_expire = FALSE;
2295 KASSERT(rule->track_ruleid != 0,
2296 ("rule %u has no track ruleid", rule->rulenum));
2298 key = &ctx->ipfw_track_tmpkey;
2299 key->t_proto = id->proto;
2300 key->t_addrs = 0;
2301 key->t_ports = 0;
2302 key->t_rule = rule;
2303 if (limit_mask & DYN_SRC_ADDR)
2304 key->t_saddr = id->src_ip;
2305 if (limit_mask & DYN_DST_ADDR)
2306 key->t_daddr = id->dst_ip;
2307 if (limit_mask & DYN_SRC_PORT)
2308 key->t_sport = id->src_port;
2309 if (limit_mask & DYN_DST_PORT)
2310 key->t_dport = id->dst_port;
2312 t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2313 if (t != NULL)
2314 goto done;
2316 t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2317 if (t == NULL) {
2318 ctx->ipfw_tks_nomem++;
2319 return (NULL);
2322 t->t_key = key->t_key;
2323 t->t_rule = rule;
2324 t->t_lastexp = 0;
2325 LIST_INIT(&t->t_state_list);
2327 if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2328 time_t globexp, uptime;
2330 trk = NULL;
2331 do_expire = TRUE;
2334 * Do not expire globally more than once per second,
2335 * it is useless.
2337 uptime = time_uptime;
2338 globexp = ipfw_gd.ipfw_track_globexp;
2339 if (globexp != uptime &&
2340 atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2341 globexp, uptime)) {
2342 int cpu;
2344 /* Expire tracks on other CPUs. */
2345 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2346 if (cpu == mycpuid)
2347 continue;
2348 lwkt_send_ipiq(globaldata_find(cpu),
2349 ipfw_track_expire_ipifunc, NULL);
2352 } else {
2353 trk = ipfw_trkcnt_alloc(ctx);
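/*
 * No trkcnt was allocated, either because the global track limit has
 * been reached or because the allocation failed.  Look for a trkcnt
 * already installed by another CPU for the same flow and rule and
 * share it; if none exists and the limit was hit, try to reap tracks
 * before giving up.
 */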
2355 if (trk == NULL) {
2356 struct ipfw_trkcnt *tkey;
2358 tkey = &ctx->ipfw_trkcnt_tmpkey;
2359 key = NULL; /* tkey overlaps key */
2361 tkey->tc_key = t->t_key;
2362 tkey->tc_ruleid = rule->track_ruleid;
2364 IPFW_TRKCNT_TOKGET;
2365 trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2366 tkey);
2367 if (trk == NULL) {
2368 IPFW_TRKCNT_TOKREL;
2369 if (do_expire) {
2370 ctx->ipfw_tks_reap++;
2371 if (ipfw_track_reap(ctx) > 0) {
2372 if (ipfw_gd.ipfw_trkcnt_cnt <
2373 ipfw_track_max) {
2374 trk = ipfw_trkcnt_alloc(ctx);
2375 if (trk != NULL)
2376 goto install;
2377 ctx->ipfw_tks_cntnomem++;
2378 } else {
2379 ctx->ipfw_tks_overflow++;
2381 } else {
2382 ctx->ipfw_tks_reapfailed++;
2383 ctx->ipfw_tks_overflow++;
2385 } else {
2386 ctx->ipfw_tks_cntnomem++;
2388 kfree(t, M_IPFW);
2389 return (NULL);
2391 KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2392 ("invalid trkcnt refs %d", trk->tc_refs));
2393 atomic_add_int(&trk->tc_refs, 1);
2394 IPFW_TRKCNT_TOKREL;
2395 } else {
2396 install:
2397 trk->tc_key = t->t_key;
2398 trk->tc_ruleid = rule->track_ruleid;
2399 trk->tc_refs = 0;
2400 trk->tc_count = 0;
2401 trk->tc_expire = 0;
2402 trk->tc_rulenum = rule->rulenum;
2404 IPFW_TRKCNT_TOKGET;
2405 ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2406 trk);
2407 if (ret != NULL) {
2408 KASSERT(ret->tc_refs > 0 &&
2409 ret->tc_refs < netisr_ncpus,
2410 ("invalid trkcnt refs %d", ret->tc_refs));
2411 KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2412 ("trkcnt spare was installed"));
2413 ctx->ipfw_trkcnt_spare = trk;
2414 trk = ret;
2415 } else {
2416 ipfw_gd.ipfw_trkcnt_cnt++;
2418 atomic_add_int(&trk->tc_refs, 1);
2419 IPFW_TRKCNT_TOKREL;
2421 t->t_count = &trk->tc_count;
2422 t->t_trkcnt = trk;
2424 dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2425 if (dup != NULL)
2426 panic("ipfw: track exists");
2427 TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2428 done:
2429 t->t_expire = time_uptime + dyn_short_lifetime;
2430 return (t);
2434 * Install state for rule type cmd->o.opcode
2436 * Returns 1 (failure) if the state is not installed because of errors or
2437 * because the state limits have been reached.
2439 static int
2440 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2441 ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2443 struct ipfw_state *s;
2444 struct ipfw_track *t;
2445 int count, diff;
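/*
 * The global loose state count is only updated in batches and may
 * overshoot; once it reaches the limit, re-synchronize the count via
 * ipfw_state_cntsync() before deciding whether states must be reaped.
 */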
2447 if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2448 (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2449 boolean_t overflow = TRUE;
2451 ctx->ipfw_sts_reap++;
2452 if (ipfw_state_reap(ctx, diff) == 0)
2453 ctx->ipfw_sts_reapfailed++;
2454 if (ipfw_state_cntsync() < ipfw_state_max)
2455 overflow = FALSE;
2457 if (overflow) {
2458 time_t globexp, uptime;
2459 int cpu;
2462 * Do not expire globally more than once per second,
2463 * it is useless.
2465 uptime = time_uptime;
2466 globexp = ipfw_gd.ipfw_state_globexp;
2467 if (globexp == uptime ||
2468 !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2469 globexp, uptime)) {
2470 ctx->ipfw_sts_overflow++;
2471 return (1);
2474 /* Expire states on other CPUs. */
2475 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2476 if (cpu == mycpuid)
2477 continue;
2478 lwkt_send_ipiq(globaldata_find(cpu),
2479 ipfw_state_expire_ipifunc, NULL);
2481 ctx->ipfw_sts_overflow++;
2482 return (1);
2486 switch (cmd->o.opcode) {
2487 case O_KEEP_STATE: /* bidir rule */
2488 s = ipfw_state_add(ctx, &args->f_id, O_KEEP_STATE, rule, NULL,
2489 tcp);
2490 if (s == NULL)
2491 return (1);
2492 break;
2494 case O_LIMIT: /* limit number of sessions */
2495 t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2496 if (t == NULL)
2497 return (1);
2499 if (*t->t_count >= cmd->conn_limit) {
2500 if (!ipfw_track_state_expire(ctx, t, TRUE))
2501 return (1);
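/*
 * Atomically reserve a slot under conn_limit; t_count points at the
 * trkcnt counter shared by all CPUs tracking this limit.
 */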
2503 for (;;) {
2504 count = *t->t_count;
2505 if (count >= cmd->conn_limit)
2506 return (1);
2507 if (atomic_cmpset_int(t->t_count, count, count + 1))
2508 break;
2511 s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2512 if (s == NULL) {
2513 /* Undo damage. */
2514 atomic_subtract_int(t->t_count, 1);
2515 return (1);
2517 break;
2519 default:
2520 panic("unknown state type %u\n", cmd->o.opcode);
2522 return (0);
2525 static int
2526 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2527 const struct in_addr *in)
2529 struct radix_node_head *rnh;
2530 struct sockaddr_in sin;
2531 struct ipfw_tblent *te;
2533 KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2534 rnh = ctx->ipfw_tables[tableid];
2535 if (rnh == NULL)
2536 return (0); /* no match */
2538 memset(&sin, 0, sizeof(sin));
2539 sin.sin_family = AF_INET;
2540 sin.sin_len = sizeof(sin);
2541 sin.sin_addr = *in;
2543 te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2544 if (te == NULL)
2545 return (0); /* no match */
2547 te->te_use++;
2548 te->te_lastuse = time_second;
2549 return (1); /* match */
2553 * Transmit a TCP packet, containing either a RST or a keepalive.
2554 * When flags & TH_RST, we are sending a RST packet because a
2555 * "reset" action matched the packet.
2556 * Otherwise we are sending a keepalive, and flags & TH_SYN selects the direction (forward if set, reverse if clear).
2558 * Only {src,dst}_{ip,port} of "id" are used.
2560 static void
2561 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2563 struct mbuf *m;
2564 struct ip *ip;
2565 struct tcphdr *tcp;
2566 struct route sro; /* fake route */
2568 MGETHDR(m, M_NOWAIT, MT_HEADER);
2569 if (m == NULL)
2570 return;
2571 m->m_pkthdr.rcvif = NULL;
2572 m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2573 m->m_data += max_linkhdr;
2575 ip = mtod(m, struct ip *);
2576 bzero(ip, m->m_len);
2577 tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2578 ip->ip_p = IPPROTO_TCP;
2579 tcp->th_off = 5;
2582 * Assume we are sending a RST (or a keepalive in the reverse
2583 * direction), swap src and destination addresses and ports.
2585 ip->ip_src.s_addr = htonl(id->dst_ip);
2586 ip->ip_dst.s_addr = htonl(id->src_ip);
2587 tcp->th_sport = htons(id->dst_port);
2588 tcp->th_dport = htons(id->src_port);
2589 if (flags & TH_RST) { /* we are sending a RST */
2590 if (flags & TH_ACK) {
2591 tcp->th_seq = htonl(ack);
2592 tcp->th_ack = htonl(0);
2593 tcp->th_flags = TH_RST;
2594 } else {
2595 if (flags & TH_SYN)
2596 seq++;
2597 tcp->th_seq = htonl(0);
2598 tcp->th_ack = htonl(seq);
2599 tcp->th_flags = TH_RST | TH_ACK;
2601 } else {
2603 * We are sending a keepalive. flags & TH_SYN determines
2604 * the direction, forward if set, reverse if clear.
2605 * NOTE: seq and ack are always assumed to be correct
2606 * as set by the caller. This may be confusing...
2608 if (flags & TH_SYN) {
2610 * we have to rewrite the correct addresses!
2612 ip->ip_dst.s_addr = htonl(id->dst_ip);
2613 ip->ip_src.s_addr = htonl(id->src_ip);
2614 tcp->th_dport = htons(id->dst_port);
2615 tcp->th_sport = htons(id->src_port);
2617 tcp->th_seq = htonl(seq);
2618 tcp->th_ack = htonl(ack);
2619 tcp->th_flags = TH_ACK;
2623 * set ip_len to the payload size so we can compute
2624 * the tcp checksum on the pseudoheader
2625 * XXX check this, could save a couple of words ?
2627 ip->ip_len = htons(sizeof(struct tcphdr));
2628 tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2631 * now fill fields left out earlier
2633 ip->ip_ttl = ip_defttl;
2634 ip->ip_len = m->m_pkthdr.len;
2636 bzero(&sro, sizeof(sro));
2637 ip_rtaddr(ip->ip_dst, &sro);
2639 m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2640 ip_output(m, NULL, &sro, 0, NULL, NULL);
2641 if (sro.ro_rt)
2642 RTFREE(sro.ro_rt);
2646 * Send a reject message, consuming the mbuf passed as an argument.
2648 static void
2649 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2651 if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2652 /* We need the IP header in host order for icmp_error(). */
2653 if (args->eh != NULL) {
2654 struct ip *ip = mtod(args->m, struct ip *);
2656 ip->ip_len = ntohs(ip->ip_len);
2657 ip->ip_off = ntohs(ip->ip_off);
2659 icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2660 } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2661 struct tcphdr *const tcp =
2662 L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2664 if ((tcp->th_flags & TH_RST) == 0) {
2665 send_pkt(&args->f_id, ntohl(tcp->th_seq),
2666 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2668 m_freem(args->m);
2669 } else {
2670 m_freem(args->m);
2672 args->m = NULL;
2676 * Given an ip_fw *, lookup_next_rule will return a pointer
2677 * to the next rule, which can be either the jump
2678 * target (for skipto instructions) or the next one in the list (in
2679 * all other cases including a missing jump target).
2680 * The result is also written in the "next_rule" field of the rule.
2681 * Backward jumps are not allowed, so start looking from the next
2682 * rule...
2684 * This never returns NULL -- in case we do not have an exact match,
2685 * the next rule is returned. When the ruleset is changed,
2686 * pointers are flushed so we are always correct.
2688 static struct ip_fw *
2689 lookup_next_rule(struct ip_fw *me)
2691 struct ip_fw *rule = NULL;
2692 ipfw_insn *cmd;
2694 /* look for action, in case it is a skipto */
2695 cmd = ACTION_PTR(me);
2696 if (cmd->opcode == O_LOG)
2697 cmd += F_LEN(cmd);
2698 if (cmd->opcode == O_SKIPTO) {
2699 for (rule = me->next; rule; rule = rule->next) {
2700 if (rule->rulenum >= cmd->arg1)
2701 break;
2704 if (rule == NULL) /* failure or not a skipto */
2705 rule = me->next;
2706 me->next_rule = rule;
2707 return rule;
2710 static int
2711 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
2712 enum ipfw_opcodes opcode, uid_t uid)
2714 struct in_addr src_ip, dst_ip;
2715 struct inpcbinfo *pi;
2716 boolean_t wildcard;
2717 struct inpcb *pcb;
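/*
 * TCP connections are fully specified, so an exact PCB lookup is
 * used; UDP sockets may be unconnected, so wildcard matches are
 * allowed for them.
 */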
2719 if (fid->proto == IPPROTO_TCP) {
2720 wildcard = FALSE;
2721 pi = &tcbinfo[mycpuid];
2722 } else if (fid->proto == IPPROTO_UDP) {
2723 wildcard = TRUE;
2724 pi = &udbinfo[mycpuid];
2725 } else {
2726 return 0;
2730 * Values in 'fid' are in host byte order
2732 dst_ip.s_addr = htonl(fid->dst_ip);
2733 src_ip.s_addr = htonl(fid->src_ip);
2734 if (oif) {
2735 pcb = in_pcblookup_hash(pi,
2736 dst_ip, htons(fid->dst_port),
2737 src_ip, htons(fid->src_port),
2738 wildcard, oif);
2739 } else {
2740 pcb = in_pcblookup_hash(pi,
2741 src_ip, htons(fid->src_port),
2742 dst_ip, htons(fid->dst_port),
2743 wildcard, NULL);
2745 if (pcb == NULL || pcb->inp_socket == NULL)
2746 return 0;
2748 if (opcode == O_UID) {
2749 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
2750 return !socheckuid(pcb->inp_socket, uid);
2751 #undef socheckuid
2752 } else {
2753 return groupmember(uid, pcb->inp_socket->so_cred);
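/*
 * Match the given address against the IPv4 address (or, with
 * IPFW_IFIP_NET, the network) of the named interface.  The interface
 * address and mask are resolved once and cached in the instruction
 * (IPFW_IFIP_VALID); subsequent matches are a simple masked compare.
 */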
2757 static __inline int
2758 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
2761 if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
2762 struct ifaddr_container *ifac;
2763 struct ifnet *ifp;
2765 ifp = ifunit_netisr(cmd->ifname);
2766 if (ifp == NULL)
2767 return (0);
2769 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2770 struct ifaddr *ia = ifac->ifa;
2772 if (ia->ifa_addr == NULL)
2773 continue;
2774 if (ia->ifa_addr->sa_family != AF_INET)
2775 continue;
2777 cmd->mask.s_addr = INADDR_ANY;
2778 if (cmd->o.arg1 & IPFW_IFIP_NET) {
2779 cmd->mask = ((struct sockaddr_in *)
2780 ia->ifa_netmask)->sin_addr;
2782 if (cmd->mask.s_addr == INADDR_ANY)
2783 cmd->mask.s_addr = INADDR_BROADCAST;
2785 cmd->addr =
2786 ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
2787 cmd->addr.s_addr &= cmd->mask.s_addr;
2789 cmd->o.arg1 |= IPFW_IFIP_VALID;
2790 break;
2792 if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
2793 return (0);
2795 return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
2798 static __inline struct mbuf *
2799 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
2800 struct ip_fw_local *local, struct ip **ip0)
2802 struct ip *ip = mtod(m, struct ip *);
2803 struct tcphdr *tcp;
2804 struct udphdr *udp;
2807 * Collect parameters into local variables for faster matching.
2809 if (hlen == 0) { /* do not grab addresses for non-ip pkts */
2810 local->proto = args->f_id.proto = 0; /* mark f_id invalid */
2811 goto done;
2814 local->proto = args->f_id.proto = ip->ip_p;
2815 local->src_ip = ip->ip_src;
2816 local->dst_ip = ip->ip_dst;
2817 if (args->eh != NULL) { /* layer 2 packets are as on the wire */
2818 local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
2819 local->ip_len = ntohs(ip->ip_len);
2820 } else {
2821 local->offset = ip->ip_off & IP_OFFMASK;
2822 local->ip_len = ip->ip_len;
2825 #define PULLUP_TO(len) \
2826 do { \
2827 if (m->m_len < (len)) { \
2828 args->m = m = m_pullup(m, (len)); \
2829 if (m == NULL) { \
2830 ip = NULL; \
2831 goto done; \
2833 ip = mtod(m, struct ip *); \
2835 } while (0)
2837 if (local->offset == 0) {
2838 switch (local->proto) {
2839 case IPPROTO_TCP:
2840 PULLUP_TO(hlen + sizeof(struct tcphdr));
2841 tcp = L3HDR(struct tcphdr, ip);
2842 local->dst_port = tcp->th_dport;
2843 local->src_port = tcp->th_sport;
2844 args->f_id.flags = tcp->th_flags;
2845 break;
2847 case IPPROTO_UDP:
2848 PULLUP_TO(hlen + sizeof(struct udphdr));
2849 udp = L3HDR(struct udphdr, ip);
2850 local->dst_port = udp->uh_dport;
2851 local->src_port = udp->uh_sport;
2852 break;
2854 case IPPROTO_ICMP:
2855 PULLUP_TO(hlen + 4); /* type, code and checksum. */
2856 args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
2857 break;
2859 default:
2860 break;
2864 #undef PULLUP_TO
2866 args->f_id.src_ip = ntohl(local->src_ip.s_addr);
2867 args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
2868 args->f_id.src_port = local->src_port = ntohs(local->src_port);
2869 args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
2870 done:
2871 *ip0 = ip;
2872 return (m);
2876 * The main check routine for the firewall.
2878 * All arguments are in args so we can modify them and return them
2879 * to the caller.
2881 * Parameters:
2883 * args->m (in/out) The packet; we set to NULL when/if we nuke it.
2884 * Starts with the IP header.
2885 * args->eh (in) Mac header if present, or NULL for layer3 packet.
2886 * args->oif Outgoing interface, or NULL if packet is incoming.
2887 * The incoming interface is in the mbuf. (in)
2889 * args->rule Pointer to the last matching rule (in/out)
2890 * args->f_id Addresses grabbed from the packet (out)
2892 * Return value:
2894 * If the packet was denied/rejected and has been dropped, *m is equal
2895 * to NULL upon return.
2897 * IP_FW_DENY the packet must be dropped.
2898 * IP_FW_PASS The packet is to be accepted and routed normally.
2899 * IP_FW_DIVERT Divert the packet to port (args->cookie)
2900 * IP_FW_TEE Tee the packet to port (args->cookie)
2901 * IP_FW_DUMMYNET Send the packet to pipe/queue (args->cookie)
2902 * IP_FW_CONTINUE Continue processing on another cpu.
2904 static int
2905 ipfw_chk(struct ip_fw_args *args)
2908 * Local variables hold state during the processing of a packet.
2910 * IMPORTANT NOTE: to speed up the processing of rules, there
2911 * are some assumption on the values of the variables, which
2912 * are documented here. Should you change them, please check
2913 * the implementation of the various instructions to make sure
2914 * that they still work.
2916 * args->eh The MAC header. It is non-NULL for a layer-2
2917 * packet and NULL for a layer-3 packet.
2919 * m | args->m Pointer to the mbuf, as received from the caller.
2920 * It may change if ipfw_chk() does an m_pullup, or if it
2921 * consumes the packet because it calls send_reject().
2922 * XXX This has to change, so that ipfw_chk() never modifies
2923 * or consumes the buffer.
2924 * ip is simply an alias of the value of m, and it is kept
2925 * in sync with it (the packet is supposed to start with
2926 * the ip header).
2928 struct mbuf *m = args->m;
2929 struct ip *ip = mtod(m, struct ip *);
2932 * oif | args->oif If NULL, ipfw_chk has been called on the
2933 * inbound path (ether_input, ip_input).
2934 * If non-NULL, ipfw_chk has been called on the outbound path
2935 * (ether_output, ip_output).
2937 struct ifnet *oif = args->oif;
2939 struct ip_fw *f = NULL; /* matching rule */
2940 int retval = IP_FW_PASS;
2941 struct m_tag *mtag;
2942 struct divert_info *divinfo;
2945 * hlen The length of the IPv4 header.
2946 * hlen >0 means we have an IPv4 packet.
2948 u_int hlen = 0; /* hlen >0 means we have an IP pkt */
2950 struct ip_fw_local lc;
2953 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
2954 * MATCH_NONE when checked and not matched (dyn_f = NULL),
2955 * MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
2957 int dyn_dir = MATCH_UNKNOWN;
2958 struct ip_fw *dyn_f = NULL;
2959 int cpuid = mycpuid;
2960 struct ipfw_context *ctx;
2962 ASSERT_NETISR_NCPUS(cpuid);
2963 ctx = ipfw_ctx[cpuid];
2965 if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
2966 return IP_FW_PASS; /* accept */
2968 if (args->eh == NULL || /* layer 3 packet */
2969 (m->m_pkthdr.len >= sizeof(struct ip) &&
2970 ntohs(args->eh->ether_type) == ETHERTYPE_IP))
2971 hlen = ip->ip_hl << 2;
2973 memset(&lc, 0, sizeof(lc));
2975 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
2976 if (m == NULL)
2977 goto pullup_failed;
2979 if (args->rule) {
2981 * Packet has already been tagged. Look for the next rule
2982 * to restart processing.
2984 * If fw_one_pass != 0 then just accept it.
2985 * XXX should not happen here, but optimized out in
2986 * the caller.
2988 if (fw_one_pass && !args->cont)
2989 return IP_FW_PASS;
2990 args->cont = 0;
2992 /* This rule is being/has been flushed */
2993 if (ipfw_flushing)
2994 return IP_FW_DENY;
2996 KASSERT(args->rule->cpuid == cpuid,
2997 ("rule used on cpu%d", cpuid));
2999 /* This rule was deleted */
3000 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3001 return IP_FW_DENY;
3003 f = args->rule->next_rule;
3004 if (f == NULL)
3005 f = lookup_next_rule(args->rule);
3006 } else {
3008 * Find the starting rule. It can be either the first
3009 * one, or the one after divert_rule if asked so.
3011 int skipto;
3013 KKASSERT(!args->cont);
3015 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3016 if (mtag != NULL) {
3017 divinfo = m_tag_data(mtag);
3018 skipto = divinfo->skipto;
3019 } else {
3020 skipto = 0;
3023 f = ctx->ipfw_layer3_chain;
3024 if (args->eh == NULL && skipto != 0) {
3025 /* No skipto during rule flushing */
3026 if (ipfw_flushing)
3027 return IP_FW_DENY;
3029 if (skipto >= IPFW_DEFAULT_RULE)
3030 return IP_FW_DENY; /* invalid */
3032 while (f && f->rulenum <= skipto)
3033 f = f->next;
3034 if (f == NULL) /* drop packet */
3035 return IP_FW_DENY;
3036 } else if (ipfw_flushing) {
3037 /* Rules are being flushed; skip to default rule */
3038 f = ctx->ipfw_default_rule;
3041 if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3042 m_tag_delete(m, mtag);
3045 * Now scan the rules, and parse microinstructions for each rule.
3047 for (; f; f = f->next) {
3048 int l, cmdlen;
3049 ipfw_insn *cmd;
3050 int skip_or; /* skip rest of OR block */
3052 again:
3053 if (ctx->ipfw_set_disable & (1 << f->set))
3054 continue;
3056 skip_or = 0;
3057 for (l = f->cmd_len, cmd = f->cmd; l > 0;
3058 l -= cmdlen, cmd += cmdlen) {
3059 int match;
3062 * check_body is a jump target used when we find a
3063 * CHECK_STATE, and need to jump to the body of
3064 * the target rule.
3067 check_body:
3068 cmdlen = F_LEN(cmd);
3070 * An OR block (insn_1 || .. || insn_n) has the
3071 * F_OR bit set in all but the last instruction.
3072 * The first match will set "skip_or", and cause
3073 * the following instructions to be skipped until
3074 * past the one with the F_OR bit clear.
3076 if (skip_or) { /* skip this instruction */
3077 if ((cmd->len & F_OR) == 0)
3078 skip_or = 0; /* next one is good */
3079 continue;
3081 match = 0; /* set to 1 if we succeed */
3083 switch (cmd->opcode) {
3085 * The first set of opcodes compares the packet's
3086 * fields with some pattern, setting 'match' if a
3087 * match is found. At the end of the loop there is
3088 * logic to deal with F_NOT and F_OR flags associated
3089 * with the opcode.
3091 case O_NOP:
3092 match = 1;
3093 break;
3095 case O_FORWARD_MAC:
3096 kprintf("ipfw: opcode %d unimplemented\n",
3097 cmd->opcode);
3098 break;
3100 case O_GID:
3101 case O_UID:
3103 * We only check offset == 0 && proto != 0,
3104 * as this ensures that we have an IPv4
3105 * packet with the ports info.
3107 if (lc.offset != 0)
3108 break;
3110 match = ipfw_match_uid(&args->f_id, oif,
3111 cmd->opcode,
3112 (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3113 break;
3115 case O_RECV:
3116 match = iface_match(m->m_pkthdr.rcvif,
3117 (ipfw_insn_if *)cmd);
3118 break;
3120 case O_XMIT:
3121 match = iface_match(oif, (ipfw_insn_if *)cmd);
3122 break;
3124 case O_VIA:
3125 match = iface_match(oif ? oif :
3126 m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3127 break;
3129 case O_MACADDR2:
3130 if (args->eh != NULL) { /* have MAC header */
3131 uint32_t *want = (uint32_t *)
3132 ((ipfw_insn_mac *)cmd)->addr;
3133 uint32_t *mask = (uint32_t *)
3134 ((ipfw_insn_mac *)cmd)->mask;
3135 uint32_t *hdr = (uint32_t *)args->eh;
3137 match =
3138 (want[0] == (hdr[0] & mask[0]) &&
3139 want[1] == (hdr[1] & mask[1]) &&
3140 want[2] == (hdr[2] & mask[2]));
3142 break;
3144 case O_MAC_TYPE:
3145 if (args->eh != NULL) {
3146 uint16_t t =
3147 ntohs(args->eh->ether_type);
3148 uint16_t *p =
3149 ((ipfw_insn_u16 *)cmd)->ports;
3150 int i;
3152 /* Special vlan handling */
3153 if (m->m_flags & M_VLANTAG)
3154 t = ETHERTYPE_VLAN;
3156 for (i = cmdlen - 1; !match && i > 0;
3157 i--, p += 2) {
3158 match =
3159 (t >= p[0] && t <= p[1]);
3162 break;
3164 case O_FRAG:
3165 match = (hlen > 0 && lc.offset != 0);
3166 break;
3168 case O_IPFRAG:
3169 if (hlen > 0) {
3170 uint16_t off;
3172 if (args->eh != NULL)
3173 off = ntohs(ip->ip_off);
3174 else
3175 off = ip->ip_off;
3176 if (off & (IP_MF | IP_OFFMASK))
3177 match = 1;
3179 break;
3181 case O_IN: /* "out" is "not in" */
3182 match = (oif == NULL);
3183 break;
3185 case O_LAYER2:
3186 match = (args->eh != NULL);
3187 break;
3189 case O_PROTO:
3191 * We do not allow an arg of 0, so checking
3192 * "proto" alone suffices.
3194 match = (lc.proto == cmd->arg1);
3195 break;
3197 case O_IP_SRC:
3198 match = (hlen > 0 &&
3199 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3200 lc.src_ip.s_addr);
3201 break;
3203 case O_IP_SRC_MASK:
3204 match = (hlen > 0 &&
3205 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3206 (lc.src_ip.s_addr &
3207 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3208 break;
3210 case O_IP_SRC_ME:
3211 if (hlen > 0) {
3212 struct ifnet *tif;
3214 tif = INADDR_TO_IFP(&lc.src_ip);
3215 match = (tif != NULL);
3217 break;
3219 case O_IP_SRC_TABLE:
3220 match = ipfw_table_lookup(ctx, cmd->arg1,
3221 &lc.src_ip);
3222 break;
3224 case O_IP_SRC_IFIP:
3225 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3226 &lc.src_ip);
3227 break;
3229 case O_IP_DST_SET:
3230 case O_IP_SRC_SET:
3231 if (hlen > 0) {
3232 uint32_t *d = (uint32_t *)(cmd + 1);
3233 uint32_t addr =
3234 cmd->opcode == O_IP_DST_SET ?
3235 args->f_id.dst_ip :
3236 args->f_id.src_ip;
3238 if (addr < d[0])
3239 break;
3240 addr -= d[0]; /* subtract base */
3241 match =
3242 (addr < cmd->arg1) &&
3243 (d[1 + (addr >> 5)] &
3244 (1 << (addr & 0x1f)));
3246 break;
3248 case O_IP_DST:
3249 match = (hlen > 0 &&
3250 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3251 lc.dst_ip.s_addr);
3252 break;
3254 case O_IP_DST_MASK:
3255 match = (hlen > 0) &&
3256 (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3257 (lc.dst_ip.s_addr &
3258 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3259 break;
3261 case O_IP_DST_ME:
3262 if (hlen > 0) {
3263 struct ifnet *tif;
3265 tif = INADDR_TO_IFP(&lc.dst_ip);
3266 match = (tif != NULL);
3268 break;
3270 case O_IP_DST_TABLE:
3271 match = ipfw_table_lookup(ctx, cmd->arg1,
3272 &lc.dst_ip);
3273 break;
3275 case O_IP_DST_IFIP:
3276 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3277 &lc.dst_ip);
3278 break;
3280 case O_IP_SRCPORT:
3281 case O_IP_DSTPORT:
3283 * offset == 0 && proto != 0 is enough
3284 * to guarantee that we have an IPv4
3285 * packet with port info.
3287 if ((lc.proto == IPPROTO_UDP ||
3288 lc.proto == IPPROTO_TCP)
3289 && lc.offset == 0) {
3290 uint16_t x =
3291 (cmd->opcode == O_IP_SRCPORT) ?
3292 lc.src_port : lc.dst_port;
3293 uint16_t *p =
3294 ((ipfw_insn_u16 *)cmd)->ports;
3295 int i;
3297 for (i = cmdlen - 1; !match && i > 0;
3298 i--, p += 2) {
3299 match =
3300 (x >= p[0] && x <= p[1]);
3303 break;
3305 case O_ICMPTYPE:
3306 match = (lc.offset == 0 &&
3307 lc.proto == IPPROTO_ICMP &&
3308 icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3309 break;
3311 case O_IPOPT:
3312 match = (hlen > 0 && ipopts_match(ip, cmd));
3313 break;
3315 case O_IPVER:
3316 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3317 break;
3319 case O_IPTTL:
3320 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3321 break;
3323 case O_IPID:
3324 match = (hlen > 0 &&
3325 cmd->arg1 == ntohs(ip->ip_id));
3326 break;
3328 case O_IPLEN:
3329 match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3330 break;
3332 case O_IPPRECEDENCE:
3333 match = (hlen > 0 &&
3334 (cmd->arg1 == (ip->ip_tos & 0xe0)));
3335 break;
3337 case O_IPTOS:
3338 match = (hlen > 0 &&
3339 flags_match(cmd, ip->ip_tos));
3340 break;
3342 case O_TCPFLAGS:
3343 match = (lc.proto == IPPROTO_TCP &&
3344 lc.offset == 0 &&
3345 flags_match(cmd,
3346 L3HDR(struct tcphdr,ip)->th_flags));
3347 break;
3349 case O_TCPOPTS:
3350 match = (lc.proto == IPPROTO_TCP &&
3351 lc.offset == 0 && tcpopts_match(ip, cmd));
3352 break;
3354 case O_TCPSEQ:
3355 match = (lc.proto == IPPROTO_TCP &&
3356 lc.offset == 0 &&
3357 ((ipfw_insn_u32 *)cmd)->d[0] ==
3358 L3HDR(struct tcphdr,ip)->th_seq);
3359 break;
3361 case O_TCPACK:
3362 match = (lc.proto == IPPROTO_TCP &&
3363 lc.offset == 0 &&
3364 ((ipfw_insn_u32 *)cmd)->d[0] ==
3365 L3HDR(struct tcphdr,ip)->th_ack);
3366 break;
3368 case O_TCPWIN:
3369 match = (lc.proto == IPPROTO_TCP &&
3370 lc.offset == 0 &&
3371 cmd->arg1 ==
3372 L3HDR(struct tcphdr,ip)->th_win);
3373 break;
3375 case O_ESTAB:
3376 /* reject packets which have SYN only */
3377 /* XXX should i also check for TH_ACK ? */
3378 match = (lc.proto == IPPROTO_TCP &&
3379 lc.offset == 0 &&
3380 (L3HDR(struct tcphdr,ip)->th_flags &
3381 (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3382 break;
3384 case O_LOG:
3385 if (fw_verbose) {
3386 ipfw_log(ctx, f, hlen, args->eh, m,
3387 oif);
3389 match = 1;
3390 break;
3392 case O_PROB:
3393 match = (krandom() <
3394 ((ipfw_insn_u32 *)cmd)->d[0]);
3395 break;
3398 * The second set of opcodes represents 'actions',
3399 * i.e. the terminal part of a rule once the packet
3400 * matches all previous patterns.
3401 * Typically there is only one action for each rule,
3402 * and the opcode is stored at the end of the rule
3403 * (but there are exceptions -- see below).
3405 * In general, here we set retval and terminate the
3406 * outer loop (would be a 'break 3' in some language,
3407 * but we need to do a 'goto done').
3409 * Exceptions:
3410 * O_COUNT and O_SKIPTO actions:
3411 * instead of terminating, we jump to the next rule
3412 * ('goto next_rule', equivalent to a 'break 2'),
3413 * or to the SKIPTO target ('goto again' after
3414 * having set f, cmd and l), respectively.
3416 * O_LIMIT and O_KEEP_STATE: these opcodes are
3417 * not real 'actions', and are stored right
3418 * before the 'action' part of the rule.
3419 * These opcodes try to install an entry in the
3420 * state tables; if successful, we continue with
3421 * the next opcode (match=1; break;), otherwise
3422 * the packet must be dropped ('goto done' after
3423 * setting retval). If static rules are changed
3424 * during the state installation, the packet will
3425 * be dropped and rule's stats will not be updated
3426 * ('return IP_FW_DENY').
3428 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
3429 * cause a lookup of the state table, and a jump
3430 * to the 'action' part of the parent rule
3431 * ('goto check_body') if an entry is found, or
3432 * (CHECK_STATE only) a jump to the next rule if
3433 * the entry is not found ('goto next_rule').
3434 * The result of the lookup is cached so that
3435 * further instances of these opcodes are
3436 * effectively NOPs. If static rules are changed
3437 * during the state lookup, the packet will
3438 * be dropped and rule's stats will not be updated
3439 * ('return IP_FW_DENY').
3441 case O_LIMIT:
3442 case O_KEEP_STATE:
3443 if (ipfw_state_install(ctx, f,
3444 (ipfw_insn_limit *)cmd, args,
3445 (lc.offset == 0 &&
3446 lc.proto == IPPROTO_TCP) ?
3447 L3HDR(struct tcphdr, ip) : NULL)) {
3448 retval = IP_FW_DENY;
3449 goto done; /* error/limit violation */
3451 match = 1;
3452 break;
3454 case O_PROBE_STATE:
3455 case O_CHECK_STATE:
3457 * States are checked at the first keep-state or
3458 * check-state occurrence, with the result
3459 * being stored in dyn_dir. The compiler
3460 * introduces a PROBE_STATE instruction for
3461 * us when we have a KEEP_STATE/LIMIT (because
3462 * PROBE_STATE needs to be run first).
3464 if (dyn_dir == MATCH_UNKNOWN) {
3465 dyn_f = ipfw_state_lookup_rule(ctx,
3466 &args->f_id, &dyn_dir,
3467 (lc.offset == 0 &&
3468 lc.proto == IPPROTO_TCP) ?
3469 L3HDR(struct tcphdr, ip) : NULL,
3470 lc.ip_len);
3471 if (dyn_f != NULL) {
3473 * Found a rule from a state;
3474 * jump to the 'action' part
3475 * of the rule.
3477 f = dyn_f;
3478 cmd = ACTION_PTR(f);
3479 l = f->cmd_len - f->act_ofs;
3480 goto check_body;
3484 * State not found. If CHECK_STATE, skip to the
3485 * next rule; if PROBE_STATE, just ignore it and
3486 * continue with the next opcode.
3488 if (cmd->opcode == O_CHECK_STATE)
3489 goto next_rule;
3490 match = 1;
3491 break;
3493 case O_ACCEPT:
3494 retval = IP_FW_PASS; /* accept */
3495 goto done;
3497 case O_DEFRAG:
3498 if (f->cross_rules == NULL) {
3500 * This rule was not completely setup;
3501 * move on to the next rule.
3503 goto next_rule;
3507 * Don't defrag for l2 packets, output packets
3508 * or non-fragments.
3510 if (oif != NULL || args->eh != NULL ||
3511 (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
3512 goto next_rule;
3514 ctx->ipfw_frags++;
3515 m = ip_reass(m);
3516 args->m = m;
3517 if (m == NULL) {
3518 retval = IP_FW_PASS;
3519 goto done;
3521 ctx->ipfw_defraged++;
3522 KASSERT((m->m_flags & M_HASH) == 0,
3523 ("hash not cleared"));
3525 /* Update statistics */
3526 f->pcnt++;
3527 f->bcnt += lc.ip_len;
3528 f->timestamp = time_second;
3530 ip = mtod(m, struct ip *);
3531 hlen = ip->ip_hl << 2;
3532 ip->ip_len += hlen;
3534 ip->ip_len = htons(ip->ip_len);
3535 ip->ip_off = htons(ip->ip_off);
3537 ip_hashfn(&m, 0);
3538 args->m = m;
3539 if (m == NULL)
3540 goto pullup_failed;
3542 KASSERT(m->m_flags & M_HASH, ("no hash"));
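/*
 * The reassembled datagram may hash to a different CPU.  If so,
 * record this rule in args->rule and return IP_FW_CONTINUE so the
 * caller re-dispatches the packet to that CPU, where processing
 * continues with the rule following this one.
 */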
3543 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
3544 if (cpuid != mycpuid) {
3546 * NOTE:
3547 * ip_len/ip_off are in network byte
3548 * order.
3550 ctx->ipfw_defrag_remote++;
3551 args->rule = f;
3552 return (IP_FW_CONTINUE);
3555 /* 'm' might be changed by ip_hashfn(). */
3556 ip = mtod(m, struct ip *);
3557 ip->ip_len = ntohs(ip->ip_len);
3558 ip->ip_off = ntohs(ip->ip_off);
3560 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3561 if (m == NULL)
3562 goto pullup_failed;
3564 /* Move on. */
3565 goto next_rule;
3567 case O_PIPE:
3568 case O_QUEUE:
3569 args->rule = f; /* report matching rule */
3570 args->cookie = cmd->arg1;
3571 retval = IP_FW_DUMMYNET;
3572 goto done;
3574 case O_DIVERT:
3575 case O_TEE:
3576 if (args->eh) /* not on layer 2 */
3577 break;
3579 mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
3580 sizeof(*divinfo), M_INTWAIT | M_NULLOK);
3581 if (mtag == NULL) {
3582 retval = IP_FW_DENY;
3583 goto done;
3585 divinfo = m_tag_data(mtag);
3587 divinfo->skipto = f->rulenum;
3588 divinfo->port = cmd->arg1;
3589 divinfo->tee = (cmd->opcode == O_TEE);
3590 m_tag_prepend(m, mtag);
3592 args->cookie = cmd->arg1;
3593 retval = (cmd->opcode == O_DIVERT) ?
3594 IP_FW_DIVERT : IP_FW_TEE;
3595 goto done;
3597 case O_COUNT:
3598 case O_SKIPTO:
3599 f->pcnt++; /* update stats */
3600 f->bcnt += lc.ip_len;
3601 f->timestamp = time_second;
3602 if (cmd->opcode == O_COUNT)
3603 goto next_rule;
3604 /* handle skipto */
3605 if (f->next_rule == NULL)
3606 lookup_next_rule(f);
3607 f = f->next_rule;
3608 goto again;
3610 case O_REJECT:
3612 * Drop the packet and send a reject notice
3613 * if the packet is not ICMP (or is an ICMP
3614 * query), and it is not multicast/broadcast.
3616 if (hlen > 0 &&
3617 (lc.proto != IPPROTO_ICMP ||
3618 is_icmp_query(ip)) &&
3619 !(m->m_flags & (M_BCAST|M_MCAST)) &&
3620 !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
3621 send_reject(args, cmd->arg1,
3622 lc.offset, lc.ip_len);
3623 retval = IP_FW_DENY;
3624 goto done;
3626 /* FALLTHROUGH */
3627 case O_DENY:
3628 retval = IP_FW_DENY;
3629 goto done;
3631 case O_FORWARD_IP:
3632 if (args->eh) /* not valid on layer2 pkts */
3633 break;
3634 if (!dyn_f || dyn_dir == MATCH_FORWARD) {
3635 struct sockaddr_in *sin;
3637 mtag = m_tag_get(PACKET_TAG_IPFORWARD,
3638 sizeof(*sin), M_INTWAIT | M_NULLOK);
3639 if (mtag == NULL) {
3640 retval = IP_FW_DENY;
3641 goto done;
3643 sin = m_tag_data(mtag);
3645 /* Structure copy */
3646 *sin = ((ipfw_insn_sa *)cmd)->sa;
3648 m_tag_prepend(m, mtag);
3649 m->m_pkthdr.fw_flags |=
3650 IPFORWARD_MBUF_TAGGED;
3651 m->m_pkthdr.fw_flags &=
3652 ~BRIDGE_MBUF_TAGGED;
3654 retval = IP_FW_PASS;
3655 goto done;
3657 default:
3658 panic("-- unknown opcode %d", cmd->opcode);
3659 } /* end of switch() on opcodes */
3661 if (cmd->len & F_NOT)
3662 match = !match;
3664 if (match) {
3665 if (cmd->len & F_OR)
3666 skip_or = 1;
3667 } else {
3668 if (!(cmd->len & F_OR)) /* not an OR block, */
3669 break; /* try next rule */
3672 } /* end of inner for, scan opcodes */
3674 next_rule:; /* try next rule */
3676 } /* end of outer for, scan rules */
3677 kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
3678 return IP_FW_DENY;
3680 done:
3681 /* Update statistics */
3682 f->pcnt++;
3683 f->bcnt += lc.ip_len;
3684 f->timestamp = time_second;
3685 return retval;
3687 pullup_failed:
3688 if (fw_verbose)
3689 kprintf("pullup failed\n");
3690 return IP_FW_DENY;
3693 static struct mbuf *
3694 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
3696 struct m_tag *mtag;
3697 struct dn_pkt *pkt;
3698 ipfw_insn *cmd;
3699 const struct ipfw_flow_id *id;
3700 struct dn_flow_id *fid;
3702 M_ASSERTPKTHDR(m);
3704 mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
3705 M_INTWAIT | M_NULLOK);
3706 if (mtag == NULL) {
3707 m_freem(m);
3708 return (NULL);
3710 m_tag_prepend(m, mtag);
3712 pkt = m_tag_data(mtag);
3713 bzero(pkt, sizeof(*pkt));
3715 cmd = fwa->rule->cmd + fwa->rule->act_ofs;
3716 if (cmd->opcode == O_LOG)
3717 cmd += F_LEN(cmd);
3718 KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
3719 ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
3721 pkt->dn_m = m;
3722 pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
3723 pkt->ifp = fwa->oif;
3724 pkt->pipe_nr = pipe_nr;
3726 pkt->cpuid = mycpuid;
3727 pkt->msgport = netisr_curport();
3729 id = &fwa->f_id;
3730 fid = &pkt->id;
3731 fid->fid_dst_ip = id->dst_ip;
3732 fid->fid_src_ip = id->src_ip;
3733 fid->fid_dst_port = id->dst_port;
3734 fid->fid_src_port = id->src_port;
3735 fid->fid_proto = id->proto;
3736 fid->fid_flags = id->flags;
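/*
 * Hold a reference on the matching rule for as long as the packet is
 * queued in dummynet; dummynet releases it through dn_unref_priv().
 */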
3738 ipfw_ref_rule(fwa->rule);
3739 pkt->dn_priv = fwa->rule;
3740 pkt->dn_unref_priv = ipfw_unref_rule;
3742 if (cmd->opcode == O_PIPE)
3743 pkt->dn_flags |= DN_FLAGS_IS_PIPE;
3745 m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
3746 return (m);
3750 * When a rule is added/deleted, clear the next_rule pointers in all rules.
3751 * These will be reconstructed on the fly as packets are matched.
3753 static void
3754 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
3756 struct ip_fw *rule;
3758 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
3759 rule->next_rule = NULL;
3762 static __inline void
3763 ipfw_inc_static_count(struct ip_fw *rule)
3765 /* Static rule's counts are updated only on CPU0 */
3766 KKASSERT(mycpuid == 0);
3768 static_count++;
3769 static_ioc_len += IOC_RULESIZE(rule);
3772 static __inline void
3773 ipfw_dec_static_count(struct ip_fw *rule)
3775 int l = IOC_RULESIZE(rule);
3777 /* Static rule's counts are updated only on CPU0 */
3778 KKASSERT(mycpuid == 0);
3780 KASSERT(static_count > 0, ("invalid static count %u", static_count));
3781 static_count--;
3783 KASSERT(static_ioc_len >= l,
3784 ("invalid static len %u", static_ioc_len));
3785 static_ioc_len -= l;
3788 static void
3789 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
3791 if (fwmsg->sibling != NULL) {
3792 KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
3793 fwmsg->sibling->sibling = rule;
3795 fwmsg->sibling = rule;
3798 static struct ip_fw *
3799 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
3801 struct ip_fw *rule;
3803 rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
3805 rule->act_ofs = ioc_rule->act_ofs;
3806 rule->cmd_len = ioc_rule->cmd_len;
3807 rule->rulenum = ioc_rule->rulenum;
3808 rule->set = ioc_rule->set;
3809 rule->usr_flags = ioc_rule->usr_flags;
3811 bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
3813 rule->refcnt = 1;
3814 rule->cpuid = mycpuid;
3815 rule->rule_flags = rule_flags;
3817 return rule;
3820 static void
3821 ipfw_add_rule_dispatch(netmsg_t nmsg)
3823 struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
3824 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3825 struct ip_fw *rule;
3827 ASSERT_NETISR_NCPUS(mycpuid);
3829 rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);
3832 * Insert rule into the pre-determined position
3834 if (fwmsg->prev_rule != NULL) {
3835 struct ip_fw *prev, *next;
3837 prev = fwmsg->prev_rule;
3838 KKASSERT(prev->cpuid == mycpuid);
3840 next = fwmsg->next_rule;
3841 KKASSERT(next->cpuid == mycpuid);
3843 rule->next = next;
3844 prev->next = rule;
3847 * Move to the position on the next CPU
3848 * before the msg is forwarded.
3850 fwmsg->prev_rule = prev->sibling;
3851 fwmsg->next_rule = next->sibling;
3852 } else {
3853 KKASSERT(fwmsg->next_rule == NULL);
3854 rule->next = ctx->ipfw_layer3_chain;
3855 ctx->ipfw_layer3_chain = rule;
3858 /* Link rule CPU sibling */
3859 ipfw_link_sibling(fwmsg, rule);
3861 ipfw_flush_rule_ptrs(ctx);
3863 if (mycpuid == 0) {
3864 /* Statistics only need to be updated once */
3865 ipfw_inc_static_count(rule);
3867 /* Return the rule on CPU0 */
3868 nmsg->lmsg.u.ms_resultp = rule;
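/*
 * All per-CPU copies of this rule use the CPU0 rule's address as
 * their track ruleid, so they share a single entry in the global
 * trkcnt tree.
 */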
3871 if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
3872 rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;
3874 if (fwmsg->cross_rules != NULL) {
3875 /* Save rules for later use. */
3876 fwmsg->cross_rules[mycpuid] = rule;
3879 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
3882 static void
3883 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
3885 struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
3886 struct ip_fw *rule = fwmsg->sibling;
3887 int sz = sizeof(struct ip_fw *) * netisr_ncpus;
3889 ASSERT_NETISR_NCPUS(mycpuid);
3890 KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
3891 ("not crossref rule"));
3893 rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
3894 memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
3896 fwmsg->sibling = rule->sibling;
3897 netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
3901 * Add a new rule to the list. Copy the rule into a malloc'ed area,
3902 * then possibly create a rule number and add the rule to the list.
3903 * Update the rule_number in the input struct so the caller knows
3904 * it as well.
3906 static void
3907 ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
3909 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3910 struct netmsg_ipfw fwmsg;
3911 struct ip_fw *f, *prev, *rule;
3913 ASSERT_NETISR0;
3916 * If rulenum is 0, find the highest numbered rule before the
3917 * default rule, and add the auto-increment step to it.
3919 if (ioc_rule->rulenum == 0) {
3920 int step = autoinc_step;
3922 KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
3923 step <= IPFW_AUTOINC_STEP_MAX);
3926 * Locate the highest numbered rule before default
3928 for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
3929 if (f->rulenum == IPFW_DEFAULT_RULE)
3930 break;
3931 ioc_rule->rulenum = f->rulenum;
3933 if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
3934 ioc_rule->rulenum += step;
3936 KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
3937 ioc_rule->rulenum != 0,
3938 ("invalid rule num %d", ioc_rule->rulenum));
3941 * Now find the right place for the new rule in the sorted list.
3943 for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
3944 prev = f, f = f->next) {
3945 if (f->rulenum > ioc_rule->rulenum) {
3946 /* Found the location */
3947 break;
3950 KASSERT(f != NULL, ("no default rule?!"));
3953 * Duplicate the rule onto each CPU.
3954 * The rule duplicated on CPU0 will be returned.
3956 bzero(&fwmsg, sizeof(fwmsg));
3957 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
3958 ipfw_add_rule_dispatch);
3959 fwmsg.ioc_rule = ioc_rule;
3960 fwmsg.prev_rule = prev;
3961 fwmsg.next_rule = prev == NULL ? NULL : f;
3962 fwmsg.rule_flags = rule_flags;
3963 if (rule_flags & IPFW_RULE_F_CROSSREF) {
3964 fwmsg.cross_rules = kmalloc(
3965 sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
3966 M_WAITOK | M_ZERO);
3969 netisr_domsg_global(&fwmsg.base);
3970 KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);
3972 rule = fwmsg.base.lmsg.u.ms_resultp;
3973 KKASSERT(rule != NULL && rule->cpuid == mycpuid);
3975 if (fwmsg.cross_rules != NULL) {
3976 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
3977 MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
3978 fwmsg.sibling = rule;
3979 netisr_domsg_global(&fwmsg.base);
3980 KKASSERT(fwmsg.sibling == NULL);
3982 kfree(fwmsg.cross_rules, M_TEMP);
3984 #ifdef KLD_MODULE
3985 atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
3986 #endif
3989 DPRINTF("++ installed rule %d, static count now %d\n",
3990 rule->rulenum, static_count);
3994 * Free storage associated with a static rule (including derived
3995 * states/tracks).
3996 * The caller is in charge of clearing rule pointers to avoid
3997 * dangling pointers.
3998 * @return a pointer to the next entry.
3999 * Arguments are not checked, so they better be correct.
4001 static struct ip_fw *
4002 ipfw_delete_rule(struct ipfw_context *ctx,
4003 struct ip_fw *prev, struct ip_fw *rule)
4005 struct ip_fw *n;
4007 n = rule->next;
4008 if (prev == NULL)
4009 ctx->ipfw_layer3_chain = n;
4010 else
4011 prev->next = n;
4013 /* Mark the rule as invalid */
4014 rule->rule_flags |= IPFW_RULE_F_INVALID;
4015 rule->next_rule = NULL;
4016 rule->sibling = NULL;
4017 #ifdef foo
4018 /* Don't reset cpuid here; keep various assertion working */
4019 rule->cpuid = -1;
4020 #endif
4022 /* Statistics only need to be updated once */
4023 if (mycpuid == 0)
4024 ipfw_dec_static_count(rule);
4026 if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
4027 /* Try to free this rule */
4028 ipfw_free_rule(rule);
4029 } else {
4030 /* TODO: check staging area. */
4031 if (mycpuid == 0) {
4032 rule->next = ipfw_gd.ipfw_crossref_free;
4033 ipfw_gd.ipfw_crossref_free = rule;
4037 /* Return the next rule */
4038 return n;
4041 static void
4042 ipfw_flush_dispatch(netmsg_t nmsg)
4044 int kill_default = nmsg->lmsg.u.ms_result;
4045 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4046 struct ip_fw *rule;
4048 ASSERT_NETISR_NCPUS(mycpuid);
4051 * Flush states.
4053 ipfw_state_flush(ctx, NULL);
4054 KASSERT(ctx->ipfw_state_cnt == 0,
4055 ("%d pcpu states remain", ctx->ipfw_state_cnt));
4056 ctx->ipfw_state_loosecnt = 0;
4057 ctx->ipfw_state_lastexp = 0;
4060 * Flush tracks.
4062 ipfw_track_flush(ctx, NULL);
4063 ctx->ipfw_track_lastexp = 0;
4064 if (ctx->ipfw_trkcnt_spare != NULL) {
4065 kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
4066 ctx->ipfw_trkcnt_spare = NULL;
4069 ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */
4071 while ((rule = ctx->ipfw_layer3_chain) != NULL &&
4072 (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
4073 ipfw_delete_rule(ctx, NULL, rule);
4075 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4079 * Deletes all rules from a chain (including the default rule
4080 * if the second argument is set).
4082 static void
4083 ipfw_flush(int kill_default)
4085 struct netmsg_base nmsg;
4086 #ifdef INVARIANTS
4087 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4088 int state_cnt;
4089 #endif
4091 ASSERT_NETISR0;
4094 * If 'kill_default' then caller has done the necessary
4095 * msgport syncing; unnecessary to do it again.
4097 if (!kill_default) {
4099 * Let ipfw_chk() know the rules are going to
4100 * be flushed, so it could jump directly to
4101 * the default rule.
4103 ipfw_flushing = 1;
4104 /* XXX use priority sync */
4105 netmsg_service_sync();
4109 * Press the 'flush' button
4111 bzero(&nmsg, sizeof(nmsg));
4112 netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4113 ipfw_flush_dispatch);
4114 nmsg.lmsg.u.ms_result = kill_default;
4115 netisr_domsg_global(&nmsg);
4116 ipfw_gd.ipfw_state_loosecnt = 0;
4117 ipfw_gd.ipfw_state_globexp = 0;
4118 ipfw_gd.ipfw_track_globexp = 0;
4120 #ifdef INVARIANTS
4121 state_cnt = ipfw_state_cntcoll();
4122 KASSERT(state_cnt == 0, ("%d states remain", state_cnt));
4124 KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
4125 ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));
4127 if (kill_default) {
4128 KASSERT(static_count == 0,
4129 ("%u static rules remain", static_count));
4130 KASSERT(static_ioc_len == 0,
4131 ("%u bytes of static rules remain", static_ioc_len));
4132 } else {
4133 KASSERT(static_count == 1,
4134 ("%u static rules remain", static_count));
4135 KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
4136 ("%u bytes of static rules remain, should be %lu",
4137 static_ioc_len,
4138 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
4140 #endif
4142 /* Flush is done */
4143 ipfw_flushing = 0;
4146 static void
4147 ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
4149 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4150 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4151 struct ip_fw *rule, *prev;
4153 ASSERT_NETISR_NCPUS(mycpuid);
4155 rule = dmsg->start_rule;
4156 KKASSERT(rule->cpuid == mycpuid);
4157 dmsg->start_rule = rule->sibling;
4159 prev = dmsg->prev_rule;
4160 if (prev != NULL) {
4161 KKASSERT(prev->cpuid == mycpuid);
4164 * Move to the position on the next CPU
4165 * before the msg is forwarded.
4167 dmsg->prev_rule = prev->sibling;
4171 * flush pointers outside the loop, then delete all matching
4172 * rules. 'prev' remains the same throughout the cycle.
4174 ipfw_flush_rule_ptrs(ctx);
4175 while (rule && rule->rulenum == dmsg->rulenum) {
4176 if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4177 /* Flush states generated by this rule. */
4178 ipfw_state_flush(ctx, rule);
4180 if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4181 /* Flush tracks generated by this rule. */
4182 ipfw_track_flush(ctx, rule);
4184 rule = ipfw_delete_rule(ctx, prev, rule);
4187 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4190 static int
4191 ipfw_alt_delete_rule(uint16_t rulenum)
4193 struct ip_fw *prev, *rule;
4194 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4195 struct netmsg_del dmsg;
4197 ASSERT_NETISR0;
4200 * Locate first rule to delete
4202 for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4203 rule && rule->rulenum < rulenum;
4204 prev = rule, rule = rule->next)
4205 ; /* EMPTY */
4206 if (rule->rulenum != rulenum)
4207 return EINVAL;
4210 * Get rid of the rule duplications on all CPUs
4212 bzero(&dmsg, sizeof(dmsg));
4213 netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4214 ipfw_alt_delete_rule_dispatch);
4215 dmsg.prev_rule = prev;
4216 dmsg.start_rule = rule;
4217 dmsg.rulenum = rulenum;
4219 netisr_domsg_global(&dmsg.base);
4220 KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4221 return 0;
4224 static void
4225 ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
4227 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4228 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4229 struct ip_fw *prev, *rule;
4230 #ifdef INVARIANTS
4231 int del = 0;
4232 #endif
4234 ASSERT_NETISR_NCPUS(mycpuid);
4236 ipfw_flush_rule_ptrs(ctx);
4238 prev = NULL;
4239 rule = ctx->ipfw_layer3_chain;
4240 while (rule != NULL) {
4241 if (rule->set == dmsg->from_set) {
4242 if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4243 /* Flush states generated by this rule. */
4244 ipfw_state_flush(ctx, rule);
4246 if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4247 /* Flush tracks generated by this rule. */
4248 ipfw_track_flush(ctx, rule);
4250 rule = ipfw_delete_rule(ctx, prev, rule);
4251 #ifdef INVARIANTS
4252 del = 1;
4253 #endif
4254 } else {
4255 prev = rule;
4256 rule = rule->next;
4259 KASSERT(del, ("no match set?!"));
4261 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4264 static int
4265 ipfw_alt_delete_ruleset(uint8_t set)
4267 struct netmsg_del dmsg;
4268 int del;
4269 struct ip_fw *rule;
4270 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4272 ASSERT_NETISR0;
4275 * Check whether the 'set' exists; if no rule belongs
4276 * to it, there is nothing to delete and we bail out
4277 * early.
4279 del = 0;
4280 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4281 if (rule->set == set)
4282 del = 1;
4284 if (!del)
4285 return 0; /* XXX EINVAL? */
4288 * Delete this set
4290 bzero(&dmsg, sizeof(dmsg));
4291 netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4292 ipfw_alt_delete_ruleset_dispatch);
4293 dmsg.from_set = set;
4294 netisr_domsg_global(&dmsg.base);
4296 return 0;
4299 static void
4300 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
4302 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4303 struct ip_fw *rule;
4305 ASSERT_NETISR_NCPUS(mycpuid);
4307 rule = dmsg->start_rule;
4308 KKASSERT(rule->cpuid == mycpuid);
4311 * Move to the position on the next CPU
4312 * before the msg is forwarded.
4314 dmsg->start_rule = rule->sibling;
4316 while (rule && rule->rulenum <= dmsg->rulenum) {
4317 if (rule->rulenum == dmsg->rulenum)
4318 rule->set = dmsg->to_set;
4319 rule = rule->next;
4321 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4324 static int
4325 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
4327 struct netmsg_del dmsg;
4328 struct netmsg_base *nmsg;
4329 struct ip_fw *rule;
4330 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4332 ASSERT_NETISR0;
4335 * Locate first rule to move
4337 for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
4338 rule = rule->next) {
4339 if (rule->rulenum == rulenum && rule->set != set)
4340 break;
4342 if (rule == NULL || rule->rulenum > rulenum)
4343 return 0; /* XXX error? */
4345 bzero(&dmsg, sizeof(dmsg));
4346 nmsg = &dmsg.base;
4347 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4348 ipfw_alt_move_rule_dispatch);
4349 dmsg.start_rule = rule;
4350 dmsg.rulenum = rulenum;
4351 dmsg.to_set = set;
4353 netisr_domsg_global(nmsg);
4354 KKASSERT(dmsg.start_rule == NULL);
4355 return 0;
4358 static void
4359 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
4361 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4362 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4363 struct ip_fw *rule;
4365 ASSERT_NETISR_NCPUS(mycpuid);
4367 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4368 if (rule->set == dmsg->from_set)
4369 rule->set = dmsg->to_set;
4371 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4374 static int
4375 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
4377 struct netmsg_del dmsg;
4378 struct netmsg_base *nmsg;
4380 ASSERT_NETISR0;
4382 bzero(&dmsg, sizeof(dmsg));
4383 nmsg = &dmsg.base;
4384 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4385 ipfw_alt_move_ruleset_dispatch);
4386 dmsg.from_set = from_set;
4387 dmsg.to_set = to_set;
4389 netisr_domsg_global(nmsg);
4390 return 0;
4393 static void
4394 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
4396 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4397 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4398 struct ip_fw *rule;
4400 ASSERT_NETISR_NCPUS(mycpuid);
4402 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4403 if (rule->set == dmsg->from_set)
4404 rule->set = dmsg->to_set;
4405 else if (rule->set == dmsg->to_set)
4406 rule->set = dmsg->from_set;
4408 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4411 static int
4412 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
4414 struct netmsg_del dmsg;
4415 struct netmsg_base *nmsg;
4417 ASSERT_NETISR0;
4419 bzero(&dmsg, sizeof(dmsg));
4420 nmsg = &dmsg.base;
4421 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4422 ipfw_alt_swap_ruleset_dispatch);
4423 dmsg.from_set = set1;
4424 dmsg.to_set = set2;
4426 netisr_domsg_global(nmsg);
4427 return 0;
4431 * Remove all rules with given number, and also do set manipulation.
4433 * The argument is a uint32_t. The low 16 bits are the rule or set number,
4434 * the next 8 bits are the new set, and the top 8 bits are the command:
4436 * 0 delete rules with given number
4437 * 1 delete rules with given set number
4438 * 2 move rules with given number to new set
4439 * 3 move rules with given set number to new set
4440 * 4 swap sets with given numbers
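 *
 * As an illustration of the layout above (sketch only, not part of any
 * ABI header), a caller could compose the argument as
 *
 *	arg = ((uint32_t)cmd << 24) | ((uint32_t)new_set << 16) | rulenum;
 *
 * so "move rule 100 to set 3" (command 2) is (2 << 24) | (3 << 16) | 100.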
4442 static int
4443 ipfw_ctl_alter(uint32_t arg)
4445 uint16_t rulenum;
4446 uint8_t cmd, new_set;
4447 int error = 0;
4449 ASSERT_NETISR0;
4451 rulenum = arg & 0xffff;
4452 cmd = (arg >> 24) & 0xff;
4453 new_set = (arg >> 16) & 0xff;
4455 if (cmd > 4)
4456 return EINVAL;
4457 if (new_set >= IPFW_DEFAULT_SET)
4458 return EINVAL;
4459 if (cmd == 0 || cmd == 2) {
4460 if (rulenum == IPFW_DEFAULT_RULE)
4461 return EINVAL;
4462 } else {
4463 if (rulenum >= IPFW_DEFAULT_SET)
4464 return EINVAL;
4467 switch (cmd) {
4468 case 0: /* delete rules with given number */
4469 error = ipfw_alt_delete_rule(rulenum);
4470 break;
4472 case 1: /* delete all rules with given set number */
4473 error = ipfw_alt_delete_ruleset(rulenum);
4474 break;
4476 case 2: /* move rules with given number to new set */
4477 error = ipfw_alt_move_rule(rulenum, new_set);
4478 break;
4480 case 3: /* move rules with given set number to new set */
4481 error = ipfw_alt_move_ruleset(rulenum, new_set);
4482 break;
4484 case 4: /* swap two sets */
4485 error = ipfw_alt_swap_ruleset(rulenum, new_set);
4486 break;
4488 return error;
4492 * Clear counters for a specific rule.
4494 static void
4495 clear_counters(struct ip_fw *rule, int log_only)
4497 ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
4499 if (log_only == 0) {
4500 rule->bcnt = rule->pcnt = 0;
4501 rule->timestamp = 0;
4503 if (l->o.opcode == O_LOG)
4504 l->log_left = l->max_log;
4507 static void
4508 ipfw_zero_entry_dispatch(netmsg_t nmsg)
4510 struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
4511 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4512 struct ip_fw *rule;
4514 ASSERT_NETISR_NCPUS(mycpuid);
4516 if (zmsg->rulenum == 0) {
4517 KKASSERT(zmsg->start_rule == NULL);
4519 ctx->ipfw_norule_counter = 0;
4520 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4521 clear_counters(rule, zmsg->log_only);
4522 } else {
4523 struct ip_fw *start = zmsg->start_rule;
4525 KKASSERT(start->cpuid == mycpuid);
4526 KKASSERT(start->rulenum == zmsg->rulenum);
4529 * We can have multiple rules with the same number, so we
4530 * need to clear them all.
4532 for (rule = start; rule && rule->rulenum == zmsg->rulenum;
4533 rule = rule->next)
4534 clear_counters(rule, zmsg->log_only);
4537 * Move to the position on the next CPU
4538 * before the msg is forwarded.
4540 zmsg->start_rule = start->sibling;
4542 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4546 * Reset some or all counters on firewall rules.
4547 * @arg rulenum is 0 to clear all entries, or contains a specific
4548 * rule number.
4549 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
4551 static int
4552 ipfw_ctl_zero_entry(int rulenum, int log_only)
4554 struct netmsg_zent zmsg;
4555 struct netmsg_base *nmsg;
4556 const char *msg;
4557 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4559 ASSERT_NETISR0;
4561 bzero(&zmsg, sizeof(zmsg));
4562 nmsg = &zmsg.base;
4563 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4564 ipfw_zero_entry_dispatch);
4565 zmsg.log_only = log_only;
4567 if (rulenum == 0) {
4568 msg = log_only ? "ipfw: All logging counts reset.\n"
4569 : "ipfw: Accounting cleared.\n";
4570 } else {
4571 struct ip_fw *rule;
4574 * Locate the first rule with 'rulenum'
4576 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4577 if (rule->rulenum == rulenum)
4578 break;
4580 if (rule == NULL) /* we did not find any matching rules */
4581 return (EINVAL);
4582 zmsg.start_rule = rule;
4583 zmsg.rulenum = rulenum;
4585 msg = log_only ? "ipfw: Entry %d logging count reset.\n"
4586 : "ipfw: Entry %d cleared.\n";
4588 netisr_domsg_global(nmsg);
4589 KKASSERT(zmsg.start_rule == NULL);
4591 if (fw_verbose)
4592 log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
4593 return (0);
4597 * Check validity of the structure before insert.
4598 * Fortunately rules are simple, so this mostly needs to check rule sizes.
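 *
 * A minimal example of what passes these checks (illustrative sketch,
 * mirroring how the per-CPU default rule is built in
 * ipfw_ctx_init_dispatch()) is a rule whose cmd[] holds one single-word
 * action and nothing else:
 *
 *	rule->act_ofs = 0;
 *	rule->cmd_len = 1;
 *	rule->cmd[0].len = 1;		(F_LEN() == 1 word)
 *	rule->cmd[0].opcode = O_ACCEPT;	(or O_DENY)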
4600 static int
4601 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
4603 int l, cmdlen = 0;
4604 int have_action = 0;
4605 ipfw_insn *cmd;
4607 *rule_flags = 0;
4609 /* Check for valid size */
4610 if (size < sizeof(*rule)) {
4611 kprintf("ipfw: rule too short\n");
4612 return EINVAL;
4614 l = IOC_RULESIZE(rule);
4615 if (l != size) {
4616 kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
4617 return EINVAL;
4620 /* Check rule number */
4621 if (rule->rulenum == IPFW_DEFAULT_RULE) {
4622 kprintf("ipfw: invalid rule number\n");
4623 return EINVAL;
4627 * Now go for the individual checks. Very simple ones, basically only
4628 * instruction sizes.
4630 for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
4631 l -= cmdlen, cmd += cmdlen) {
4632 cmdlen = F_LEN(cmd);
4633 if (cmdlen > l) {
4634 kprintf("ipfw: opcode %d size truncated\n",
4635 cmd->opcode);
4636 return EINVAL;
4639 DPRINTF("ipfw: opcode %d\n", cmd->opcode);
4641 if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT) {
4642 /* This rule will generate states. */
4643 *rule_flags |= IPFW_RULE_F_GENSTATE;
4644 if (cmd->opcode == O_LIMIT)
4645 *rule_flags |= IPFW_RULE_F_GENTRACK;
4647 if (cmd->opcode == O_DEFRAG)
4648 *rule_flags |= IPFW_RULE_F_CROSSREF;
4649 if (cmd->opcode == O_IP_SRC_IFIP ||
4650 cmd->opcode == O_IP_DST_IFIP) {
4651 *rule_flags |= IPFW_RULE_F_DYNIFADDR;
4652 cmd->arg1 &= IPFW_IFIP_SETTINGS;
4655 switch (cmd->opcode) {
4656 case O_NOP:
4657 case O_PROBE_STATE:
4658 case O_KEEP_STATE:
4659 case O_PROTO:
4660 case O_IP_SRC_ME:
4661 case O_IP_DST_ME:
4662 case O_LAYER2:
4663 case O_IN:
4664 case O_FRAG:
4665 case O_IPFRAG:
4666 case O_IPOPT:
4667 case O_IPLEN:
4668 case O_IPID:
4669 case O_IPTOS:
4670 case O_IPPRECEDENCE:
4671 case O_IPTTL:
4672 case O_IPVER:
4673 case O_TCPWIN:
4674 case O_TCPFLAGS:
4675 case O_TCPOPTS:
4676 case O_ESTAB:
4677 if (cmdlen != F_INSN_SIZE(ipfw_insn))
4678 goto bad_size;
4679 break;
4681 case O_IP_SRC_TABLE:
4682 case O_IP_DST_TABLE:
4683 if (cmdlen != F_INSN_SIZE(ipfw_insn))
4684 goto bad_size;
4685 if (cmd->arg1 >= ipfw_table_max) {
4686 kprintf("ipfw: invalid table id %u, max %d\n",
4687 cmd->arg1, ipfw_table_max);
4688 return EINVAL;
4690 break;
4692 case O_IP_SRC_IFIP:
4693 case O_IP_DST_IFIP:
4694 if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
4695 goto bad_size;
4696 break;
4698 case O_UID:
4699 case O_GID:
4700 case O_IP_SRC:
4701 case O_IP_DST:
4702 case O_TCPSEQ:
4703 case O_TCPACK:
4704 case O_PROB:
4705 case O_ICMPTYPE:
4706 if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
4707 goto bad_size;
4708 break;
4710 case O_LIMIT:
4711 if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
4712 goto bad_size;
4713 break;
4715 case O_LOG:
4716 if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
4717 goto bad_size;
4719 ((ipfw_insn_log *)cmd)->log_left =
4720 ((ipfw_insn_log *)cmd)->max_log;
4722 break;
4724 case O_IP_SRC_MASK:
4725 case O_IP_DST_MASK:
4726 if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
4727 goto bad_size;
4728 if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
4729 kprintf("ipfw: opcode %d, useless rule\n",
4730 cmd->opcode);
4731 return EINVAL;
4733 break;
4735 case O_IP_SRC_SET:
4736 case O_IP_DST_SET:
4737 if (cmd->arg1 == 0 || cmd->arg1 > 256) {
4738 kprintf("ipfw: invalid set size %d\n",
4739 cmd->arg1);
4740 return EINVAL;
4742 if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
4743 (cmd->arg1+31)/32 )
4744 goto bad_size;
4745 break;
4747 case O_MACADDR2:
4748 if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
4749 goto bad_size;
4750 break;
4752 case O_MAC_TYPE:
4753 case O_IP_SRCPORT:
4754 case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
4755 if (cmdlen < 2 || cmdlen > 31)
4756 goto bad_size;
4757 break;
4759 case O_RECV:
4760 case O_XMIT:
4761 case O_VIA:
4762 if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
4763 goto bad_size;
4764 break;
4766 case O_PIPE:
4767 case O_QUEUE:
4768 if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
4769 goto bad_size;
4770 goto check_action;
4772 case O_FORWARD_IP:
4773 if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
4774 goto bad_size;
4775 } else {
4776 in_addr_t fwd_addr;
4778 fwd_addr = ((ipfw_insn_sa *)cmd)->
4779 sa.sin_addr.s_addr;
4780 if (IN_MULTICAST(ntohl(fwd_addr))) {
4781 kprintf("ipfw: try forwarding to "
4782 "multicast address\n");
4783 return EINVAL;
4786 goto check_action;
4788 case O_FORWARD_MAC: /* XXX not implemented yet */
4789 case O_CHECK_STATE:
4790 case O_COUNT:
4791 case O_ACCEPT:
4792 case O_DENY:
4793 case O_REJECT:
4794 case O_SKIPTO:
4795 case O_DIVERT:
4796 case O_TEE:
4797 case O_DEFRAG:
4798 if (cmdlen != F_INSN_SIZE(ipfw_insn))
4799 goto bad_size;
4800 check_action:
4801 if (have_action) {
4802 kprintf("ipfw: opcode %d, multiple actions"
4803 " not allowed\n",
4804 cmd->opcode);
4805 return EINVAL;
4807 have_action = 1;
4808 if (l != cmdlen) {
4809 kprintf("ipfw: opcode %d, action must be"
4810 " last opcode\n",
4811 cmd->opcode);
4812 return EINVAL;
4814 break;
4815 default:
4816 kprintf("ipfw: opcode %d, unknown opcode\n",
4817 cmd->opcode);
4818 return EINVAL;
4821 if (have_action == 0) {
4822 kprintf("ipfw: missing action\n");
4823 return EINVAL;
4825 return 0;
4827 bad_size:
4828 kprintf("ipfw: opcode %d size %d wrong\n",
4829 cmd->opcode, cmdlen);
4830 return EINVAL;
4833 static int
4834 ipfw_ctl_add_rule(struct sockopt *sopt)
4836 struct ipfw_ioc_rule *ioc_rule;
4837 size_t size;
4838 uint32_t rule_flags;
4839 int error;
4841 ASSERT_NETISR0;
4843 size = sopt->sopt_valsize;
4844 if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
4845 size < sizeof(*ioc_rule)) {
4846 return EINVAL;
4848 if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
4849 sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
4850 IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
4852 ioc_rule = sopt->sopt_val;
4854 error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
4855 if (error)
4856 return error;
4858 ipfw_add_rule(ioc_rule, rule_flags);
4860 if (sopt->sopt_dir == SOPT_GET)
4861 sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
4862 return 0;
4865 static void *
4866 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
4867 struct ipfw_ioc_rule *ioc_rule)
4869 const struct ip_fw *sibling;
4870 #ifdef INVARIANTS
4871 int i;
4872 #endif
4874 ASSERT_NETISR0;
4875 KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
4877 ioc_rule->act_ofs = rule->act_ofs;
4878 ioc_rule->cmd_len = rule->cmd_len;
4879 ioc_rule->rulenum = rule->rulenum;
4880 ioc_rule->set = rule->set;
4881 ioc_rule->usr_flags = rule->usr_flags;
4883 ioc_rule->set_disable = ctx->ipfw_set_disable;
4884 ioc_rule->static_count = static_count;
4885 ioc_rule->static_len = static_ioc_len;
4888 * Visit (read-only) all of the rule's duplications to get
4889 * the necessary statistics
4891 #ifdef INVARIANTS
4892 i = 0;
4893 #endif
4894 ioc_rule->pcnt = 0;
4895 ioc_rule->bcnt = 0;
4896 ioc_rule->timestamp = 0;
4897 for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
4898 ioc_rule->pcnt += sibling->pcnt;
4899 ioc_rule->bcnt += sibling->bcnt;
4900 if (sibling->timestamp > ioc_rule->timestamp)
4901 ioc_rule->timestamp = sibling->timestamp;
4902 #ifdef INVARIANTS
4903 ++i;
4904 #endif
4906 KASSERT(i == netisr_ncpus,
4907 ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
4909 bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);
4911 return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
4914 static boolean_t
4915 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
4917 struct ipfw_ioc_flowid *ioc_id;
4919 if (trk->tc_expire == 0) {
4920 /* Not a scanned one. */
4921 return (FALSE);
4924 ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
4925 0 : trk->tc_expire - time_uptime;
4926 ioc_state->pcnt = 0;
4927 ioc_state->bcnt = 0;
4929 ioc_state->dyn_type = O_LIMIT_PARENT;
4930 ioc_state->count = trk->tc_count;
4932 ioc_state->rulenum = trk->tc_rulenum;
4934 ioc_id = &ioc_state->id;
4935 ioc_id->type = ETHERTYPE_IP;
4936 ioc_id->u.ip.proto = trk->tc_proto;
4937 ioc_id->u.ip.src_ip = trk->tc_saddr;
4938 ioc_id->u.ip.dst_ip = trk->tc_daddr;
4939 ioc_id->u.ip.src_port = trk->tc_sport;
4940 ioc_id->u.ip.dst_port = trk->tc_dport;
4942 return (TRUE);
4945 static boolean_t
4946 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
4948 struct ipfw_ioc_flowid *ioc_id;
4950 if (s->st_type == O_ANCHOR)
4951 return (FALSE);
4953 ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
4954 0 : s->st_expire - time_uptime;
4955 ioc_state->pcnt = s->st_pcnt;
4956 ioc_state->bcnt = s->st_bcnt;
4958 ioc_state->dyn_type = s->st_type;
4959 ioc_state->count = 0;
4961 ioc_state->rulenum = s->st_rule->rulenum;
4963 ioc_id = &ioc_state->id;
4964 ioc_id->type = ETHERTYPE_IP;
4965 ioc_id->u.ip.proto = s->st_proto;
4966 ipfw_key_4tuple(&s->st_key,
4967 &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
4968 &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
4970 return (TRUE);
4973 static void
4974 ipfw_state_copy_dispatch(netmsg_t nmsg)
4976 struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
4977 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4978 const struct ipfw_state *s;
4979 const struct ipfw_track *t;
4981 ASSERT_NETISR_NCPUS(mycpuid);
4982 KASSERT(nm->state_cnt < nm->state_cntmax,
4983 ("invalid state count %d, max %d",
4984 nm->state_cnt, nm->state_cntmax));
4986 TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
4987 if (ipfw_state_copy(s, nm->ioc_state)) {
4988 nm->ioc_state++;
4989 nm->state_cnt++;
4990 if (nm->state_cnt == nm->state_cntmax)
4991 goto done;
4996 * Prepare tracks in the global track tree for userland.
4998 TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
4999 struct ipfw_trkcnt *trk;
5001 if (t->t_count == NULL) /* anchor */
5002 continue;
5003 trk = t->t_trkcnt;
5006 * Only one netisr can run this function at
5007 * any time, and only this function accesses
5008 * trkcnt's tc_expire, so this is safe w/o
5009 * ipfw_gd.ipfw_trkcnt_token.
5011 if (trk->tc_expire > t->t_expire)
5012 continue;
5013 trk->tc_expire = t->t_expire;
5017 * Copy tracks in the global track tree to userland in
5018 * the last netisr.
5020 if (mycpuid == netisr_ncpus - 1) {
5021 struct ipfw_trkcnt *trk;
5023 KASSERT(nm->state_cnt < nm->state_cntmax,
5024 ("invalid state count %d, max %d",
5025 nm->state_cnt, nm->state_cntmax));
5027 IPFW_TRKCNT_TOKGET;
5028 RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
5029 if (ipfw_track_copy(trk, nm->ioc_state)) {
5030 nm->ioc_state++;
5031 nm->state_cnt++;
5032 if (nm->state_cnt == nm->state_cntmax) {
5033 IPFW_TRKCNT_TOKREL;
5034 goto done;
5038 IPFW_TRKCNT_TOKREL;
5040 done:
5041 if (nm->state_cnt == nm->state_cntmax) {
5042 /* No more space; done. */
5043 netisr_replymsg(&nm->base, 0);
5044 } else {
5045 netisr_forwardmsg(&nm->base, mycpuid + 1);
5049 static int
5050 ipfw_ctl_get_rules(struct sockopt *sopt)
5052 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5053 struct ip_fw *rule;
5054 void *bp;
5055 size_t size;
5056 int state_cnt;
5058 ASSERT_NETISR0;
5061 * pass up a copy of the current rules. Static rules
5062 * come first (the last of which has number IPFW_DEFAULT_RULE),
5063 * followed by a possibly empty list of states.
5066 size = static_ioc_len; /* size of static rules */
5069 * Size of the states.
5070 * XXX take tracks as state for userland compat.
5072 state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
5073 state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
5074 size += state_cnt * sizeof(struct ipfw_ioc_state);
5076 if (sopt->sopt_valsize < size) {
5077 /* short length, no need to return incomplete rules */
5078 /* XXX: if superuser, no need to zero buffer */
5079 bzero(sopt->sopt_val, sopt->sopt_valsize);
5080 return 0;
5082 bp = sopt->sopt_val;
5084 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5085 bp = ipfw_copy_rule(ctx, rule, bp);
5087 if (state_cnt) {
5088 struct netmsg_cpstate nm;
5089 #ifdef INVARIANTS
5090 size_t old_size = size;
5091 #endif
5093 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5094 MSGF_PRIORITY, ipfw_state_copy_dispatch);
5095 nm.ioc_state = bp;
5096 nm.state_cntmax = state_cnt;
5097 nm.state_cnt = 0;
5098 netisr_domsg_global(&nm.base);
5101 * The # of states may have shrunk after the snapshot
5102 * of the state count was taken. To give the user a correct
5103 * state count, nm.state_cnt is used to recalculate
5104 * the actual size.
5106 size = static_ioc_len +
5107 (nm.state_cnt * sizeof(struct ipfw_ioc_state));
5108 KKASSERT(size <= old_size);
5111 sopt->sopt_valsize = size;
5112 return 0;
5115 static void
5116 ipfw_set_disable_dispatch(netmsg_t nmsg)
5118 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5120 ASSERT_NETISR_NCPUS(mycpuid);
5122 ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5123 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5126 static void
5127 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5129 struct netmsg_base nmsg;
5130 uint32_t set_disable;
5132 ASSERT_NETISR0;
5134 /* IPFW_DEFAULT_SET is always enabled */
5135 enable |= (1 << IPFW_DEFAULT_SET);
5136 set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5138 bzero(&nmsg, sizeof(nmsg));
5139 netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5140 ipfw_set_disable_dispatch);
5141 nmsg.lmsg.u.ms_result32 = set_disable;
5143 netisr_domsg_global(&nmsg);
5146 static void
5147 ipfw_table_create_dispatch(netmsg_t nm)
5149 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5150 int tblid = nm->lmsg.u.ms_result;
5152 ASSERT_NETISR_NCPUS(mycpuid);
5154 if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5155 rn_cpumaskhead(mycpuid), 32))
5156 panic("ipfw: create table%d failed", tblid);
5158 netisr_forwardmsg(&nm->base, mycpuid + 1);
5161 static int
5162 ipfw_table_create(struct sockopt *sopt)
5164 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5165 struct ipfw_ioc_table *tbl;
5166 struct netmsg_base nm;
5168 ASSERT_NETISR0;
5170 if (sopt->sopt_valsize != sizeof(*tbl))
5171 return (EINVAL);
5173 tbl = sopt->sopt_val;
5174 if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5175 return (EINVAL);
5177 if (ctx->ipfw_tables[tbl->tableid] != NULL)
5178 return (EEXIST);
5180 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5181 ipfw_table_create_dispatch);
5182 nm.lmsg.u.ms_result = tbl->tableid;
5183 netisr_domsg_global(&nm);
5185 return (0);
5188 static void
5189 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5191 struct radix_node *ret;
5193 ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5194 if (ret != rn)
5195 panic("deleted other table entry");
5196 kfree(ret, M_IPFW);
5199 static int
5200 ipfw_table_killent(struct radix_node *rn, void *xrnh)
5203 ipfw_table_killrn(xrnh, rn);
5204 return (0);
5207 static void
5208 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5209 int destroy)
5211 struct radix_node_head *rnh;
5213 ASSERT_NETISR_NCPUS(mycpuid);
5215 rnh = ctx->ipfw_tables[tableid];
5216 rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5217 if (destroy) {
5218 Free(rnh);
5219 ctx->ipfw_tables[tableid] = NULL;
5223 static void
5224 ipfw_table_flush_dispatch(netmsg_t nmsg)
5226 struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
5227 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5229 ASSERT_NETISR_NCPUS(mycpuid);
5231 ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
5232 netisr_forwardmsg(&nm->base, mycpuid + 1);
5235 static void
5236 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
5238 int i;
5240 ASSERT_NETISR_NCPUS(mycpuid);
5242 for (i = 0; i < ipfw_table_max; ++i) {
5243 if (ctx->ipfw_tables[i] != NULL)
5244 ipfw_table_flush_oncpu(ctx, i, destroy);
5248 static void
5249 ipfw_table_flushall_dispatch(netmsg_t nmsg)
5251 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5253 ASSERT_NETISR_NCPUS(mycpuid);
5255 ipfw_table_flushall_oncpu(ctx, 0);
5256 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5259 static int
5260 ipfw_table_flush(struct sockopt *sopt)
5262 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5263 struct ipfw_ioc_table *tbl;
5264 struct netmsg_tblflush nm;
5266 ASSERT_NETISR0;
5268 if (sopt->sopt_valsize != sizeof(*tbl))
5269 return (EINVAL);
5271 tbl = sopt->sopt_val;
5272 if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
5273 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5274 MSGF_PRIORITY, ipfw_table_flushall_dispatch);
5275 netisr_domsg_global(&nm.base);
5276 return (0);
5279 if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5280 return (EINVAL);
5282 if (ctx->ipfw_tables[tbl->tableid] == NULL)
5283 return (ENOENT);
5285 netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5286 ipfw_table_flush_dispatch);
5287 nm.tableid = tbl->tableid;
5288 nm.destroy = 0;
5289 if (sopt->sopt_name == IP_FW_TBL_DESTROY)
5290 nm.destroy = 1;
5291 netisr_domsg_global(&nm.base);
5293 return (0);
5296 static int
5297 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
5299 int *cnt = xcnt;
5301 (*cnt)++;
5302 return (0);
5305 static int
5306 ipfw_table_cpent(struct radix_node *rn, void *xcp)
5308 struct ipfw_table_cp *cp = xcp;
5309 struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5310 struct ipfw_ioc_tblent *ioc_te;
5311 #ifdef INVARIANTS
5312 int cnt;
5313 #endif
5315 KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
5316 cp->te_idx, cp->te_cnt));
5317 ioc_te = &cp->te[cp->te_idx];
5319 if (te->te_nodes->rn_mask != NULL) {
5320 memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
5321 *te->te_nodes->rn_mask);
5322 } else {
5323 ioc_te->netmask.sin_len = 0;
5325 memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));
5327 ioc_te->use = te->te_use;
5328 ioc_te->last_used = te->te_lastuse;
5329 #ifdef INVARIANTS
5330 cnt = 1;
5331 #endif
5333 while ((te = te->te_sibling) != NULL) {
5334 #ifdef INVARIANTS
5335 ++cnt;
5336 #endif
5337 ioc_te->use += te->te_use;
5338 if (te->te_lastuse > ioc_te->last_used)
5339 ioc_te->last_used = te->te_lastuse;
5341 KASSERT(cnt == netisr_ncpus,
5342 ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));
5344 cp->te_idx++;
5346 return (0);
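/*
 * IP_FW_TBL_GET handler.  A negative tableid requests the list of
 * existing table ids (struct ipfw_ioc_tbllist); otherwise the reply is
 * a struct ipfw_ioc_tblcont holding every entry of the given table,
 * with the per-CPU duplicates already merged by ipfw_table_cpent().
 */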
5349 static int
5350 ipfw_table_get(struct sockopt *sopt)
5352 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5353 struct radix_node_head *rnh;
5354 struct ipfw_ioc_table *tbl;
5355 struct ipfw_ioc_tblcont *cont;
5356 struct ipfw_table_cp cp;
5357 int cnt = 0, sz;
5359 ASSERT_NETISR0;
5361 if (sopt->sopt_valsize < sizeof(*tbl))
5362 return (EINVAL);
5364 tbl = sopt->sopt_val;
5365 if (tbl->tableid < 0) {
5366 struct ipfw_ioc_tbllist *list;
5367 int i;
5370 * List available table ids.
5372 for (i = 0; i < ipfw_table_max; ++i) {
5373 if (ctx->ipfw_tables[i] != NULL)
5374 ++cnt;
5377 sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
5378 if (sopt->sopt_valsize < sz) {
5379 bzero(sopt->sopt_val, sopt->sopt_valsize);
5380 return (E2BIG);
5382 list = sopt->sopt_val;
5383 list->tablecnt = cnt;
5385 cnt = 0;
5386 for (i = 0; i < ipfw_table_max; ++i) {
5387 if (ctx->ipfw_tables[i] != NULL) {
5388 KASSERT(cnt < list->tablecnt,
5389 ("invalid idx %d, cnt %d",
5390 cnt, list->tablecnt));
5391 list->tables[cnt++] = i;
5394 sopt->sopt_valsize = sz;
5395 return (0);
5396 } else if (tbl->tableid >= ipfw_table_max) {
5397 return (EINVAL);
5400 rnh = ctx->ipfw_tables[tbl->tableid];
5401 if (rnh == NULL)
5402 return (ENOENT);
5403 rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);
5405 sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
5406 if (sopt->sopt_valsize < sz) {
5407 bzero(sopt->sopt_val, sopt->sopt_valsize);
5408 return (E2BIG);
5410 cont = sopt->sopt_val;
5411 cont->entcnt = cnt;
5413 cp.te = cont->ent;
5414 cp.te_idx = 0;
5415 cp.te_cnt = cnt;
5416 rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);
5418 sopt->sopt_valsize = sz;
5419 return (0);
5422 static void
5423 ipfw_table_add_dispatch(netmsg_t nmsg)
5425 struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
5426 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5427 struct radix_node_head *rnh;
5428 struct ipfw_tblent *te;
5430 ASSERT_NETISR_NCPUS(mycpuid);
5432 rnh = ctx->ipfw_tables[nm->tableid];
5434 te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
5435 te->te_nodes->rn_key = (char *)&te->te_key;
5436 memcpy(&te->te_key, nm->key, sizeof(te->te_key));
5438 if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
5439 te->te_nodes) == NULL) {
5440 if (mycpuid == 0) {
5441 kfree(te, M_IPFW);
5442 netisr_replymsg(&nm->base, EEXIST);
5443 return;
5445 panic("rnh_addaddr failed");
5448 /* Link siblings. */
5449 if (nm->sibling != NULL)
5450 nm->sibling->te_sibling = te;
5451 nm->sibling = te;
5453 netisr_forwardmsg(&nm->base, mycpuid + 1);
5456 static void
5457 ipfw_table_del_dispatch(netmsg_t nmsg)
5459 struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
5460 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5461 struct radix_node_head *rnh;
5462 struct radix_node *rn;
5464 ASSERT_NETISR_NCPUS(mycpuid);
5466 rnh = ctx->ipfw_tables[nm->tableid];
5467 rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
5468 if (rn == NULL) {
5469 if (mycpuid == 0) {
5470 netisr_replymsg(&nm->base, ESRCH);
5471 return;
5473 panic("rnh_deladdr failed");
5475 kfree(rn, M_IPFW);
5477 netisr_forwardmsg(&nm->base, mycpuid + 1);
5480 static int
5481 ipfw_table_alt(struct sockopt *sopt)
5483 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5484 struct ipfw_ioc_tblcont *tbl;
5485 struct ipfw_ioc_tblent *te;
5486 struct sockaddr_in key0;
5487 struct sockaddr *netmask = NULL, *key;
5488 struct netmsg_tblent nm;
5490 ASSERT_NETISR0;
5492 if (sopt->sopt_valsize != sizeof(*tbl))
5493 return (EINVAL);
5494 tbl = sopt->sopt_val;
5496 if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5497 return (EINVAL);
5498 if (tbl->entcnt != 1)
5499 return (EINVAL);
5501 if (ctx->ipfw_tables[tbl->tableid] == NULL)
5502 return (ENOENT);
5503 te = &tbl->ent[0];
5505 if (te->key.sin_family != AF_INET ||
5506 te->key.sin_port != 0 ||
5507 te->key.sin_len != sizeof(struct sockaddr_in))
5508 return (EINVAL);
5509 key = (struct sockaddr *)&te->key;
5511 if (te->netmask.sin_len != 0) {
5512 if (te->netmask.sin_port != 0 ||
5513 te->netmask.sin_len > sizeof(struct sockaddr_in))
5514 return (EINVAL);
5515 netmask = (struct sockaddr *)&te->netmask;
5516 sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
5517 key = (struct sockaddr *)&key0;
5520 if (sopt->sopt_name == IP_FW_TBL_ADD) {
5521 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5522 MSGF_PRIORITY, ipfw_table_add_dispatch);
5523 } else {
5524 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5525 MSGF_PRIORITY, ipfw_table_del_dispatch);
5527 nm.key = key;
5528 nm.netmask = netmask;
5529 nm.tableid = tbl->tableid;
5530 nm.sibling = NULL;
5531 return (netisr_domsg_global(&nm.base));
5534 static int
5535 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
5537 struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5539 te->te_use = 0;
5540 te->te_lastuse = 0;
5541 return (0);
5544 static void
5545 ipfw_table_zero_dispatch(netmsg_t nmsg)
5547 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5548 struct radix_node_head *rnh;
5550 ASSERT_NETISR_NCPUS(mycpuid);
5552 rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
5553 rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
5555 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5558 static void
5559 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
5561 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5562 int i;
5564 ASSERT_NETISR_NCPUS(mycpuid);
5566 for (i = 0; i < ipfw_table_max; ++i) {
5567 struct radix_node_head *rnh = ctx->ipfw_tables[i];
5569 if (rnh != NULL)
5570 rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
5572 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5575 static int
5576 ipfw_table_zero(struct sockopt *sopt)
5578 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5579 struct netmsg_base nm;
5580 struct ipfw_ioc_table *tbl;
5582 ASSERT_NETISR0;
5584 if (sopt->sopt_valsize != sizeof(*tbl))
5585 return (EINVAL);
5586 tbl = sopt->sopt_val;
5588 if (tbl->tableid < 0) {
5589 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5590 ipfw_table_zeroall_dispatch);
5591 netisr_domsg_global(&nm);
5592 return (0);
5593 } else if (tbl->tableid >= ipfw_table_max) {
5594 return (EINVAL);
5595 } else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
5596 return (ENOENT);
5599 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5600 ipfw_table_zero_dispatch);
5601 nm.lmsg.u.ms_result = tbl->tableid;
5602 netisr_domsg_global(&nm);
5604 return (0);
5607 static int
5608 ipfw_table_killexp(struct radix_node *rn, void *xnm)
5610 struct netmsg_tblexp *nm = xnm;
5611 struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5613 if (te->te_expired) {
5614 ipfw_table_killrn(nm->rnh, rn);
5615 nm->expcnt++;
5617 return (0);
5620 static void
5621 ipfw_table_expire_dispatch(netmsg_t nmsg)
5623 struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
5624 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5625 struct radix_node_head *rnh;
5627 ASSERT_NETISR_NCPUS(mycpuid);
5629 rnh = ctx->ipfw_tables[nm->tableid];
5630 nm->rnh = rnh;
5631 rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
5633 KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
5634 ("not all expired addresses (%d) were deleted (%d)",
5635 nm->cnt * (mycpuid + 1), nm->expcnt));
5637 netisr_forwardmsg(&nm->base, mycpuid + 1);
5640 static void
5641 ipfw_table_expireall_dispatch(netmsg_t nmsg)
5643 struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
5644 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5645 int i;
5647 ASSERT_NETISR_NCPUS(mycpuid);
5649 for (i = 0; i < ipfw_table_max; ++i) {
5650 struct radix_node_head *rnh = ctx->ipfw_tables[i];
5652 if (rnh == NULL)
5653 continue;
5654 nm->rnh = rnh;
5655 rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
5658 KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
5659 ("not all expired addresses (%d) were deleted (%d)",
5660 nm->cnt * (mycpuid + 1), nm->expcnt));
5662 netisr_forwardmsg(&nm->base, mycpuid + 1);
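/*
 * rnh_walktree() callback for IP_FW_TBL_EXPIRE: an entry is considered
 * expired only if none of its per-CPU siblings has been used within
 * the last nm->expire seconds.  In that case all siblings are flagged
 * te_expired and the entry is counted once, so the subsequent kill
 * pass can assert that every CPU removed the same set of entries.
 */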
5665 static int
5666 ipfw_table_markexp(struct radix_node *rn, void *xnm)
5668 struct netmsg_tblexp *nm = xnm;
5669 struct ipfw_tblent *te;
5670 time_t lastuse;
5672 te = (struct ipfw_tblent *)rn;
5673 lastuse = te->te_lastuse;
5675 while ((te = te->te_sibling) != NULL) {
5676 if (te->te_lastuse > lastuse)
5677 lastuse = te->te_lastuse;
5679 if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
5680 /* Not expired */
5681 return (0);
5684 te = (struct ipfw_tblent *)rn;
5685 te->te_expired = 1;
5686 while ((te = te->te_sibling) != NULL)
5687 te->te_expired = 1;
5688 nm->cnt++;
5690 return (0);
5693 static int
5694 ipfw_table_expire(struct sockopt *sopt)
5696 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5697 struct netmsg_tblexp nm;
5698 struct ipfw_ioc_tblexp *tbl;
5699 struct radix_node_head *rnh;
5701 ASSERT_NETISR0;
5703 if (sopt->sopt_valsize != sizeof(*tbl))
5704 return (EINVAL);
5705 tbl = sopt->sopt_val;
5706 tbl->expcnt = 0;
5708 nm.expcnt = 0;
5709 nm.cnt = 0;
5710 nm.expire = tbl->expire;
5712 if (tbl->tableid < 0) {
5713 int i;
5715 for (i = 0; i < ipfw_table_max; ++i) {
5716 rnh = ctx->ipfw_tables[i];
5717 if (rnh == NULL)
5718 continue;
5719 rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
5721 if (nm.cnt == 0) {
5722 /* No addresses can be expired. */
5723 return (0);
5725 tbl->expcnt = nm.cnt;
5727 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5728 MSGF_PRIORITY, ipfw_table_expireall_dispatch);
5729 nm.tableid = -1;
5730 netisr_domsg_global(&nm.base);
5731 KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
5732 ("not all expired addresses (%d) were deleted (%d)",
5733 nm.cnt * netisr_ncpus, nm.expcnt));
5735 return (0);
5736 } else if (tbl->tableid >= ipfw_table_max) {
5737 return (EINVAL);
5740 rnh = ctx->ipfw_tables[tbl->tableid];
5741 if (rnh == NULL)
5742 return (ENOENT);
5743 rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
5744 if (nm.cnt == 0) {
5745 /* No addresses can be expired. */
5746 return (0);
5748 tbl->expcnt = nm.cnt;
5750 netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5751 ipfw_table_expire_dispatch);
5752 nm.tableid = tbl->tableid;
5753 netisr_domsg_global(&nm.base);
5754 KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
5755 ("not all expired addresses (%d) were deleted (%d)",
5756 nm.cnt * netisr_ncpus, nm.expcnt));
5757 return (0);
5760 static void
5761 ipfw_crossref_free_dispatch(netmsg_t nmsg)
5763 struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
5765 KKASSERT((rule->rule_flags &
5766 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
5767 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
5768 ipfw_free_rule(rule);
5770 netisr_replymsg(&nmsg->base, 0);
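/*
 * Garbage collect rules parked on the crossref free list.  A rule (and
 * its per-CPU cross_rules[] copies) may only be freed once no mbuf
 * forwarded by the IP_FW_CONTINUE path still holds a cross_refs
 * reference on any CPU; rules still in flight stay on the list and are
 * retried from the crossref callout.
 */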
5773 static void
5774 ipfw_crossref_reap(void)
5776 struct ip_fw *rule, *prev = NULL;
5778 ASSERT_NETISR0;
5780 rule = ipfw_gd.ipfw_crossref_free;
5781 while (rule != NULL) {
5782 uint64_t inflight = 0;
5783 int i;
5785 for (i = 0; i < netisr_ncpus; ++i)
5786 inflight += rule->cross_rules[i]->cross_refs;
5787 if (inflight == 0) {
5788 struct ip_fw *f = rule;
5791 * Unlink.
5793 rule = rule->next;
5794 if (prev != NULL)
5795 prev->next = rule;
5796 else
5797 ipfw_gd.ipfw_crossref_free = rule;
5800 * Free.
5802 for (i = 1; i < netisr_ncpus; ++i) {
5803 struct netmsg_base nm;
5805 netmsg_init(&nm, NULL, &curthread->td_msgport,
5806 MSGF_PRIORITY, ipfw_crossref_free_dispatch);
5807 nm.lmsg.u.ms_resultp = f->cross_rules[i];
5808 netisr_domsg(&nm, i);
5810 KKASSERT((f->rule_flags &
5811 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
5812 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
5813 ipfw_unref_rule(f);
5814 } else {
5815 prev = rule;
5816 rule = rule->next;
5820 if (ipfw_gd.ipfw_crossref_free != NULL) {
5821 callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
5822 ipfw_crossref_timeo, NULL);
5827 * {set|get}sockopt parser.
5829 static int
5830 ipfw_ctl(struct sockopt *sopt)
5832 int error, rulenum;
5833 uint32_t *masks;
5834 size_t size;
5836 ASSERT_NETISR0;
5838 error = 0;
5840 switch (sopt->sopt_name) {
5841 case IP_FW_GET:
5842 error = ipfw_ctl_get_rules(sopt);
5843 break;
5845 case IP_FW_FLUSH:
5846 ipfw_flush(0 /* keep default rule */);
5847 break;
5849 case IP_FW_ADD:
5850 error = ipfw_ctl_add_rule(sopt);
5851 break;
5853 case IP_FW_DEL:
5855 * IP_FW_DEL is used for deleting single rules or sets,
5856 * and (ab)used to atomically manipulate sets.
5857 * Argument size is used to distinguish between the two:
5858 * sizeof(uint32_t)
5859 * delete single rule or set of rules,
5860 * or reassign rules (or sets) to a different set.
5861 * 2 * sizeof(uint32_t)
5862 * atomic disable/enable sets.
5863 * first uint32_t contains sets to be disabled,
5864 * second uint32_t contains sets to be enabled.
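 *
 * For example (userland sketch, assuming the usual raw IP socket
 * used by ipfw(8)):
 *
 *	uint32_t v = 100;
 *	setsockopt(s, IPPROTO_IP, IP_FW_DEL, &v, sizeof(v));
 *
 * deletes rule 100, while
 *
 *	uint32_t v2[2] = { 1 << 5, 1 << 3 };
 *	setsockopt(s, IPPROTO_IP, IP_FW_DEL, v2, sizeof(v2));
 *
 * atomically disables set 5 and enables set 3.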
5866 masks = sopt->sopt_val;
5867 size = sopt->sopt_valsize;
5868 if (size == sizeof(*masks)) {
5870 * Delete or reassign static rule
5872 error = ipfw_ctl_alter(masks[0]);
5873 } else if (size == (2 * sizeof(*masks))) {
5875 * Set enable/disable
5877 ipfw_ctl_set_disable(masks[0], masks[1]);
5878 } else {
5879 error = EINVAL;
5881 break;
5883 case IP_FW_ZERO:
5884 case IP_FW_RESETLOG: /* argument is an int, the rule number */
5885 rulenum = 0;
5887 if (sopt->sopt_val != 0) {
5888 error = soopt_to_kbuf(sopt, &rulenum,
5889 sizeof(int), sizeof(int));
5890 if (error)
5891 break;
5893 error = ipfw_ctl_zero_entry(rulenum,
5894 sopt->sopt_name == IP_FW_RESETLOG);
5895 break;
5897 case IP_FW_TBL_CREATE:
5898 error = ipfw_table_create(sopt);
5899 break;
5901 case IP_FW_TBL_ADD:
5902 case IP_FW_TBL_DEL:
5903 error = ipfw_table_alt(sopt);
5904 break;
5906 case IP_FW_TBL_FLUSH:
5907 case IP_FW_TBL_DESTROY:
5908 error = ipfw_table_flush(sopt);
5909 break;
5911 case IP_FW_TBL_GET:
5912 error = ipfw_table_get(sopt);
5913 break;
5915 case IP_FW_TBL_ZERO:
5916 error = ipfw_table_zero(sopt);
5917 break;
5919 case IP_FW_TBL_EXPIRE:
5920 error = ipfw_table_expire(sopt);
5921 break;
5923 default:
5924 kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
5925 error = EINVAL;
5928 ipfw_crossref_reap();
5929 return error;
5932 static void
5933 ipfw_keepalive_done(struct ipfw_context *ctx)
5936 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5937 ("keepalive is not in progress"));
5938 ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
5939 callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
5940 ipfw_keepalive, NULL);
5943 static void
5944 ipfw_keepalive_more(struct ipfw_context *ctx)
5946 struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
5948 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5949 ("keepalive is not in progress"));
5950 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
5951 ("keepalive more did not finish"));
5952 netisr_sendmsg_oncpu(nm);
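/*
 * Walk the state list in bounded batches.  The caller-supplied 'anchor'
 * marker records our position in the list; whenever one of the
 * scan/expire/keepalive limits is hit, the walk reschedules itself via
 * ipfw_keepalive_more() and resumes right after the anchor on the next
 * pass, so a large state table cannot monopolize the netisr thread.
 */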
5955 static void
5956 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
5958 struct ipfw_state *s;
5959 int scanned = 0, expired = 0, kept = 0;
5961 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5962 ("keepalive is not in progress"));
5964 while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
5965 uint32_t ack_rev, ack_fwd;
5966 struct ipfw_flow_id id;
5968 if (scanned++ >= ipfw_state_scan_max) {
5969 ipfw_keepalive_more(ctx);
5970 return;
5973 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
5974 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
5976 if (s->st_type == O_ANCHOR)
5977 continue;
5979 if (TIME_LEQ(s->st_expire, time_uptime)) {
5980 /* State expired. */
5981 ipfw_state_del(ctx, s);
5982 if (++expired >= ipfw_state_expire_max) {
5983 ipfw_keepalive_more(ctx);
5984 return;
5986 continue;
5990 * Keep alive processing
5993 if (s->st_proto != IPPROTO_TCP)
5994 continue;
5995 if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
5996 continue;
5997 if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
5998 s->st_expire))
5999 continue; /* too early */
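/*
 * Send one keepalive segment in each direction.  The intent (as
 * with RFC 1122 style keepalives) is that an already acknowledged
 * sequence number (ack - 1) makes each endpoint answer with a bare
 * ACK; those ACKs pass back through ipfw_chk() and refresh this
 * state entry.
 */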
6001 ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6002 &id.dst_ip, &id.dst_port);
6003 ack_rev = s->st_ack_rev;
6004 ack_fwd = s->st_ack_fwd;
6006 send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6007 send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6009 if (++kept >= ipfw_keepalive_max) {
6010 ipfw_keepalive_more(ctx);
6011 return;
6014 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6015 ipfw_keepalive_done(ctx);
6018 static void
6019 ipfw_keepalive_more_dispatch(netmsg_t nm)
6021 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6022 struct ipfw_state *anchor;
6024 ASSERT_NETISR_NCPUS(mycpuid);
6025 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6026 ("keepalive is not in progress"));
6028 /* Reply ASAP */
6029 netisr_replymsg(&nm->base, 0);
6031 anchor = &ctx->ipfw_keepalive_anch;
6032 if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6033 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6034 ipfw_keepalive_done(ctx);
6035 return;
6037 ipfw_keepalive_loop(ctx, anchor);
6041 * Netmsg handler that starts one keepalive scan pass on this CPU;
6042 * it is triggered from the ipfw_keepalive() callout below.
6044 static void
6045 ipfw_keepalive_dispatch(netmsg_t nm)
6047 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6048 struct ipfw_state *anchor;
6050 ASSERT_NETISR_NCPUS(mycpuid);
6051 KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6052 ("keepalive is in progress"));
6053 ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6055 /* Reply ASAP */
6056 crit_enter();
6057 netisr_replymsg(&nm->base, 0);
6058 crit_exit();
6060 if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6061 ipfw_keepalive_done(ctx);
6062 return;
6065 anchor = &ctx->ipfw_keepalive_anch;
6066 TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6067 ipfw_keepalive_loop(ctx, anchor);
6071 * This procedure is only used to handle keepalives. It is invoked
6072 * every dyn_keepalive_period
6074 static void
6075 ipfw_keepalive(void *dummy __unused)
6077 struct netmsg_base *msg;
6079 KKASSERT(mycpuid < netisr_ncpus);
6080 msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6082 crit_enter();
6083 if (msg->lmsg.ms_flags & MSGF_DONE)
6084 netisr_sendmsg_oncpu(msg);
6085 crit_exit();
6088 static void
6089 ipfw_ip_input_dispatch(netmsg_t nmsg)
6091 struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6092 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6093 struct mbuf *m = nm->m;
6094 struct ip_fw *rule = nm->arg1;
6096 ASSERT_NETISR_NCPUS(mycpuid);
6097 KASSERT(rule->cpuid == mycpuid,
6098 ("rule does not belong to cpu%d", mycpuid));
6099 KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6100 ("mbuf does not have ipfw continue rule"));
6102 KASSERT(ctx->ipfw_cont_rule == NULL,
6103 ("pending ipfw continue rule"));
6104 ctx->ipfw_cont_rule = rule;
6105 ip_input(m);
6108 * This rule is no longer used; decrement its cross_refs,
6109 * so this rule can be deleted.
6111 rule->cross_refs--;
6113 /* May not have been cleared if ipfw was unloaded/disabled. */
6114 ctx->ipfw_cont_rule = NULL;
6117 static int
6118 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6120 struct ip_fw_args args;
6121 struct mbuf *m = *m0;
6122 struct m_tag *mtag;
6123 int tee = 0, error = 0, ret, cpuid;
6124 struct netmsg_genpkt *nm;
6126 args.cont = 0;
6127 if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6128 /* Extract info from dummynet tag */
6129 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6130 KKASSERT(mtag != NULL);
6131 args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6132 KKASSERT(args.rule != NULL);
6134 m_tag_delete(m, mtag);
6135 m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6136 } else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6137 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6139 KKASSERT(ctx->ipfw_cont_rule != NULL);
6140 args.rule = ctx->ipfw_cont_rule;
6141 ctx->ipfw_cont_rule = NULL;
6143 args.cont = 1;
6144 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6145 } else {
6146 args.rule = NULL;
6149 args.eh = NULL;
6150 args.oif = NULL;
6151 args.m = m;
6152 ret = ipfw_chk(&args);
6153 m = args.m;
6155 if (m == NULL) {
6156 error = EACCES;
6157 goto back;
6160 switch (ret) {
6161 case IP_FW_PASS:
6162 break;
6164 case IP_FW_DENY:
6165 m_freem(m);
6166 m = NULL;
6167 error = EACCES;
6168 break;
6170 case IP_FW_DUMMYNET:
6171 /* Send packet to the appropriate pipe */
6172 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
6173 break;
6175 case IP_FW_TEE:
6176 tee = 1;
6177 /* FALL THROUGH */
6179 case IP_FW_DIVERT:
6181 * Must clear the bridge tag when the packet changes its path (divert/tee).
6183 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
6184 if (ip_divert_p != NULL) {
6185 m = ip_divert_p(m, tee, 1);
6186 } else {
6187 m_freem(m);
6188 m = NULL;
6189 /* not sure this is the right error msg */
6190 error = EACCES;
6192 break;
6194 case IP_FW_CONTINUE:
6195 KASSERT(m->m_flags & M_HASH, ("no hash"));
6196 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
6197 KASSERT(cpuid != mycpuid,
6198 ("continue on the same cpu%d", cpuid));
6201 * NOTE:
6202 * Bump cross_refs to prevent this rule and its siblings
6203 * from being deleted, while this mbuf is inflight. The
6204 * cross_refs of the sibling rule on the target cpu will
6205 * be decremented, once this mbuf is going to be filtered
6206 * on the target cpu.
6208 args.rule->cross_refs++;
6209 m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6211 nm = &m->m_hdr.mh_genmsg;
6212 netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6213 ipfw_ip_input_dispatch);
6214 nm->m = m;
6215 nm->arg1 = args.rule->cross_rules[cpuid];
6216 netisr_sendmsg(&nm->base, cpuid);
6218 /* This mbuf is dispatched; no longer valid. */
6219 m = NULL;
6220 break;
6222 default:
6223 panic("unknown ipfw return value: %d", ret);
6225 back:
6226 *m0 = m;
6227 return error;
6230 static int
6231 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6233 struct ip_fw_args args;
6234 struct mbuf *m = *m0;
6235 struct m_tag *mtag;
6236 int tee = 0, error = 0, ret;
6238 args.cont = 0;
6239 if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6240 /* Extract info from dummynet tag */
6241 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6242 KKASSERT(mtag != NULL);
6243 args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6244 KKASSERT(args.rule != NULL);
6246 m_tag_delete(m, mtag);
6247 m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6248 } else {
6249 args.rule = NULL;
6252 args.eh = NULL;
6253 args.m = m;
6254 args.oif = ifp;
6255 ret = ipfw_chk(&args);
6256 m = args.m;
6258 if (m == NULL) {
6259 error = EACCES;
6260 goto back;
6263 switch (ret) {
6264 case IP_FW_PASS:
6265 break;
6267 case IP_FW_DENY:
6268 m_freem(m);
6269 m = NULL;
6270 error = EACCES;
6271 break;
6273 case IP_FW_DUMMYNET:
6274 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
6275 break;
6277 case IP_FW_TEE:
6278 tee = 1;
6279 /* FALL THROUGH */
6281 case IP_FW_DIVERT:
6282 if (ip_divert_p != NULL) {
6283 m = ip_divert_p(m, tee, 0);
6284 } else {
6285 m_freem(m);
6286 m = NULL;
6287 /* not sure this is the right error msg */
6288 error = EACCES;
6290 break;
6292 default:
6293 panic("unknown ipfw return value: %d", ret);
6295 back:
6296 *m0 = m;
6297 return error;
6300 static void
6301 ipfw_hook(void)
6303 struct pfil_head *pfh;
6305 ASSERT_NETISR0;
6307 pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
6308 if (pfh == NULL)
6309 return;
6311 pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
6312 pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
6315 static void
6316 ipfw_dehook(void)
6318 struct pfil_head *pfh;
6320 ASSERT_NETISR0;
6322 pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
6323 if (pfh == NULL)
6324 return;
6326 pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
6327 pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
6330 static int
6331 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
6333 int dyn_cnt;
6335 dyn_cnt = ipfw_state_cntcoll();
6336 dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
6338 return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
6341 static int
6342 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
6344 int state_cnt;
6346 state_cnt = ipfw_state_cntcoll();
6347 return (sysctl_handle_int(oidp, &state_cnt, 0, req));
6350 static int
6351 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
6353 int state_max, error;
6355 state_max = ipfw_state_max;
6356 error = sysctl_handle_int(oidp, &state_max, 0, req);
6357 if (error || req->newptr == NULL)
6358 return (error);
6360 if (state_max < 1)
6361 return (EINVAL);
6363 ipfw_state_max_set(state_max);
6364 return (0);
6367 static int
6368 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
6370 int dyn_max, error;
6372 dyn_max = ipfw_state_max + ipfw_track_max;
6374 error = sysctl_handle_int(oidp, &dyn_max, 0, req);
6375 if (error || req->newptr == NULL)
6376 return (error);
6378 if (dyn_max < 2)
6379 return (EINVAL);
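/*
 * The combined dynamic limit historically covers both states and
 * tracks; split the new value evenly between the two.
 */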
6381 ipfw_state_max_set(dyn_max / 2);
6382 ipfw_track_max = dyn_max / 2;
6383 return (0);
6386 static void
6387 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
6389 int enable = nmsg->lmsg.u.ms_result;
6391 ASSERT_NETISR0;
6393 if (fw_enable == enable)
6394 goto reply;
6396 fw_enable = enable;
6397 if (fw_enable)
6398 ipfw_hook();
6399 else
6400 ipfw_dehook();
6401 reply:
6402 netisr_replymsg(&nmsg->base, 0);
6405 static int
6406 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
6408 struct netmsg_base nmsg;
6409 int enable, error;
6411 enable = fw_enable;
6412 error = sysctl_handle_int(oidp, &enable, 0, req);
6413 if (error || req->newptr == NULL)
6414 return error;
6416 netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6417 ipfw_sysctl_enable_dispatch);
6418 nmsg.lmsg.u.ms_result = enable;
6420 return netisr_domsg(&nmsg, 0);
6423 static int
6424 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
6426 return sysctl_int_range(oidp, arg1, arg2, req,
6427 IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
6430 static int
6431 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
6434 return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
6437 static int
6438 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
6440 u_long stat = 0;
6441 int cpu, error;
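/*
 * arg2 is the byte offset of a per-CPU u_long statistic within
 * struct ipfw_context; sum it across all netisr CPUs and, if a new
 * value was written, zero each per-CPU counter.
 */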
6443 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6444 stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
6446 error = sysctl_handle_long(oidp, &stat, 0, req);
6447 if (error || req->newptr == NULL)
6448 return (error);
6450 /* Zero out this stat. */
6451 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6452 *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
6453 return (0);
6456 static void
6457 ipfw_ctx_init_dispatch(netmsg_t nmsg)
6459 struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
6460 struct ipfw_context *ctx;
6461 struct ip_fw *def_rule;
6463 ASSERT_NETISR_NCPUS(mycpuid);
6465 ctx = kmalloc(__offsetof(struct ipfw_context,
6466 ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
6468 RB_INIT(&ctx->ipfw_state_tree);
6469 TAILQ_INIT(&ctx->ipfw_state_list);
6471 RB_INIT(&ctx->ipfw_track_tree);
6472 TAILQ_INIT(&ctx->ipfw_track_list);
6474 callout_init_mp(&ctx->ipfw_stateto_ch);
6475 netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
6476 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
6477 ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
6478 netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
6479 MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
6481 callout_init_mp(&ctx->ipfw_trackto_ch);
6482 netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
6483 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
6484 netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
6485 MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
6487 callout_init_mp(&ctx->ipfw_keepalive_ch);
6488 netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
6489 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
6490 ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
6491 netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
6492 MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
6494 ipfw_ctx[mycpuid] = ctx;
6496 def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
6498 def_rule->act_ofs = 0;
6499 def_rule->rulenum = IPFW_DEFAULT_RULE;
6500 def_rule->cmd_len = 1;
6501 def_rule->set = IPFW_DEFAULT_SET;
6503 def_rule->cmd[0].len = 1;
6504 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
6505 def_rule->cmd[0].opcode = O_ACCEPT;
6506 #else
6507 if (filters_default_to_accept)
6508 def_rule->cmd[0].opcode = O_ACCEPT;
6509 else
6510 def_rule->cmd[0].opcode = O_DENY;
6511 #endif
6513 def_rule->refcnt = 1;
6514 def_rule->cpuid = mycpuid;
6516 /* Install the default rule */
6517 ctx->ipfw_default_rule = def_rule;
6518 ctx->ipfw_layer3_chain = def_rule;
6520 /* Link rule CPU sibling */
6521 ipfw_link_sibling(fwmsg, def_rule);
6523 /* Statistics only need to be updated once */
6524 if (mycpuid == 0)
6525 ipfw_inc_static_count(def_rule);
6527 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6530 static void
6531 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
6534 crit_enter();
6535 /* Reply ASAP */
6536 netisr_replymsg(&nmsg->base, 0);
6537 crit_exit();
6538 ipfw_crossref_reap();
6541 static void
6542 ipfw_crossref_timeo(void *dummy __unused)
6544 struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
6546 KKASSERT(mycpuid == 0);
6548 crit_enter();
6549 if (msg->lmsg.ms_flags & MSGF_DONE)
6550 netisr_sendmsg_oncpu(msg);
6551 crit_exit();
6554 static void
6555 ipfw_ifaddr_dispatch(netmsg_t nmsg)
6557 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6558 struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
6559 struct ip_fw *f;
6561 ASSERT_NETISR_NCPUS(mycpuid);
6563 for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
6564 int l, cmdlen;
6565 ipfw_insn *cmd;
6567 if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
6568 continue;
6570 for (l = f->cmd_len, cmd = f->cmd; l > 0;
6571 l -= cmdlen, cmd += cmdlen) {
6572 cmdlen = F_LEN(cmd);
6573 if (cmd->opcode == O_IP_SRC_IFIP ||
6574 cmd->opcode == O_IP_DST_IFIP) {
6575 if (strncmp(ifp->if_xname,
6576 ((ipfw_insn_ifip *)cmd)->ifname,
6577 IFNAMSIZ) == 0)
6578 cmd->arg1 &= ~IPFW_IFIP_VALID;
6582 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6585 static void
6586 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
6587 enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
6589 struct netmsg_base nm;
6591 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6592 ipfw_ifaddr_dispatch);
6593 nm.lmsg.u.ms_resultp = ifp;
6594 netisr_domsg_global(&nm);
6595 }
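/*
 * Module initialization, serialized by netisr0: clamp ipfw_table_max,
 * set up the global track tree and the crossref GC, replicate the
 * per-CPU contexts and default rule to all netisr CPUs, install the
 * ip_fw function pointers, start the per-CPU expire and keepalive
 * callouts, hook the firewall if fw_enable is set, and register the
 * ifaddr event handler.
 */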
6597 static void
6598 ipfw_init_dispatch(netmsg_t nmsg)
6599 {
6600 struct netmsg_ipfw fwmsg;
6601 int error = 0, cpu;
6603 ASSERT_NETISR0;
6605 if (IPFW_LOADED) {
6606 kprintf("IP firewall already loaded\n");
6607 error = EEXIST;
6608 goto reply;
6609 }
6611 if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
6612 ipfw_table_max = UINT16_MAX;
6614 /* Initialize global track tree. */
6615 RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
6616 IPFW_TRKCNT_TOKINIT;
6618 /* GC for freed crossref rules. */
6619 callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
6620 netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
6621 MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);
6623 ipfw_state_max_set(ipfw_state_max);
6624 ipfw_state_headroom = 8 * netisr_ncpus;
6626 bzero(&fwmsg, sizeof(fwmsg));
6627 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6628 ipfw_ctx_init_dispatch);
6629 netisr_domsg_global(&fwmsg.base);
6631 ip_fw_chk_ptr = ipfw_chk;
6632 ip_fw_ctl_ptr = ipfw_ctl;
6633 ip_fw_dn_io_ptr = ipfw_dummynet_io;
6635 kprintf("ipfw2 initialized, default to %s, logging ",
6636 ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
6637 O_ACCEPT ? "accept" : "deny");
6639 #ifdef IPFIREWALL_VERBOSE
6640 fw_verbose = 1;
6641 #endif
6642 #ifdef IPFIREWALL_VERBOSE_LIMIT
6643 verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
6644 #endif
6645 if (fw_verbose == 0) {
6646 kprintf("disabled\n");
6647 } else if (verbose_limit == 0) {
6648 kprintf("unlimited\n");
6649 } else {
6650 kprintf("limited to %d packets/entry by default\n",
6651 verbose_limit);
6652 }
6654 ip_fw_loaded = 1;
6655 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
6656 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
6657 ipfw_state_expire_ipifunc, NULL, cpu);
6658 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
6659 ipfw_track_expire_ipifunc, NULL, cpu);
6660 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
6661 ipfw_keepalive, NULL, cpu);
6662 }
6664 if (fw_enable)
6665 ipfw_hook();
6667 ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
6668 NULL, EVENTHANDLER_PRI_ANY);
6669 if (ipfw_ifaddr_event == NULL)
6670 kprintf("ipfw: ifaddr_event register failed\n");
6672 reply:
6673 netisr_replymsg(&nmsg->base, error);
6674 }
6676 static int
6677 ipfw_init(void)
6678 {
6679 struct netmsg_base smsg;
6681 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6682 ipfw_init_dispatch);
6683 return netisr_domsg(&smsg, 0);
6684 }
6686 #ifdef KLD_MODULE
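/*
 * Per-CPU teardown: stop the expire and keepalive callouts, drop the
 * pending dropable netmsgs and flush this CPU's lookup tables.
 */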
6688 static void
6689 ipfw_ctx_fini_dispatch(netmsg_t nmsg)
6690 {
6691 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6693 ASSERT_NETISR_NCPUS(mycpuid);
6695 callout_stop_sync(&ctx->ipfw_stateto_ch);
6696 callout_stop_sync(&ctx->ipfw_trackto_ch);
6697 callout_stop_sync(&ctx->ipfw_keepalive_ch);
6699 crit_enter();
6700 netisr_dropmsg(&ctx->ipfw_stateexp_more);
6701 netisr_dropmsg(&ctx->ipfw_stateexp_nm);
6702 netisr_dropmsg(&ctx->ipfw_trackexp_more);
6703 netisr_dropmsg(&ctx->ipfw_trackexp_nm);
6704 netisr_dropmsg(&ctx->ipfw_keepalive_more);
6705 netisr_dropmsg(&ctx->ipfw_keepalive_nm);
6706 crit_exit();
6708 ipfw_table_flushall_oncpu(ctx, 1);
6710 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6711 }
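/*
 * Module unload, serialized by netisr0: bail out with EBUSY while
 * ipfw_gd.ipfw_refcnt is still non-zero; otherwise unhook the firewall,
 * wait for inflight IPIs, tear down the per-CPU contexts, deregister
 * the ifaddr event handler, clear the ip_fw function pointers, flush
 * all rules and free the per-CPU contexts.
 */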
6713 static void
6714 ipfw_fini_dispatch(netmsg_t nmsg)
6715 {
6716 struct netmsg_base nm;
6717 int error = 0, cpu;
6719 ASSERT_NETISR0;
6721 ipfw_crossref_reap();
6723 if (ipfw_gd.ipfw_refcnt != 0) {
6724 error = EBUSY;
6725 goto reply;
6726 }
6728 ip_fw_loaded = 0;
6729 ipfw_dehook();
6731 /* Synchronize any inflight state/track expire IPIs. */
6732 lwkt_synchronize_ipiqs("ipfwfini");
6734 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6735 ipfw_ctx_fini_dispatch);
6736 netisr_domsg_global(&nm);
6738 callout_stop_sync(&ipfw_gd.ipfw_crossref_ch);
6739 crit_enter();
6740 netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
6741 crit_exit();
6743 if (ipfw_ifaddr_event != NULL)
6744 EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);
6746 ip_fw_chk_ptr = NULL;
6747 ip_fw_ctl_ptr = NULL;
6748 ip_fw_dn_io_ptr = NULL;
6749 ipfw_flush(1 /* kill default rule */);
6751 /* Free the per-CPU contexts */
6752 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6753 kfree(ipfw_ctx[cpu], M_IPFW);
6755 kprintf("IP firewall unloaded\n");
6756 reply:
6757 netisr_replymsg(&nmsg->base, error);
6758 }
6760 static int
6761 ipfw_fini(void)
6762 {
6763 struct netmsg_base smsg;
6765 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6766 ipfw_fini_dispatch);
6767 return netisr_domsg(&smsg, 0);
6768 }
6770 #endif /* KLD_MODULE */
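/*
 * Module event handler: initialize on MOD_LOAD; MOD_UNLOAD is only
 * honored when ipfw is built as a KLD.
 */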
6772 static int
6773 ipfw_modevent(module_t mod, int type, void *unused)
6774 {
6775 int err = 0;
6777 switch (type) {
6778 case MOD_LOAD:
6779 err = ipfw_init();
6780 break;
6782 case MOD_UNLOAD:
6783 #ifndef KLD_MODULE
6784 kprintf("ipfw statically compiled, cannot unload\n");
6785 err = EBUSY;
6786 #else
6787 err = ipfw_fini();
6788 #endif
6789 break;
6790 default:
6791 break;
6792 }
6793 return err;
6794 }
6796 static moduledata_t ipfwmod = {
6797 "ipfw",
6798 ipfw_modevent,
6800 };
6801 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
6802 MODULE_VERSION(ipfw, 1);