ipfw: Add interface network filter.
[dragonfly.git] / sys / net / ipfw / ip_fw2.c
1 /*
2 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
25 * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
29 * Implement IP packet firewall (new version)
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
59 #include <sys/thread2.h>
60 #include <sys/mplock2.h>
61 #include <net/netmsg2.h>
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/udp.h>
76 #include <netinet/udp_var.h>
77 #include <netinet/ip_divert.h>
78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
80 #include <net/ipfw/ip_fw2.h>
82 #ifdef IPFIREWALL_DEBUG
83 #define DPRINTF(fmt, ...) \
84 do { \
85 if (fw_debug > 0) \
86 kprintf(fmt, __VA_ARGS__); \
87 } while (0)
88 #else
89 #define DPRINTF(fmt, ...) ((void)0)
90 #endif
93 * Description of per-CPU rule duplication:
95 * Module loading/unloading and all ioctl operations are serialized
96 * by netisr0, so we don't have any ordering or locking problems.
98 * The following graph shows how operations on the per-CPU rule list
99 * are performed [2 CPU case]:
101 * CPU0 CPU1
103 * netisr0 <------------------------------------+
104 * domsg |
105 * : |
106 * :(delete/add...) |
107 * : |
108 * : netmsg | netmsg
109 * forwardmsg---------->netisr1 |
110 * : |
111 * :(delete/add...) |
112 * : |
113 * : |
114 * replymsg--------------+
118 * Rule structure [2 CPU case]
120 * CPU0 CPU1
122 * layer3_chain layer3_chain
123 * | |
124 * V V
125 * +-------+ sibling +-------+ sibling
126 * | rule1 |--------->| rule1 |--------->NULL
127 * +-------+ +-------+
128 * | |
129 * |next |next
130 * V V
131 * +-------+ sibling +-------+ sibling
132 * | rule2 |--------->| rule2 |--------->NULL
133 * +-------+ +-------+
135 * ip_fw.sibling:
136 * 1) Ease statistics calculation during IP_FW_GET. We only need to
137 * iterate layer3_chain in netisr0; the current rule's duplicates
138 * on the other CPUs can safely be accessed read-only through
139 * ip_fw.sibling.
140 * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
141 * a) In netisr0 rule3 is determined to be inserted between rule1
142 * and rule2. To make this decision we need to iterate the
143 * layer3_chain in netisr0. The netmsg, which is used to insert
144 * the rule, will contain rule1 in netisr0 as prev_rule and rule2
145 * in netisr0 as next_rule.
146 * b) After the insertion in netisr0 is done, we will move on to
147 * netisr1. But instead of relocating the rule3's position in
148 * netisr1 by iterating the layer3_chain in netisr1, we set the
149 * netmsg's prev_rule to rule1->sibling and next_rule to
150 * rule2->sibling before the netmsg is forwarded to netisr1 from
151 * netisr0.
155 * Description of states and tracks.
157 * Both states and tracks are stored in per-cpu RB trees instead of
158 * per-cpu hash tables to avoid the worst case hash degeneration.
160 * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161 * measured in seconds and depending on the flags.
163 * When a packet is received, its address fields are first masked with
164 * the mask defined for the rule, then matched against the entries in
165 * the per-cpu state RB tree. States are generated by 'keep-state'
166 * and 'limit' options.
168 * The max number of states is ipfw_state_max. When we reach the
169 * maximum number of states we do not create any more. This is done
170 * to avoid consuming too much memory, and also too much time when
171 * searching on each packet.
173 * Each state holds a pointer to the parent ipfw rule of the current
174 * CPU so we know what action to perform. States are removed when the
175 * parent rule is deleted. XXX we should make them survive.
177 * There are some limitations with states -- we do not obey the
178 * 'randomized match', and we do not do multiple passes through the
179 * firewall. XXX check the latter!!!
181 * States grow independently on each CPU, e.g. 2 CPU case:
183 * CPU0 CPU1
184 * ................... ...................
185 * : state RB tree : : state RB tree :
186 * : : : :
187 * : state1 state2 : : state3 :
188 * : | | : : | :
189 * :.....|....|......: :........|........:
190 * | | |
191 * | | |st_rule
192 * | | |
193 * V V V
194 * +-------+ +-------+
195 * | rule1 | | rule1 |
196 * +-------+ +-------+
198 * Tracks are used to enforce limits on the number of sessions. Tracks
199 * are generated by the 'limit' option.
201 * The max number of tracks is ipfw_track_max. When we reach the
202 * maximum number of tracks we do not create any more. This is done to
203 * avoid consuming too much memory.
205 * Tracks are organized into two layers: the track counter RB tree is
206 * shared between CPUs, while the track RB tree is per-cpu. States
207 * generated by the 'limit' option are linked to the track in addition
208 * to the per-cpu state RB tree, mainly to ease expiration. e.g. 2 CPU case:
210 * ..............................
211 * : track counter RB tree :
212 * : :
213 * : +-----------+ :
214 * : | trkcnt1 | :
215 * : | | :
216 * : +--->counter<----+ :
217 * : | | | | :
218 * : | +-----------+ | :
219 * :......|................|....:
220 * | |
221 * CPU0 | | CPU1
222 * ................. |t_count | .................
223 * : track RB tree : | | : track RB tree :
224 * : : | | : :
225 * : +-->track1-------+ +--------track2 :
226 * : | A : : :
227 * : | | : : :
228 * :.|.....|.......: :...............:
229 * | +----------------+
230 * | .................... |
231 * | : state RB tree : |st_track
232 * | : : |
233 * +---state1 state2---+
234 * : | | :
235 * :.....|.......|....:
236 * | |
237 * | |st_rule
238 * V V
239 * +----------+
240 * | rule1 |
241 * +----------+
244 #define IPFW_AUTOINC_STEP_MIN 1
245 #define IPFW_AUTOINC_STEP_MAX 1000
246 #define IPFW_AUTOINC_STEP_DEF 100
248 #define IPFW_TABLE_MAX_DEF 64
250 #define IPFW_DEFAULT_RULE 65535 /* rulenum for the default rule */
251 #define IPFW_DEFAULT_SET 31 /* set number for the default rule */
253 #define MATCH_REVERSE 0
254 #define MATCH_FORWARD 1
255 #define MATCH_NONE 2
256 #define MATCH_UNKNOWN 3
258 #define IPFW_STATE_TCPFLAGS (TH_SYN | TH_FIN | TH_RST)
259 #define IPFW_STATE_TCPSTATES (IPFW_STATE_TCPFLAGS | \
260 (IPFW_STATE_TCPFLAGS << 8))
262 #define BOTH_SYN (TH_SYN | (TH_SYN << 8))
263 #define BOTH_FIN (TH_FIN | (TH_FIN << 8))
264 #define BOTH_RST (TH_RST | (TH_RST << 8))
265 /* TH_ACK here means FIN was ACKed. */
266 #define BOTH_FINACK (TH_ACK | (TH_ACK << 8))
268 #define IPFW_STATE_TCPCLOSED(s) ((s)->st_proto == IPPROTO_TCP && \
269 (((s)->st_state & BOTH_RST) || \
270 ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
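/*
 * O_ANCHOR marks the dummy anchor states (e.g. ipfw_stateexp_anch and
 * ipfw_keepalive_anch) that are threaded onto the per-cpu state list
 * while an expiration or keepalive scan is in progress; real states
 * never use this type and the scan loops simply skip such entries.
 */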
272 #define O_ANCHOR O_NOP
274 struct netmsg_ipfw {
275 struct netmsg_base base;
276 const struct ipfw_ioc_rule *ioc_rule;
277 struct ip_fw *next_rule;
278 struct ip_fw *prev_rule;
279 struct ip_fw *sibling;
280 uint32_t rule_flags;
281 struct ip_fw **cross_rules;
284 struct netmsg_del {
285 struct netmsg_base base;
286 struct ip_fw *start_rule;
287 struct ip_fw *prev_rule;
288 uint16_t rulenum;
289 uint8_t from_set;
290 uint8_t to_set;
293 struct netmsg_zent {
294 struct netmsg_base base;
295 struct ip_fw *start_rule;
296 uint16_t rulenum;
297 uint16_t log_only;
300 struct netmsg_cpstate {
301 struct netmsg_base base;
302 struct ipfw_ioc_state *ioc_state;
303 int state_cntmax;
304 int state_cnt;
307 struct netmsg_tblent {
308 struct netmsg_base base;
309 struct sockaddr *key;
310 struct sockaddr *netmask;
311 struct ipfw_tblent *sibling;
312 int tableid;
315 struct netmsg_tblflush {
316 struct netmsg_base base;
317 int tableid;
318 int destroy;
321 struct netmsg_tblexp {
322 struct netmsg_base base;
323 time_t expire;
324 int tableid;
325 int cnt;
326 int expcnt;
327 struct radix_node_head *rnh;
330 struct ipfw_table_cp {
331 struct ipfw_ioc_tblent *te;
332 int te_idx;
333 int te_cnt;
336 struct ip_fw_local {
338 * offset The offset of a fragment. offset != 0 means that
339 * we have a fragment at this offset of an IPv4 packet.
340 * offset == 0 means that (if this is an IPv4 packet)
341 * this is the first or only fragment.
343 u_short offset;
346 * Local copies of addresses. They are only valid if we have
347 * an IP packet.
349 * proto The protocol. Set to 0 for non-ip packets,
350 * or to the protocol read from the packet otherwise.
351 * proto != 0 means that we have an IPv4 packet.
353 * src_port, dst_port port numbers, in HOST format. Only
354 * valid for TCP and UDP packets.
356 * src_ip, dst_ip ip addresses, in NETWORK format.
357 * Only valid for IPv4 packets.
359 uint8_t proto;
360 uint16_t src_port; /* NOTE: host format */
361 uint16_t dst_port; /* NOTE: host format */
362 struct in_addr src_ip; /* NOTE: network format */
363 struct in_addr dst_ip; /* NOTE: network format */
364 uint16_t ip_len;
367 struct ipfw_addrs {
368 uint32_t addr1;
369 uint32_t addr2;
372 struct ipfw_ports {
373 uint16_t port1;
374 uint16_t port2;
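/*
 * Canonical flow key: addresses and ports are stored in a fixed
 * (sorted) order so that both directions of a connection produce the
 * same key.  The 'swap' field records which fields were reordered
 * (IPFW_KEY_SWAP_*), allowing the original direction to be recovered;
 * see ipfw_key_build() and ipfw_key_4tuple().
 */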
377 struct ipfw_key {
378 union {
379 struct ipfw_addrs addrs;
380 uint64_t value;
381 } addr_u;
382 union {
383 struct ipfw_ports ports;
384 uint32_t value;
385 } port_u;
386 uint8_t proto;
387 uint8_t swap; /* IPFW_KEY_SWAP_ */
388 uint16_t rsvd2;
391 #define IPFW_KEY_SWAP_ADDRS 0x1
392 #define IPFW_KEY_SWAP_PORTS 0x2
393 #define IPFW_KEY_SWAP_ALL (IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
395 struct ipfw_trkcnt {
396 RB_ENTRY(ipfw_trkcnt) tc_rblink;
397 struct ipfw_key tc_key;
398 uintptr_t tc_ruleid;
399 int tc_refs;
400 int tc_count;
401 time_t tc_expire; /* userland get-only */
402 uint16_t tc_rulenum; /* userland get-only */
403 } __cachealign;
405 #define tc_addrs tc_key.addr_u.value
406 #define tc_ports tc_key.port_u.value
407 #define tc_proto tc_key.proto
408 #define tc_saddr tc_key.addr_u.addrs.addr1
409 #define tc_daddr tc_key.addr_u.addrs.addr2
410 #define tc_sport tc_key.port_u.ports.port1
411 #define tc_dport tc_key.port_u.ports.port2
413 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
415 struct ipfw_state;
417 struct ipfw_track {
418 RB_ENTRY(ipfw_track) t_rblink;
419 struct ipfw_key t_key;
420 struct ip_fw *t_rule;
421 time_t t_lastexp;
422 LIST_HEAD(, ipfw_state) t_state_list;
423 time_t t_expire;
424 volatile int *t_count;
425 struct ipfw_trkcnt *t_trkcnt;
426 TAILQ_ENTRY(ipfw_track) t_link;
429 #define t_addrs t_key.addr_u.value
430 #define t_ports t_key.port_u.value
431 #define t_proto t_key.proto
432 #define t_saddr t_key.addr_u.addrs.addr1
433 #define t_daddr t_key.addr_u.addrs.addr2
434 #define t_sport t_key.port_u.ports.port1
435 #define t_dport t_key.port_u.ports.port2
437 RB_HEAD(ipfw_track_tree, ipfw_track);
438 TAILQ_HEAD(ipfw_track_list, ipfw_track);
440 struct ipfw_state {
441 RB_ENTRY(ipfw_state) st_rblink;
442 struct ipfw_key st_key;
444 time_t st_expire; /* expire time */
445 struct ip_fw *st_rule;
447 uint64_t st_pcnt; /* packets */
448 uint64_t st_bcnt; /* bytes */
451 * st_state:
452 * State of this rule, typically a combination of TCP flags.
454 * st_ack_fwd/st_ack_rev:
455 * Most recent ACKs in forward and reverse direction. They
456 * are used to generate keepalives.
458 uint32_t st_state;
459 uint32_t st_ack_fwd;
460 uint32_t st_seq_fwd;
461 uint32_t st_ack_rev;
462 uint32_t st_seq_rev;
464 uint16_t st_flags; /* IPFW_STATE_F_ */
465 uint16_t st_type; /* O_KEEP_STATE/O_LIMIT */
466 struct ipfw_track *st_track;
468 LIST_ENTRY(ipfw_state) st_trklink;
469 TAILQ_ENTRY(ipfw_state) st_link;
472 #define st_addrs st_key.addr_u.value
473 #define st_ports st_key.port_u.value
474 #define st_proto st_key.proto
475 #define st_swap st_key.swap
477 #define IPFW_STATE_F_ACKFWD 0x0001
478 #define IPFW_STATE_F_SEQFWD 0x0002
479 #define IPFW_STATE_F_ACKREV 0x0004
480 #define IPFW_STATE_F_SEQREV 0x0008
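/*
 * The IPFW_STATE_F_* flags record whether st_ack_fwd, st_seq_fwd,
 * st_ack_rev and st_seq_rev have been initialized from a packet yet;
 * ipfw_state_update_tcp() only trusts these fields once the
 * corresponding flag has been set.
 */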
482 TAILQ_HEAD(ipfw_state_list, ipfw_state);
483 RB_HEAD(ipfw_state_tree, ipfw_state);
485 struct ipfw_tblent {
486 struct radix_node te_nodes[2];
487 struct sockaddr_in te_key;
488 u_long te_use;
489 time_t te_lastuse;
490 struct ipfw_tblent *te_sibling;
491 volatile int te_expired;
494 struct ipfw_context {
495 struct ip_fw *ipfw_layer3_chain; /* rules for layer3 */
496 struct ip_fw *ipfw_default_rule; /* default rule */
497 uint64_t ipfw_norule_counter; /* ipfw_log(NULL) stat*/
500 * ipfw_set_disable contains one bit per set value (0..31).
501 * If the bit is set, all rules with the corresponding set
502 * are disabled. Set IPFW_DEFAULT_SET is reserved for the
503 * default rule and CANNOT be disabled.
505 uint32_t ipfw_set_disable;
507 uint8_t ipfw_flags; /* IPFW_FLAG_ */
509 struct ip_fw *ipfw_cont_rule;
511 struct ipfw_state_tree ipfw_state_tree;
512 struct ipfw_state_list ipfw_state_list;
513 int ipfw_state_loosecnt;
514 int ipfw_state_cnt;
516 union {
517 struct ipfw_state state;
518 struct ipfw_track track;
519 struct ipfw_trkcnt trkcnt;
520 } ipfw_tmpkey;
522 struct ipfw_track_tree ipfw_track_tree;
523 struct ipfw_track_list ipfw_track_list;
524 struct ipfw_trkcnt *ipfw_trkcnt_spare;
526 struct callout ipfw_stateto_ch;
527 time_t ipfw_state_lastexp;
528 struct netmsg_base ipfw_stateexp_nm;
529 struct netmsg_base ipfw_stateexp_more;
530 struct ipfw_state ipfw_stateexp_anch;
532 struct callout ipfw_trackto_ch;
533 time_t ipfw_track_lastexp;
534 struct netmsg_base ipfw_trackexp_nm;
535 struct netmsg_base ipfw_trackexp_more;
536 struct ipfw_track ipfw_trackexp_anch;
538 struct callout ipfw_keepalive_ch;
539 struct netmsg_base ipfw_keepalive_nm;
540 struct netmsg_base ipfw_keepalive_more;
541 struct ipfw_state ipfw_keepalive_anch;
544 * Statistics
546 u_long ipfw_sts_reap;
547 u_long ipfw_sts_reapfailed;
548 u_long ipfw_sts_overflow;
549 u_long ipfw_sts_nomem;
550 u_long ipfw_sts_tcprecycled;
552 u_long ipfw_tks_nomem;
553 u_long ipfw_tks_reap;
554 u_long ipfw_tks_reapfailed;
555 u_long ipfw_tks_overflow;
556 u_long ipfw_tks_cntnomem;
558 u_long ipfw_frags;
559 u_long ipfw_defraged;
560 u_long ipfw_defrag_remote;
562 /* Last field */
563 struct radix_node_head *ipfw_tables[];
566 #define IPFW_FLAG_KEEPALIVE 0x01
567 #define IPFW_FLAG_STATEEXP 0x02
568 #define IPFW_FLAG_TRACKEXP 0x04
569 #define IPFW_FLAG_STATEREAP 0x08
570 #define IPFW_FLAG_TRACKREAP 0x10
572 #define ipfw_state_tmpkey ipfw_tmpkey.state
573 #define ipfw_track_tmpkey ipfw_tmpkey.track
574 #define ipfw_trkcnt_tmpkey ipfw_tmpkey.trkcnt
576 struct ipfw_global {
577 int ipfw_state_loosecnt; /* cache aligned */
578 time_t ipfw_state_globexp __cachealign;
580 struct lwkt_token ipfw_trkcnt_token __cachealign;
581 struct ipfw_trkcnt_tree ipfw_trkcnt_tree;
582 int ipfw_trkcnt_cnt;
583 time_t ipfw_track_globexp;
585 /* Accessed in netisr0. */
586 struct ip_fw *ipfw_crossref_free __cachealign;
587 struct callout ipfw_crossref_ch;
588 struct netmsg_base ipfw_crossref_nm;
590 #ifdef KLD_MODULE
592 * The module cannot be unloaded if there are references to
593 * certain rules of ipfw(4), e.g. from dummynet(4).
595 int ipfw_refcnt __cachealign;
596 #endif
597 } __cachealign;
599 static struct ipfw_context *ipfw_ctx[MAXCPU];
601 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
604 * The following two global variables are accessed and updated only
605 * in netisr0.
607 static uint32_t static_count; /* # of static rules */
608 static uint32_t static_ioc_len; /* bytes of static rules */
611 * If 1, ipfw static rules are being flushed and
612 * ipfw_chk() will skip to the default rule.
614 static int ipfw_flushing;
616 static int fw_verbose;
617 static int verbose_limit;
619 static int fw_debug;
620 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
622 static int ipfw_table_max = IPFW_TABLE_MAX_DEF;
624 static int ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
625 static int ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
627 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
629 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
630 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
631 "Firewall statistics");
633 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
634 &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
635 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
636 &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
637 "Rule number autincrement step");
638 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
639 &fw_one_pass, 0,
640 "Only do a single pass through ipfw when using dummynet(4)");
641 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
642 &fw_debug, 0, "Enable printing of debug ip_fw statements");
643 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
644 &fw_verbose, 0, "Log matches to ipfw rules");
645 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
646 &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
647 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
648 &ipfw_table_max, 0, "Max # of tables");
650 static int ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
651 static int ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
652 static int ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
653 static int ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
654 static int ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
655 static int ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
658 * Timeouts for various events in handling states.
660 * NOTE:
661 * 1 == 0~1 second.
662 * 2 == 1~2 second(s).
664 * We use 2 seconds for FIN lifetime, so that the states will not be
665 * ripped prematurely.
667 static uint32_t dyn_ack_lifetime = 300;
668 static uint32_t dyn_syn_lifetime = 20;
669 static uint32_t dyn_finwait_lifetime = 20;
670 static uint32_t dyn_fin_lifetime = 2;
671 static uint32_t dyn_rst_lifetime = 2;
672 static uint32_t dyn_udp_lifetime = 10;
673 static uint32_t dyn_short_lifetime = 5; /* used by tracks too */
676 * Keepalives are sent if dyn_keepalive is set. They are sent every
677 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
678 * seconds of lifetime of a rule.
680 static uint32_t dyn_keepalive_interval = 20;
681 static uint32_t dyn_keepalive_period = 5;
682 static uint32_t dyn_keepalive = 1; /* do send keepalives */
684 static struct ipfw_global ipfw_gd;
685 static int ipfw_state_loosecnt_updthr;
686 static int ipfw_state_max = 4096; /* max # of states */
687 static int ipfw_track_max = 4096; /* max # of tracks */
689 static int ipfw_state_headroom; /* setup at module load time */
690 static int ipfw_state_reap_min = 8;
691 static int ipfw_state_expire_max = 32;
692 static int ipfw_state_scan_max = 256;
693 static int ipfw_keepalive_max = 8;
694 static int ipfw_track_reap_max = 4;
695 static int ipfw_track_expire_max = 16;
696 static int ipfw_track_scan_max = 128;
698 static eventhandler_tag ipfw_ifaddr_event;
700 /* Compat */
701 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
702 CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
703 "Number of states and tracks");
704 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
705 CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
706 "Max number of states and tracks");
708 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
709 CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
710 "Number of states");
711 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
712 CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
713 "Max number of states");
714 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
715 &ipfw_state_headroom, 0, "headroom for state reap");
716 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
717 &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
718 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
719 &ipfw_track_max, 0, "Max number of tracks");
720 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
721 &static_count, 0, "Number of static rules");
722 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
723 &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
724 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
725 &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
726 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
727 &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
728 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
729 &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
730 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
731 &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
732 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
733 &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
734 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
735 &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
736 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
737 &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
738 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
739 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
740 "I", "# of states to scan for each expire iteration");
741 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
742 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
743 "I", "# of states to expire for each expire iteration");
744 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
745 CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
746 "I", "# of states to expire for each expire iteration");
747 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
748 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
749 "I", "# of states to reap for state shortage");
750 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
751 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
752 "I", "# of tracks to scan for each expire iteration");
753 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
754 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
755 "I", "# of tracks to expire for each expire iteration");
756 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
757 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
758 "I", "# of tracks to reap for track shortage");
760 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
761 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
762 __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
763 "LU", "# of state reaps due to states shortage");
764 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
765 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
766 __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
767 "LU", "# of state reap failure");
768 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
769 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
770 __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
771 "LU", "# of state overflow");
772 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
773 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
774 __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
775 "LU", "# of state allocation failure");
776 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
777 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
778 __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
779 "LU", "# of state deleted due to fast TCP port recycling");
781 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
782 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
783 __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
784 "LU", "# of track allocation failure");
785 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
786 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
787 __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
788 "LU", "# of track reap due to tracks shortage");
789 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
790 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
791 __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
792 "LU", "# of track reap failure");
793 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
794 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
795 __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
796 "LU", "# of track overflow");
797 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
798 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
799 __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
800 "LU", "# of track counter allocation failure");
801 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
802 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
803 __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
804 "LU", "# of IP fragements defraged");
805 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
806 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
807 __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
808 "LU", "# of IP packets after defrag");
809 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
810 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
811 __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
812 "LU", "# of IP packets after defrag dispatched to remote cpus");
814 static int ipfw_state_cmp(struct ipfw_state *,
815 struct ipfw_state *);
816 static int ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
817 struct ipfw_trkcnt *);
818 static int ipfw_track_cmp(struct ipfw_track *,
819 struct ipfw_track *);
821 RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
822 RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
824 RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
825 RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
827 RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
828 RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
830 static ip_fw_chk_t ipfw_chk;
831 static void ipfw_track_expire_ipifunc(void *);
832 static void ipfw_state_expire_ipifunc(void *);
833 static void ipfw_keepalive(void *);
834 static int ipfw_state_expire_start(struct ipfw_context *,
835 int, int);
836 static void ipfw_crossref_timeo(void *);
838 #define IPFW_TRKCNT_TOKGET lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
839 #define IPFW_TRKCNT_TOKREL lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
840 #define IPFW_TRKCNT_TOKINIT \
841 lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
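/*
 * Local helper: copy 'src' into 'dst' while applying 'netmask'
 * byte-by-byte.  The first two bytes (sa_len and sa_family) are
 * copied verbatim, and bytes beyond the mask length are zeroed.
 */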
843 static void
844 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
845 const struct sockaddr *netmask)
847 const u_char *cp1 = (const u_char *)src;
848 u_char *cp2 = (u_char *)dst;
849 const u_char *cp3 = (const u_char *)netmask;
850 u_char *cplim = cp2 + *cp3;
851 u_char *cplim2 = cp2 + *cp1;
853 *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
854 cp3 += 2;
855 if (cplim > cplim2)
856 cplim = cplim2;
857 while (cp2 < cplim)
858 *cp2++ = *cp1++ & *cp3++;
859 if (cp2 < cplim2)
860 bzero(cp2, cplim2 - cp2);
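/*
 * Build the canonical key for a flow: the larger address goes into
 * addr1 and the larger port into port1, with IPFW_KEY_SWAP_ADDRS and
 * IPFW_KEY_SWAP_PORTS recording any reordering.  The extra checks at
 * the end keep the swap flags consistent between the two directions
 * when the addresses or the ports happen to be equal.
 */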
863 static __inline void
864 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
865 in_addr_t daddr, uint16_t dport, uint8_t proto)
868 key->proto = proto;
869 key->swap = 0;
871 if (saddr < daddr) {
872 key->addr_u.addrs.addr1 = daddr;
873 key->addr_u.addrs.addr2 = saddr;
874 key->swap |= IPFW_KEY_SWAP_ADDRS;
875 } else {
876 key->addr_u.addrs.addr1 = saddr;
877 key->addr_u.addrs.addr2 = daddr;
880 if (sport < dport) {
881 key->port_u.ports.port1 = dport;
882 key->port_u.ports.port2 = sport;
883 key->swap |= IPFW_KEY_SWAP_PORTS;
884 } else {
885 key->port_u.ports.port1 = sport;
886 key->port_u.ports.port2 = dport;
889 if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
890 key->swap |= IPFW_KEY_SWAP_PORTS;
891 if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
892 key->swap |= IPFW_KEY_SWAP_ADDRS;
895 static __inline void
896 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
897 in_addr_t *daddr, uint16_t *dport)
900 if (key->swap & IPFW_KEY_SWAP_ADDRS) {
901 *saddr = key->addr_u.addrs.addr2;
902 *daddr = key->addr_u.addrs.addr1;
903 } else {
904 *saddr = key->addr_u.addrs.addr1;
905 *daddr = key->addr_u.addrs.addr2;
908 if (key->swap & IPFW_KEY_SWAP_PORTS) {
909 *sport = key->port_u.ports.port2;
910 *dport = key->port_u.ports.port1;
911 } else {
912 *sport = key->port_u.ports.port1;
913 *dport = key->port_u.ports.port2;
917 static int
918 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
921 if (s1->st_proto > s2->st_proto)
922 return (1);
923 if (s1->st_proto < s2->st_proto)
924 return (-1);
926 if (s1->st_addrs > s2->st_addrs)
927 return (1);
928 if (s1->st_addrs < s2->st_addrs)
929 return (-1);
931 if (s1->st_ports > s2->st_ports)
932 return (1);
933 if (s1->st_ports < s2->st_ports)
934 return (-1);
936 if (s1->st_swap == s2->st_swap ||
937 (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
938 return (0);
940 if (s1->st_swap > s2->st_swap)
941 return (1);
942 else
943 return (-1);
946 static int
947 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
950 if (t1->tc_proto > t2->tc_proto)
951 return (1);
952 if (t1->tc_proto < t2->tc_proto)
953 return (-1);
955 if (t1->tc_addrs > t2->tc_addrs)
956 return (1);
957 if (t1->tc_addrs < t2->tc_addrs)
958 return (-1);
960 if (t1->tc_ports > t2->tc_ports)
961 return (1);
962 if (t1->tc_ports < t2->tc_ports)
963 return (-1);
965 if (t1->tc_ruleid > t2->tc_ruleid)
966 return (1);
967 if (t1->tc_ruleid < t2->tc_ruleid)
968 return (-1);
970 return (0);
973 static int
974 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
977 if (t1->t_proto > t2->t_proto)
978 return (1);
979 if (t1->t_proto < t2->t_proto)
980 return (-1);
982 if (t1->t_addrs > t2->t_addrs)
983 return (1);
984 if (t1->t_addrs < t2->t_addrs)
985 return (-1);
987 if (t1->t_ports > t2->t_ports)
988 return (1);
989 if (t1->t_ports < t2->t_ports)
990 return (-1);
992 if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
993 return (1);
994 if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
995 return (-1);
997 return (0);
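/*
 * Update the global state limit and recompute the per-cpu threshold
 * at which a cpu's "loose" state count is folded into the global
 * counter (roughly 5% of the limit, split across the netisr cpus).
 */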
1000 static void
1001 ipfw_state_max_set(int state_max)
1004 ipfw_state_max = state_max;
1005 /* Allow 5% states over-allocation. */
1006 ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
1009 static __inline int
1010 ipfw_state_cntcoll(void)
1012 int cpu, state_cnt = 0;
1014 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1015 state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1016 return (state_cnt);
1019 static __inline int
1020 ipfw_state_cntsync(void)
1022 int state_cnt;
1024 state_cnt = ipfw_state_cntcoll();
1025 ipfw_gd.ipfw_state_loosecnt = state_cnt;
1026 return (state_cnt);
1029 static __inline int
1030 ipfw_free_rule(struct ip_fw *rule)
1032 KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1033 KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1034 rule->refcnt--;
1035 if (rule->refcnt == 0) {
1036 if (rule->cross_rules != NULL)
1037 kfree(rule->cross_rules, M_IPFW);
1038 kfree(rule, M_IPFW);
1039 return 1;
1041 return 0;
1044 static void
1045 ipfw_unref_rule(void *priv)
1047 ipfw_free_rule(priv);
1048 #ifdef KLD_MODULE
1049 KASSERT(ipfw_gd.ipfw_refcnt > 0,
1050 ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
1051 atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
1052 #endif
1055 static __inline void
1056 ipfw_ref_rule(struct ip_fw *rule)
1058 KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
1059 #ifdef KLD_MODULE
1060 atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
1061 #endif
1062 rule->refcnt++;
1066 * This macro maps an ip pointer into a layer3 header pointer of type T
1068 #define L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1070 static __inline int
1071 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1073 int type = L3HDR(struct icmp,ip)->icmp_type;
1075 return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1 << type)));
1078 #define TT ((1 << ICMP_ECHO) | \
1079 (1 << ICMP_ROUTERSOLICIT) | \
1080 (1 << ICMP_TSTAMP) | \
1081 (1 << ICMP_IREQ) | \
1082 (1 << ICMP_MASKREQ))
1084 static int
1085 is_icmp_query(struct ip *ip)
1087 int type = L3HDR(struct icmp, ip)->icmp_type;
1089 return (type <= ICMP_MAXTYPE && (TT & (1 << type)));
1092 #undef TT
1095 * The following checks use two arrays of 8 or 16 bits to store the
1096 * bits that we want set or clear, respectively. They are in the
1097 * low and high half of cmd->arg1 or cmd->d[0].
1099 * We scan options and store the bits we find set. We succeed if
1101 * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1103 * The code is sometimes optimized not to store additional variables.
1105 static int
1106 flags_match(ipfw_insn *cmd, uint8_t bits)
1108 u_char want_clear;
1109 bits = ~bits;
1111 if (((cmd->arg1 & 0xff) & bits) != 0)
1112 return 0; /* some bits we want set were clear */
1114 want_clear = (cmd->arg1 >> 8) & 0xff;
1115 if ((want_clear & bits) != want_clear)
1116 return 0; /* some bits we want clear were set */
1117 return 1;
1120 static int
1121 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1123 int optlen, bits = 0;
1124 u_char *cp = (u_char *)(ip + 1);
1125 int x = (ip->ip_hl << 2) - sizeof(struct ip);
1127 for (; x > 0; x -= optlen, cp += optlen) {
1128 int opt = cp[IPOPT_OPTVAL];
1130 if (opt == IPOPT_EOL)
1131 break;
1133 if (opt == IPOPT_NOP) {
1134 optlen = 1;
1135 } else {
1136 optlen = cp[IPOPT_OLEN];
1137 if (optlen <= 0 || optlen > x)
1138 return 0; /* invalid or truncated */
1141 switch (opt) {
1142 case IPOPT_LSRR:
1143 bits |= IP_FW_IPOPT_LSRR;
1144 break;
1146 case IPOPT_SSRR:
1147 bits |= IP_FW_IPOPT_SSRR;
1148 break;
1150 case IPOPT_RR:
1151 bits |= IP_FW_IPOPT_RR;
1152 break;
1154 case IPOPT_TS:
1155 bits |= IP_FW_IPOPT_TS;
1156 break;
1158 default:
1159 break;
1162 return (flags_match(cmd, bits));
1165 static int
1166 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1168 int optlen, bits = 0;
1169 struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1170 u_char *cp = (u_char *)(tcp + 1);
1171 int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1173 for (; x > 0; x -= optlen, cp += optlen) {
1174 int opt = cp[0];
1176 if (opt == TCPOPT_EOL)
1177 break;
1179 if (opt == TCPOPT_NOP) {
1180 optlen = 1;
1181 } else {
1182 optlen = cp[1];
1183 if (optlen <= 0)
1184 break;
1187 switch (opt) {
1188 case TCPOPT_MAXSEG:
1189 bits |= IP_FW_TCPOPT_MSS;
1190 break;
1192 case TCPOPT_WINDOW:
1193 bits |= IP_FW_TCPOPT_WINDOW;
1194 break;
1196 case TCPOPT_SACK_PERMITTED:
1197 case TCPOPT_SACK:
1198 bits |= IP_FW_TCPOPT_SACK;
1199 break;
1201 case TCPOPT_TIMESTAMP:
1202 bits |= IP_FW_TCPOPT_TS;
1203 break;
1205 case TCPOPT_CC:
1206 case TCPOPT_CCNEW:
1207 case TCPOPT_CCECHO:
1208 bits |= IP_FW_TCPOPT_CC;
1209 break;
1211 default:
1212 break;
1215 return (flags_match(cmd, bits));
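/*
 * Interface match: if the instruction carries a name, compare it
 * against if_xname (glob match when cmd->p.glob is set); otherwise
 * compare cmd->p.ip against every AF_INET address configured on the
 * interface.
 */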
1218 static int
1219 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1221 if (ifp == NULL) /* no iface with this packet, match fails */
1222 return 0;
1224 /* Check by name or by IP address */
1225 if (cmd->name[0] != '\0') { /* match by name */
1226 /* Check name */
1227 if (cmd->p.glob) {
1228 if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1229 return(1);
1230 } else {
1231 if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1232 return(1);
1234 } else {
1235 struct ifaddr_container *ifac;
1237 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1238 struct ifaddr *ia = ifac->ifa;
1240 if (ia->ifa_addr == NULL)
1241 continue;
1242 if (ia->ifa_addr->sa_family != AF_INET)
1243 continue;
1244 if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1245 (ia->ifa_addr))->sin_addr.s_addr)
1246 return(1); /* match */
1249 return(0); /* no match, fail ... */
1252 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
1255 * We enter here when we have a rule with O_LOG.
1256 * XXX this function alone takes about 2Kbytes of code!
1258 static void
1259 ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
1260 struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
1262 char *action;
1263 int limit_reached = 0;
1264 char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];
1266 fragment[0] = '\0';
1267 proto[0] = '\0';
1269 if (f == NULL) { /* bogus pkt */
1270 if (verbose_limit != 0 &&
1271 ctx->ipfw_norule_counter >= verbose_limit)
1272 return;
1273 ctx->ipfw_norule_counter++;
1274 if (ctx->ipfw_norule_counter == verbose_limit)
1275 limit_reached = verbose_limit;
1276 action = "Refuse";
1277 } else { /* O_LOG is the first action, find the real one */
1278 ipfw_insn *cmd = ACTION_PTR(f);
1279 ipfw_insn_log *l = (ipfw_insn_log *)cmd;
1281 if (l->max_log != 0 && l->log_left == 0)
1282 return;
1283 l->log_left--;
1284 if (l->log_left == 0)
1285 limit_reached = l->max_log;
1286 cmd += F_LEN(cmd); /* point to first action */
1287 if (cmd->opcode == O_PROB)
1288 cmd += F_LEN(cmd);
1290 action = action2;
1291 switch (cmd->opcode) {
1292 case O_DENY:
1293 action = "Deny";
1294 break;
1296 case O_REJECT:
1297 if (cmd->arg1==ICMP_REJECT_RST) {
1298 action = "Reset";
1299 } else if (cmd->arg1==ICMP_UNREACH_HOST) {
1300 action = "Reject";
1301 } else {
1302 ksnprintf(SNPARGS(action2, 0), "Unreach %d",
1303 cmd->arg1);
1305 break;
1307 case O_ACCEPT:
1308 action = "Accept";
1309 break;
1311 case O_COUNT:
1312 action = "Count";
1313 break;
1315 case O_DIVERT:
1316 ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
1317 break;
1319 case O_TEE:
1320 ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
1321 break;
1323 case O_SKIPTO:
1324 ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
1325 break;
1327 case O_PIPE:
1328 ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
1329 break;
1331 case O_QUEUE:
1332 ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
1333 break;
1335 case O_FORWARD_IP:
1337 ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
1338 int len;
1340 len = ksnprintf(SNPARGS(action2, 0),
1341 "Forward to %s",
1342 kinet_ntoa(sa->sa.sin_addr, abuf));
1343 if (sa->sa.sin_port) {
1344 ksnprintf(SNPARGS(action2, len), ":%d",
1345 sa->sa.sin_port);
1348 break;
1350 default:
1351 action = "UNKNOWN";
1352 break;
1356 if (hlen == 0) { /* non-ip */
1357 ksnprintf(SNPARGS(proto, 0), "MAC");
1358 } else {
1359 struct ip *ip = mtod(m, struct ip *);
1360 /* these three are all aliases to the same thing */
1361 struct icmp *const icmp = L3HDR(struct icmp, ip);
1362 struct tcphdr *const tcp = (struct tcphdr *)icmp;
1363 struct udphdr *const udp = (struct udphdr *)icmp;
1365 int ip_off, offset, ip_len;
1366 int len;
1368 if (eh != NULL) { /* layer 2 packets are as on the wire */
1369 ip_off = ntohs(ip->ip_off);
1370 ip_len = ntohs(ip->ip_len);
1371 } else {
1372 ip_off = ip->ip_off;
1373 ip_len = ip->ip_len;
1375 offset = ip_off & IP_OFFMASK;
1376 switch (ip->ip_p) {
1377 case IPPROTO_TCP:
1378 len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
1379 kinet_ntoa(ip->ip_src, abuf));
1380 if (offset == 0) {
1381 ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1382 ntohs(tcp->th_sport),
1383 kinet_ntoa(ip->ip_dst, abuf),
1384 ntohs(tcp->th_dport));
1385 } else {
1386 ksnprintf(SNPARGS(proto, len), " %s",
1387 kinet_ntoa(ip->ip_dst, abuf));
1389 break;
1391 case IPPROTO_UDP:
1392 len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
1393 kinet_ntoa(ip->ip_src, abuf));
1394 if (offset == 0) {
1395 ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1396 ntohs(udp->uh_sport),
1397 kinet_ntoa(ip->ip_dst, abuf),
1398 ntohs(udp->uh_dport));
1399 } else {
1400 ksnprintf(SNPARGS(proto, len), " %s",
1401 kinet_ntoa(ip->ip_dst, abuf));
1403 break;
1405 case IPPROTO_ICMP:
1406 if (offset == 0) {
1407 len = ksnprintf(SNPARGS(proto, 0),
1408 "ICMP:%u.%u ",
1409 icmp->icmp_type,
1410 icmp->icmp_code);
1411 } else {
1412 len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
1414 len += ksnprintf(SNPARGS(proto, len), "%s",
1415 kinet_ntoa(ip->ip_src, abuf));
1416 ksnprintf(SNPARGS(proto, len), " %s",
1417 kinet_ntoa(ip->ip_dst, abuf));
1418 break;
1420 default:
1421 len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
1422 kinet_ntoa(ip->ip_src, abuf));
1423 ksnprintf(SNPARGS(proto, len), " %s",
1424 kinet_ntoa(ip->ip_dst, abuf));
1425 break;
1428 if (ip_off & (IP_MF | IP_OFFMASK)) {
1429 ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
1430 ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
1431 offset << 3, (ip_off & IP_MF) ? "+" : "");
1435 if (oif || m->m_pkthdr.rcvif) {
1436 log(LOG_SECURITY | LOG_INFO,
1437 "ipfw: %d %s %s %s via %s%s\n",
1438 f ? f->rulenum : -1,
1439 action, proto, oif ? "out" : "in",
1440 oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
1441 fragment);
1442 } else {
1443 log(LOG_SECURITY | LOG_INFO,
1444 "ipfw: %d %s %s [no if info]%s\n",
1445 f ? f->rulenum : -1,
1446 action, proto, fragment);
1449 if (limit_reached) {
1450 log(LOG_SECURITY | LOG_NOTICE,
1451 "ipfw: limit %d reached on entry %d\n",
1452 limit_reached, f ? f->rulenum : -1);
1456 #undef SNPARGS
1458 #define TIME_LEQ(a, b) ((a) - (b) <= 0)
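/*
 * Unlink and free a state: drop it from its track's state list (and
 * decrement the shared track counter), remove it from the per-cpu
 * RB tree and list, then adjust the per-cpu counters.
 */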
1460 static void
1461 ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1464 KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT,
1465 ("invalid state type %u", s->st_type));
1466 KASSERT(ctx->ipfw_state_cnt > 0,
1467 ("invalid state count %d", ctx->ipfw_state_cnt));
1469 if (s->st_track != NULL) {
1470 struct ipfw_track *t = s->st_track;
1472 KASSERT(!LIST_EMPTY(&t->t_state_list),
1473 ("track state list is empty"));
1474 LIST_REMOVE(s, st_trklink);
1476 KASSERT(*t->t_count > 0,
1477 ("invalid track count %d", *t->t_count));
1478 atomic_subtract_int(t->t_count, 1);
1481 TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1482 RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1483 kfree(s, M_IPFW);
1485 ctx->ipfw_state_cnt--;
1486 if (ctx->ipfw_state_loosecnt > 0)
1487 ctx->ipfw_state_loosecnt--;
1490 static int
1491 ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
1493 struct ipfw_state *s, *anchor;
1494 int expired;
1496 if (reap_max < ipfw_state_reap_min)
1497 reap_max = ipfw_state_reap_min;
1499 if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1501 * Kick start state expiring. Ignore scan limit,
1502 * we are short of states.
1504 ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1505 expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1506 ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1507 return (expired);
1511 * States are being expired.
1514 if (ctx->ipfw_state_cnt == 0)
1515 return (0);
1517 expired = 0;
1518 anchor = &ctx->ipfw_stateexp_anch;
1519 while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1521 * Ignore scan limit; we are short of states.
1524 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1525 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1527 if (s->st_type == O_ANCHOR)
1528 continue;
1530 if (IPFW_STATE_TCPCLOSED(s) ||
1531 TIME_LEQ(s->st_expire, time_uptime)) {
1532 ipfw_state_del(ctx, s);
1533 if (++expired >= reap_max)
1534 break;
1535 if ((expired & 0xff) == 0 &&
1536 ipfw_state_cntcoll() + ipfw_state_headroom <=
1537 ipfw_state_max)
1538 break;
1542 * NOTE:
1543 * Leave the anchor on the list, even if the end of the list has
1544 * been reached. ipfw_state_expire_more_dispatch() will handle
1545 * the removal.
1547 return (expired);
1550 static void
1551 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1553 struct ipfw_state *s, *sn;
1555 TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1556 if (s->st_type == O_ANCHOR)
1557 continue;
1558 if (rule != NULL && s->st_rule != rule)
1559 continue;
1560 ipfw_state_del(ctx, s);
1564 static void
1565 ipfw_state_expire_done(struct ipfw_context *ctx)
1568 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1569 ("stateexp is not in progress"));
1570 ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1571 callout_reset(&ctx->ipfw_stateto_ch, hz,
1572 ipfw_state_expire_ipifunc, NULL);
1575 static void
1576 ipfw_state_expire_more(struct ipfw_context *ctx)
1578 struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1580 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1581 ("stateexp is not in progress"));
1582 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1583 ("stateexp more did not finish"));
1584 netisr_sendmsg_oncpu(nm);
1587 static int
1588 ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1589 int scan_max, int expire_max)
1591 struct ipfw_state *s;
1592 int scanned = 0, expired = 0;
1594 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1595 ("stateexp is not in progress"));
1597 while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1598 if (scanned++ >= scan_max) {
1599 ipfw_state_expire_more(ctx);
1600 return (expired);
1603 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1604 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1606 if (s->st_type == O_ANCHOR)
1607 continue;
1609 if (TIME_LEQ(s->st_expire, time_uptime) ||
1610 ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1611 IPFW_STATE_TCPCLOSED(s))) {
1612 ipfw_state_del(ctx, s);
1613 if (++expired >= expire_max) {
1614 ipfw_state_expire_more(ctx);
1615 return (expired);
1617 if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1618 (expired & 0xff) == 0 &&
1619 ipfw_state_cntcoll() + ipfw_state_headroom <=
1620 ipfw_state_max) {
1621 ipfw_state_expire_more(ctx);
1622 return (expired);
1626 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1627 ipfw_state_expire_done(ctx);
1628 return (expired);
1631 static void
1632 ipfw_state_expire_more_dispatch(netmsg_t nm)
1634 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1635 struct ipfw_state *anchor;
1637 ASSERT_NETISR_NCPUS(mycpuid);
1638 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1639 ("statexp is not in progress"));
1641 /* Reply ASAP */
1642 netisr_replymsg(&nm->base, 0);
1644 anchor = &ctx->ipfw_stateexp_anch;
1645 if (ctx->ipfw_state_cnt == 0) {
1646 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1647 ipfw_state_expire_done(ctx);
1648 return;
1650 ipfw_state_expire_loop(ctx, anchor,
1651 ipfw_state_scan_max, ipfw_state_expire_max);
1654 static int
1655 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1657 struct ipfw_state *anchor;
1659 KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1660 ("stateexp is in progress"));
1661 ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1663 if (ctx->ipfw_state_cnt == 0) {
1664 ipfw_state_expire_done(ctx);
1665 return (0);
1669 * Do not expire more than once per second, it is useless.
1671 if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1672 ctx->ipfw_state_lastexp == time_uptime) {
1673 ipfw_state_expire_done(ctx);
1674 return (0);
1676 ctx->ipfw_state_lastexp = time_uptime;
1678 anchor = &ctx->ipfw_stateexp_anch;
1679 TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1680 return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1683 static void
1684 ipfw_state_expire_dispatch(netmsg_t nm)
1686 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1688 ASSERT_NETISR_NCPUS(mycpuid);
1690 /* Reply ASAP */
1691 crit_enter();
1692 netisr_replymsg(&nm->base, 0);
1693 crit_exit();
1695 if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
1696 /* Running; done. */
1697 return;
1699 ipfw_state_expire_start(ctx,
1700 ipfw_state_scan_max, ipfw_state_expire_max);
1703 static void
1704 ipfw_state_expire_ipifunc(void *dummy __unused)
1706 struct netmsg_base *msg;
1708 KKASSERT(mycpuid < netisr_ncpus);
1709 msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
1711 crit_enter();
1712 if (msg->lmsg.ms_flags & MSGF_DONE)
1713 netisr_sendmsg_oncpu(msg);
1714 crit_exit();
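/*
 * Track TCP sequence/ack numbers per direction.  The first packet in
 * a direction seeds the st_seq and st_ack fields; later packets may
 * only move them forward, otherwise FALSE is returned and the caller
 * skips the state update.  An RST always returns TRUE.
 */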
1717 static boolean_t
1718 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
1720 uint32_t seq = ntohl(tcp->th_seq);
1721 uint32_t ack = ntohl(tcp->th_ack);
1723 if (tcp->th_flags & TH_RST)
1724 return (TRUE);
1726 if (dir == MATCH_FORWARD) {
1727 if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
1728 s->st_flags |= IPFW_STATE_F_SEQFWD;
1729 s->st_seq_fwd = seq;
1730 } else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
1731 s->st_seq_fwd = seq;
1732 } else {
1733 /* Out-of-sequence; done. */
1734 return (FALSE);
1736 if (tcp->th_flags & TH_ACK) {
1737 if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
1738 s->st_flags |= IPFW_STATE_F_ACKFWD;
1739 s->st_ack_fwd = ack;
1740 } else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
1741 s->st_ack_fwd = ack;
1742 } else {
1743 /* Out-of-sequence; done. */
1744 return (FALSE);
1747 if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
1748 (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
1749 s->st_state |= (TH_ACK << 8);
1751 } else {
1752 if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
1753 s->st_flags |= IPFW_STATE_F_SEQREV;
1754 s->st_seq_rev = seq;
1755 } else if (SEQ_GEQ(seq, s->st_seq_rev)) {
1756 s->st_seq_rev = seq;
1757 } else {
1758 /* Out-of-sequence; done. */
1759 return (FALSE);
1761 if (tcp->th_flags & TH_ACK) {
1762 if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
1763 s->st_flags |= IPFW_STATE_F_ACKREV;
1764 s->st_ack_rev = ack;
1765 } else if (SEQ_GEQ(ack, s->st_ack_rev)) {
1766 s->st_ack_rev = ack;
1767 } else {
1768 /* Out-of-sequence; done. */
1769 return (FALSE);
1772 if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
1773 s->st_ack_rev == s->st_seq_fwd + 1)
1774 s->st_state |= TH_ACK;
1777 return (TRUE);
1780 static void
1781 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
1782 const struct tcphdr *tcp, struct ipfw_state *s)
1785 if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
1786 u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
1788 if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
1789 return;
1791 s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
1792 switch (s->st_state & IPFW_STATE_TCPSTATES) {
1793 case TH_SYN: /* opening */
1794 s->st_expire = time_uptime + dyn_syn_lifetime;
1795 break;
1797 case BOTH_SYN: /* move to established */
1798 case BOTH_SYN | TH_FIN: /* one side tries to close */
1799 case BOTH_SYN | (TH_FIN << 8):
1800 s->st_expire = time_uptime + dyn_ack_lifetime;
1801 break;
1803 case BOTH_SYN | BOTH_FIN: /* both sides closed */
1804 if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
1805 /* And both FINs were ACKed. */
1806 s->st_expire = time_uptime + dyn_fin_lifetime;
1807 } else {
1808 s->st_expire = time_uptime +
1809 dyn_finwait_lifetime;
1811 break;
1813 default:
1814 #if 0
1816 * reset or some invalid combination, but can also
1817 * occur if we use keep-state the wrong way.
1819 if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
1820 kprintf("invalid state: 0x%x\n", s->st_state);
1821 #endif
1822 s->st_expire = time_uptime + dyn_rst_lifetime;
1823 break;
1825 } else if (pkt->proto == IPPROTO_UDP) {
1826 s->st_expire = time_uptime + dyn_udp_lifetime;
1827 } else {
1828 /* other protocols */
1829 s->st_expire = time_uptime + dyn_short_lifetime;
1834 * Lookup a state.
1836 static struct ipfw_state *
1837 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
1838 int *match_direction, const struct tcphdr *tcp)
1840 struct ipfw_state *key, *s;
1841 int dir = MATCH_NONE;
1843 key = &ctx->ipfw_state_tmpkey;
1844 ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
1845 pkt->dst_ip, pkt->dst_port, pkt->proto);
1846 s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
1847 if (s == NULL)
1848 goto done; /* not found. */
1849 if (TIME_LEQ(s->st_expire, time_uptime)) {
1850 /* Expired. */
1851 ipfw_state_del(ctx, s);
1852 s = NULL;
1853 goto done;
1855 if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
1856 /* TCP ports recycling is too fast. */
1857 ctx->ipfw_sts_tcprecycled++;
1858 ipfw_state_del(ctx, s);
1859 s = NULL;
1860 goto done;
1863 if (s->st_swap == key->st_swap) {
1864 dir = MATCH_FORWARD;
1865 } else {
1866 KASSERT((s->st_swap & key->st_swap) == 0,
1867 ("found mismatch state"));
1868 dir = MATCH_REVERSE;
1871 /* Update this state. */
1872 ipfw_state_update(pkt, dir, tcp, s);
1874 if (s->st_track != NULL) {
1875 /* This track has been used. */
1876 s->st_track->t_expire = time_uptime + dyn_short_lifetime;
1878 done:
1879 if (match_direction)
1880 *match_direction = dir;
1881 return (s);
1884 static __inline struct ip_fw *
1885 ipfw_state_lookup_rule(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
1886 int *match_direction, const struct tcphdr *tcp, uint16_t len)
1888 struct ipfw_state *s;
1890 s = ipfw_state_lookup(ctx, pkt, match_direction, tcp);
1891 if (s == NULL)
1892 return (NULL);
1894 KASSERT(s->st_rule->cpuid == mycpuid,
1895 ("rule %p (cpu%d) does not belong to the current cpu%d",
1896 s->st_rule, s->st_rule->cpuid, mycpuid));
1898 s->st_pcnt++;
1899 s->st_bcnt += len;
1901 return (s->st_rule);
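/*
 * Allocate and install a new state for this flow.  The allocation is
 * M_NULLOK, so failure only bumps the nomem statistic.  The state is
 * inserted into the per-cpu RB tree and list, its expire time is set
 * via ipfw_state_update(), and, for 'limit' states, it is linked onto
 * the owning track.
 */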
1904 static struct ipfw_state *
1905 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
1906 uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
1907 const struct tcphdr *tcp)
1909 struct ipfw_state *s, *dup;
1911 KASSERT(type == O_KEEP_STATE || type == O_LIMIT,
1912 ("invalid state type %u", type));
1914 s = kmalloc(sizeof(*s), M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
1915 if (s == NULL) {
1916 ctx->ipfw_sts_nomem++;
1917 return (NULL);
1920 ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
1921 id->dst_ip, id->dst_port, id->proto);
1923 s->st_rule = rule;
1924 s->st_type = type;
1926 ctx->ipfw_state_cnt++;
1927 ctx->ipfw_state_loosecnt++;
1928 if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
1929 ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
1930 ctx->ipfw_state_loosecnt = 0;
1933 dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1934 if (dup != NULL)
1935 panic("ipfw: state exists");
1936 TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1939 * Update this state:
1940 * Set st_expire and st_state.
1942 ipfw_state_update(id, MATCH_FORWARD, tcp, s);
1944 if (t != NULL) {
1945 /* Keep the track referenced. */
1946 LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
1947 s->st_track = t;
1949 return (s);
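/*
 * Free a per-cpu track and drop its reference on the shared track
 * counter.  The last reference removes the counter from the global
 * RB tree; the counter itself is cached as the per-cpu spare if that
 * slot is empty, otherwise it is freed.
 */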
1952 static boolean_t
1953 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
1955 struct ipfw_trkcnt *trk;
1956 boolean_t trk_freed = FALSE;
1958 KASSERT(t->t_count != NULL, ("track anchor"));
1959 KASSERT(LIST_EMPTY(&t->t_state_list),
1960 ("invalid track is still referenced"));
1962 trk = t->t_trkcnt;
1963 KASSERT(trk != NULL, ("track has no trkcnt"));
1965 RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
1966 TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
1967 kfree(t, M_IPFW);
1970 * fdrop() style reference counting.
1971 * See kern/kern_descrip.c fdrop().
1973 for (;;) {
1974 int refs = trk->tc_refs;
1976 cpu_ccfence();
1977 KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
1978 if (refs == 1) {
1979 IPFW_TRKCNT_TOKGET;
1980 if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
1981 KASSERT(trk->tc_count == 0,
1982 ("%d states reference this trkcnt",
1983 trk->tc_count));
1984 RB_REMOVE(ipfw_trkcnt_tree,
1985 &ipfw_gd.ipfw_trkcnt_tree, trk);
1987 KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
1988 ("invalid trkcnt cnt %d",
1989 ipfw_gd.ipfw_trkcnt_cnt));
1990 ipfw_gd.ipfw_trkcnt_cnt--;
1991 IPFW_TRKCNT_TOKREL;
1993 if (ctx->ipfw_trkcnt_spare == NULL)
1994 ctx->ipfw_trkcnt_spare = trk;
1995 else
1996 kfree(trk, M_IPFW);
1997 trk_freed = TRUE;
1998 break; /* done! */
2000 IPFW_TRKCNT_TOKREL;
2001 /* retry */
2002 } else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2003 break; /* done! */
2005 /* retry */
2007 return (trk_freed);
2010 static void
2011 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2013 struct ipfw_track *t, *tn;
2015 TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2016 if (t->t_count == NULL) /* anchor */
2017 continue;
2018 if (rule != NULL && t->t_rule != rule)
2019 continue;
2020 ipfw_track_free(ctx, t);
2024 static boolean_t
2025 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2026 boolean_t reap)
2028 struct ipfw_state *s, *sn;
2029 boolean_t ret = FALSE;
2031 KASSERT(t->t_count != NULL, ("track anchor"));
2033 if (LIST_EMPTY(&t->t_state_list))
2034 return (FALSE);
2037 * Do not expire more than once per second, it is useless.
2039 if (t->t_lastexp == time_uptime)
2040 return (FALSE);
2041 t->t_lastexp = time_uptime;
2043 LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2044 if (TIME_LEQ(s->st_expire, time_uptime) ||
2045 (reap && IPFW_STATE_TCPCLOSED(s))) {
2046 KASSERT(s->st_track == t,
2047 ("state track %p does not match %p",
2048 s->st_track, t));
2049 ipfw_state_del(ctx, s);
2050 ret = TRUE;
2053 return (ret);
2056 static __inline struct ipfw_trkcnt *
2057 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2059 struct ipfw_trkcnt *trk;
2061 if (ctx->ipfw_trkcnt_spare != NULL) {
2062 trk = ctx->ipfw_trkcnt_spare;
2063 ctx->ipfw_trkcnt_spare = NULL;
2064 } else {
2065 trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2066 M_INTWAIT | M_NULLOK);
2068 return (trk);
2071 static void
2072 ipfw_track_expire_done(struct ipfw_context *ctx)
2075 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2076 ("trackexp is not in progress"));
2077 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2078 callout_reset(&ctx->ipfw_trackto_ch, hz,
2079 ipfw_track_expire_ipifunc, NULL);
2082 static void
2083 ipfw_track_expire_more(struct ipfw_context *ctx)
2085 struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2087 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2088 ("trackexp is not in progress"));
2089 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2090 ("trackexp more did not finish"));
2091 netisr_sendmsg_oncpu(nm);
2094 static int
2095 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2096 int scan_max, int expire_max)
2098 struct ipfw_track *t;
2099 int scanned = 0, expired = 0;
2100 boolean_t reap = FALSE;
2102 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2103 ("trackexp is not in progress"));
2105 if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2106 reap = TRUE;
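/*
 * The anchor is a dummy list entry (t_count == NULL) that records the
 * scan position.  It is moved past every track we examine, so a scan
 * interrupted by scan_max/expire_max can be resumed later by
 * ipfw_track_expire_more_dispatch().
 */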
2108 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2109 if (scanned++ >= scan_max) {
2110 ipfw_track_expire_more(ctx);
2111 return (expired);
2114 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2115 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2117 if (t->t_count == NULL) /* anchor */
2118 continue;
2120 ipfw_track_state_expire(ctx, t, reap);
2121 if (!LIST_EMPTY(&t->t_state_list)) {
2122 /* There are states referencing this track. */
2123 continue;
2126 if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2127 /* Expired. */
2128 if (ipfw_track_free(ctx, t)) {
2129 if (++expired >= expire_max) {
2130 ipfw_track_expire_more(ctx);
2131 return (expired);
2136 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2137 ipfw_track_expire_done(ctx);
2138 return (expired);
2141 static int
2142 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2144 struct ipfw_track *anchor;
2146 KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2147 ("trackexp is in progress"));
2148 ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2150 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2151 ipfw_track_expire_done(ctx);
2152 return (0);
2156 * Do not expire more than once per second, it is useless.
2158 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2159 ctx->ipfw_track_lastexp == time_uptime) {
2160 ipfw_track_expire_done(ctx);
2161 return (0);
2163 ctx->ipfw_track_lastexp = time_uptime;
2165 anchor = &ctx->ipfw_trackexp_anch;
2166 TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2167 return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2170 static void
2171 ipfw_track_expire_more_dispatch(netmsg_t nm)
2173 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2174 struct ipfw_track *anchor;
2176 ASSERT_NETISR_NCPUS(mycpuid);
2177 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2178 ("trackexp is not in progress"));
2180 /* Reply ASAP */
2181 netisr_replymsg(&nm->base, 0);
2183 anchor = &ctx->ipfw_trackexp_anch;
2184 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2185 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2186 ipfw_track_expire_done(ctx);
2187 return;
2189 ipfw_track_expire_loop(ctx, anchor,
2190 ipfw_track_scan_max, ipfw_track_expire_max);
2193 static void
2194 ipfw_track_expire_dispatch(netmsg_t nm)
2196 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2198 ASSERT_NETISR_NCPUS(mycpuid);
2200 /* Reply ASAP */
2201 crit_enter();
2202 netisr_replymsg(&nm->base, 0);
2203 crit_exit();
2205 if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2206 /* Running; done. */
2207 return;
2209 ipfw_track_expire_start(ctx,
2210 ipfw_track_scan_max, ipfw_track_expire_max);
2213 static void
2214 ipfw_track_expire_ipifunc(void *dummy __unused)
2216 struct netmsg_base *msg;
2218 KKASSERT(mycpuid < netisr_ncpus);
2219 msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
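/*
 * Runs from the expire callout or an IPI sent by another CPU; only
 * (re)send the per-CPU track expire netmsg if the previous one has
 * completed (MSGF_DONE), so at most one is ever in flight.
 */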
2221 crit_enter();
2222 if (msg->lmsg.ms_flags & MSGF_DONE)
2223 netisr_sendmsg_oncpu(msg);
2224 crit_exit();
2227 static int
2228 ipfw_track_reap(struct ipfw_context *ctx)
2230 struct ipfw_track *t, *anchor;
2231 int expired;
2233 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2235 * Kick start track expiring. Ignore scan limit,
2236 * we are short of tracks.
2238 ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2239 expired = ipfw_track_expire_start(ctx, INT_MAX,
2240 ipfw_track_reap_max);
2241 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2242 return (expired);
2246 * Tracks are being expired.
2249 if (RB_EMPTY(&ctx->ipfw_track_tree))
2250 return (0);
2252 expired = 0;
2253 anchor = &ctx->ipfw_trackexp_anch;
2254 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2256 * Ignore scan limit; we are short of tracks.
2259 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2260 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2262 if (t->t_count == NULL) /* anchor */
2263 continue;
2265 ipfw_track_state_expire(ctx, t, TRUE);
2266 if (!LIST_EMPTY(&t->t_state_list)) {
2267 /* There are states referencing this track. */
2268 continue;
2271 if (ipfw_track_free(ctx, t)) {
2272 if (++expired >= ipfw_track_reap_max) {
2273 ipfw_track_expire_more(ctx);
2274 break;
2279 * NOTE:
2280 * Leave the anchor on the list, even if the end of the list has
2281 * been reached. ipfw_track_expire_more_dispatch() will handle
2282 * the removal.
2284 return (expired);
2287 static struct ipfw_track *
2288 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2289 uint16_t limit_mask, struct ip_fw *rule)
2291 struct ipfw_track *key, *t, *dup;
2292 struct ipfw_trkcnt *trk, *ret;
2293 boolean_t do_expire = FALSE;
2295 KASSERT(rule->track_ruleid != 0,
2296 ("rule %u has no track ruleid", rule->rulenum));
2298 key = &ctx->ipfw_track_tmpkey;
2299 key->t_proto = id->proto;
2300 key->t_addrs = 0;
2301 key->t_ports = 0;
2302 key->t_rule = rule;
2303 if (limit_mask & DYN_SRC_ADDR)
2304 key->t_saddr = id->src_ip;
2305 if (limit_mask & DYN_DST_ADDR)
2306 key->t_daddr = id->dst_ip;
2307 if (limit_mask & DYN_SRC_PORT)
2308 key->t_sport = id->src_port;
2309 if (limit_mask & DYN_DST_PORT)
2310 key->t_dport = id->dst_port;
2312 t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2313 if (t != NULL)
2314 goto done;
2316 t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2317 if (t == NULL) {
2318 ctx->ipfw_tks_nomem++;
2319 return (NULL);
2322 t->t_key = key->t_key;
2323 t->t_rule = rule;
2324 t->t_lastexp = 0;
2325 LIST_INIT(&t->t_state_list);
2327 if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2328 time_t globexp, uptime;
2330 trk = NULL;
2331 do_expire = TRUE;
2334 * Do not expire globally more than once per second,
2335 * it is useless.
2337 uptime = time_uptime;
2338 globexp = ipfw_gd.ipfw_track_globexp;
2339 if (globexp != uptime &&
2340 atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2341 globexp, uptime)) {
2342 int cpu;
2344 /* Expire tracks on other CPUs. */
2345 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2346 if (cpu == mycpuid)
2347 continue;
2348 lwkt_send_ipiq(globaldata_find(cpu),
2349 ipfw_track_expire_ipifunc, NULL);
2352 } else {
2353 trk = ipfw_trkcnt_alloc(ctx);
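/*
 * No trkcnt was allocated, either because the global track limit has
 * been reached or because the allocation failed.  Look for a trkcnt
 * already installed by another CPU for the same flow and rule and
 * share it; if none exists and the limit was hit, try to reap tracks
 * before giving up.
 */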
2355 if (trk == NULL) {
2356 struct ipfw_trkcnt *tkey;
2358 tkey = &ctx->ipfw_trkcnt_tmpkey;
2359 key = NULL; /* tkey overlaps key */
2361 tkey->tc_key = t->t_key;
2362 tkey->tc_ruleid = rule->track_ruleid;
2364 IPFW_TRKCNT_TOKGET;
2365 trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2366 tkey);
2367 if (trk == NULL) {
2368 IPFW_TRKCNT_TOKREL;
2369 if (do_expire) {
2370 ctx->ipfw_tks_reap++;
2371 if (ipfw_track_reap(ctx) > 0) {
2372 if (ipfw_gd.ipfw_trkcnt_cnt <
2373 ipfw_track_max) {
2374 trk = ipfw_trkcnt_alloc(ctx);
2375 if (trk != NULL)
2376 goto install;
2377 ctx->ipfw_tks_cntnomem++;
2378 } else {
2379 ctx->ipfw_tks_overflow++;
2381 } else {
2382 ctx->ipfw_tks_reapfailed++;
2383 ctx->ipfw_tks_overflow++;
2385 } else {
2386 ctx->ipfw_tks_cntnomem++;
2388 kfree(t, M_IPFW);
2389 return (NULL);
2391 KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2392 ("invalid trkcnt refs %d", trk->tc_refs));
2393 atomic_add_int(&trk->tc_refs, 1);
2394 IPFW_TRKCNT_TOKREL;
2395 } else {
2396 install:
2397 trk->tc_key = t->t_key;
2398 trk->tc_ruleid = rule->track_ruleid;
2399 trk->tc_refs = 0;
2400 trk->tc_count = 0;
2401 trk->tc_expire = 0;
2402 trk->tc_rulenum = rule->rulenum;
2404 IPFW_TRKCNT_TOKGET;
2405 ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2406 trk);
2407 if (ret != NULL) {
2408 KASSERT(ret->tc_refs > 0 &&
2409 ret->tc_refs < netisr_ncpus,
2410 ("invalid trkcnt refs %d", ret->tc_refs));
2411 KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2412 ("trkcnt spare was installed"));
2413 ctx->ipfw_trkcnt_spare = trk;
2414 trk = ret;
2415 } else {
2416 ipfw_gd.ipfw_trkcnt_cnt++;
2418 atomic_add_int(&trk->tc_refs, 1);
2419 IPFW_TRKCNT_TOKREL;
2421 t->t_count = &trk->tc_count;
2422 t->t_trkcnt = trk;
2424 dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2425 if (dup != NULL)
2426 panic("ipfw: track exists");
2427 TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2428 done:
2429 t->t_expire = time_uptime + dyn_short_lifetime;
2430 return (t);
2434 * Install state for rule type cmd->o.opcode
2436 * Returns 1 (failure) if the state is not installed because of errors or
2437 * because the state limits have been reached.
2439 static int
2440 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2441 ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2443 struct ipfw_state *s;
2444 struct ipfw_track *t;
2445 int count, diff;
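/*
 * The global loose state count is only updated in batches and may
 * overshoot; once it reaches the limit, re-synchronize the count via
 * ipfw_state_cntsync() before deciding whether states must be reaped.
 */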
2447 if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2448 (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2449 boolean_t overflow = TRUE;
2451 ctx->ipfw_sts_reap++;
2452 if (ipfw_state_reap(ctx, diff) == 0)
2453 ctx->ipfw_sts_reapfailed++;
2454 if (ipfw_state_cntsync() < ipfw_state_max)
2455 overflow = FALSE;
2457 if (overflow) {
2458 time_t globexp, uptime;
2459 int cpu;
2462 * Do not expire globally more than once per second,
2463 * it is useless.
2465 uptime = time_uptime;
2466 globexp = ipfw_gd.ipfw_state_globexp;
2467 if (globexp == uptime ||
2468 !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2469 globexp, uptime)) {
2470 ctx->ipfw_sts_overflow++;
2471 return (1);
2474 /* Expire states on other CPUs. */
2475 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2476 if (cpu == mycpuid)
2477 continue;
2478 lwkt_send_ipiq(globaldata_find(cpu),
2479 ipfw_state_expire_ipifunc, NULL);
2481 ctx->ipfw_sts_overflow++;
2482 return (1);
2486 switch (cmd->o.opcode) {
2487 case O_KEEP_STATE: /* bidir rule */
2488 s = ipfw_state_add(ctx, &args->f_id, O_KEEP_STATE, rule, NULL,
2489 tcp);
2490 if (s == NULL)
2491 return (1);
2492 break;
2494 case O_LIMIT: /* limit number of sessions */
2495 t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2496 if (t == NULL)
2497 return (1);
2499 if (*t->t_count >= cmd->conn_limit) {
2500 if (!ipfw_track_state_expire(ctx, t, TRUE))
2501 return (1);
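/*
 * Atomically reserve a slot under conn_limit; t_count points at the
 * trkcnt counter shared by all CPUs tracking this limit.
 */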
2503 for (;;) {
2504 count = *t->t_count;
2505 if (count >= cmd->conn_limit)
2506 return (1);
2507 if (atomic_cmpset_int(t->t_count, count, count + 1))
2508 break;
2511 s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2512 if (s == NULL) {
2513 /* Undo damage. */
2514 atomic_subtract_int(t->t_count, 1);
2515 return (1);
2517 break;
2519 default:
2520 panic("unknown state type %u\n", cmd->o.opcode);
2522 return (0);
2525 static int
2526 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2527 const struct in_addr *in)
2529 struct radix_node_head *rnh;
2530 struct sockaddr_in sin;
2531 struct ipfw_tblent *te;
2533 KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2534 rnh = ctx->ipfw_tables[tableid];
2535 if (rnh == NULL)
2536 return (0); /* no match */
2538 memset(&sin, 0, sizeof(sin));
2539 sin.sin_family = AF_INET;
2540 sin.sin_len = sizeof(sin);
2541 sin.sin_addr = *in;
2543 te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2544 if (te == NULL)
2545 return (0); /* no match */
2547 te->te_use++;
2548 te->te_lastuse = time_second;
2549 return (1); /* match */
2553 * Transmit a TCP packet, containing either a RST or a keepalive.
2554 * When flags & TH_RST, we are sending a RST packet because a
2555 * "reset" action matched the packet.
2556 * Otherwise we are sending a keepalive, and flags & TH_SYN selects the direction (forward if set, reverse if clear).
2558 * Only {src,dst}_{ip,port} of "id" are used.
2560 static void
2561 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2563 struct mbuf *m;
2564 struct ip *ip;
2565 struct tcphdr *tcp;
2566 struct route sro; /* fake route */
2568 MGETHDR(m, M_NOWAIT, MT_HEADER);
2569 if (m == NULL)
2570 return;
2571 m->m_pkthdr.rcvif = NULL;
2572 m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2573 m->m_data += max_linkhdr;
2575 ip = mtod(m, struct ip *);
2576 bzero(ip, m->m_len);
2577 tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2578 ip->ip_p = IPPROTO_TCP;
2579 tcp->th_off = 5;
2582 * Assume we are sending a RST (or a keepalive in the reverse
2583 * direction), swap src and destination addresses and ports.
2585 ip->ip_src.s_addr = htonl(id->dst_ip);
2586 ip->ip_dst.s_addr = htonl(id->src_ip);
2587 tcp->th_sport = htons(id->dst_port);
2588 tcp->th_dport = htons(id->src_port);
2589 if (flags & TH_RST) { /* we are sending a RST */
2590 if (flags & TH_ACK) {
2591 tcp->th_seq = htonl(ack);
2592 tcp->th_ack = htonl(0);
2593 tcp->th_flags = TH_RST;
2594 } else {
2595 if (flags & TH_SYN)
2596 seq++;
2597 tcp->th_seq = htonl(0);
2598 tcp->th_ack = htonl(seq);
2599 tcp->th_flags = TH_RST | TH_ACK;
2601 } else {
2603 * We are sending a keepalive. flags & TH_SYN determines
2604 * the direction, forward if set, reverse if clear.
2605 * NOTE: seq and ack are always assumed to be correct
2606 * as set by the caller. This may be confusing...
2608 if (flags & TH_SYN) {
2610 * we have to rewrite the correct addresses!
2612 ip->ip_dst.s_addr = htonl(id->dst_ip);
2613 ip->ip_src.s_addr = htonl(id->src_ip);
2614 tcp->th_dport = htons(id->dst_port);
2615 tcp->th_sport = htons(id->src_port);
2617 tcp->th_seq = htonl(seq);
2618 tcp->th_ack = htonl(ack);
2619 tcp->th_flags = TH_ACK;
2623 * set ip_len to the payload size so we can compute
2624 * the tcp checksum on the pseudoheader
2625 * XXX check this, could save a couple of words ?
2627 ip->ip_len = htons(sizeof(struct tcphdr));
2628 tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2631 * now fill fields left out earlier
2633 ip->ip_ttl = ip_defttl;
2634 ip->ip_len = m->m_pkthdr.len;
2636 bzero(&sro, sizeof(sro));
2637 ip_rtaddr(ip->ip_dst, &sro);
2639 m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2640 ip_output(m, NULL, &sro, 0, NULL, NULL);
2641 if (sro.ro_rt)
2642 RTFREE(sro.ro_rt);
2646 * Send a reject message, consuming the mbuf passed as an argument.
2648 static void
2649 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2651 if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2652 /* We need the IP header in host order for icmp_error(). */
2653 if (args->eh != NULL) {
2654 struct ip *ip = mtod(args->m, struct ip *);
2656 ip->ip_len = ntohs(ip->ip_len);
2657 ip->ip_off = ntohs(ip->ip_off);
2659 icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2660 } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2661 struct tcphdr *const tcp =
2662 L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2664 if ((tcp->th_flags & TH_RST) == 0) {
2665 send_pkt(&args->f_id, ntohl(tcp->th_seq),
2666 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2668 m_freem(args->m);
2669 } else {
2670 m_freem(args->m);
2672 args->m = NULL;
2676 * Given an ip_fw *, lookup_next_rule will return a pointer
2677 * to the next rule, which can be either the jump
2678 * target (for skipto instructions) or the next one in the list (in
2679 * all other cases including a missing jump target).
2680 * The result is also written in the "next_rule" field of the rule.
2681 * Backward jumps are not allowed, so start looking from the next
2682 * rule...
2684 * This never returns NULL -- in case we do not have an exact match,
2685 * the next rule is returned. When the ruleset is changed,
2686 * pointers are flushed so we are always correct.
2688 static struct ip_fw *
2689 lookup_next_rule(struct ip_fw *me)
2691 struct ip_fw *rule = NULL;
2692 ipfw_insn *cmd;
2694 /* look for action, in case it is a skipto */
2695 cmd = ACTION_PTR(me);
2696 if (cmd->opcode == O_LOG)
2697 cmd += F_LEN(cmd);
2698 if (cmd->opcode == O_SKIPTO) {
2699 for (rule = me->next; rule; rule = rule->next) {
2700 if (rule->rulenum >= cmd->arg1)
2701 break;
2704 if (rule == NULL) /* failure or not a skipto */
2705 rule = me->next;
2706 me->next_rule = rule;
2707 return rule;
2710 static int
2711 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
2712 enum ipfw_opcodes opcode, uid_t uid)
2714 struct in_addr src_ip, dst_ip;
2715 struct inpcbinfo *pi;
2716 boolean_t wildcard;
2717 struct inpcb *pcb;
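/*
 * TCP connections are fully specified, so an exact PCB lookup is
 * used; UDP sockets may be unconnected, so wildcard matches are
 * allowed for them.
 */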
2719 if (fid->proto == IPPROTO_TCP) {
2720 wildcard = FALSE;
2721 pi = &tcbinfo[mycpuid];
2722 } else if (fid->proto == IPPROTO_UDP) {
2723 wildcard = TRUE;
2724 pi = &udbinfo[mycpuid];
2725 } else {
2726 return 0;
2730 * Values in 'fid' are in host byte order
2732 dst_ip.s_addr = htonl(fid->dst_ip);
2733 src_ip.s_addr = htonl(fid->src_ip);
2734 if (oif) {
2735 pcb = in_pcblookup_hash(pi,
2736 dst_ip, htons(fid->dst_port),
2737 src_ip, htons(fid->src_port),
2738 wildcard, oif);
2739 } else {
2740 pcb = in_pcblookup_hash(pi,
2741 src_ip, htons(fid->src_port),
2742 dst_ip, htons(fid->dst_port),
2743 wildcard, NULL);
2745 if (pcb == NULL || pcb->inp_socket == NULL)
2746 return 0;
2748 if (opcode == O_UID) {
2749 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
2750 return !socheckuid(pcb->inp_socket, uid);
2751 #undef socheckuid
2752 } else {
2753 return groupmember(uid, pcb->inp_socket->so_cred);
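/*
 * Match the given address against the IPv4 address (or, with
 * IPFW_IFIP_NET, the network) of the named interface.  The interface
 * address and mask are resolved once and cached in the instruction
 * (IPFW_IFIP_VALID); subsequent matches are a simple masked compare.
 */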
2757 static __inline int
2758 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
2761 if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
2762 struct ifaddr_container *ifac;
2763 struct ifnet *ifp;
2765 ifp = ifunit_netisr(cmd->ifname);
2766 if (ifp == NULL)
2767 return (0);
2769 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2770 struct ifaddr *ia = ifac->ifa;
2772 if (ia->ifa_addr == NULL)
2773 continue;
2774 if (ia->ifa_addr->sa_family != AF_INET)
2775 continue;
2777 cmd->mask.s_addr = INADDR_ANY;
2778 if (cmd->o.arg1 & IPFW_IFIP_NET) {
2779 cmd->mask = ((struct sockaddr_in *)
2780 ia->ifa_netmask)->sin_addr;
2782 if (cmd->mask.s_addr == INADDR_ANY)
2783 cmd->mask.s_addr = INADDR_BROADCAST;
2785 cmd->addr =
2786 ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
2787 cmd->addr.s_addr &= cmd->mask.s_addr;
2789 cmd->o.arg1 |= IPFW_IFIP_VALID;
2790 break;
2792 if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
2793 return (0);
2795 return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
2798 static __inline struct mbuf *
2799 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
2800 struct ip_fw_local *local, struct ip **ip0)
2802 struct ip *ip = mtod(m, struct ip *);
2803 struct tcphdr *tcp;
2804 struct udphdr *udp;
2807 * Collect parameters into local variables for faster matching.
2809 if (hlen == 0) { /* do not grab addresses for non-ip pkts */
2810 local->proto = args->f_id.proto = 0; /* mark f_id invalid */
2811 goto done;
2814 local->proto = args->f_id.proto = ip->ip_p;
2815 local->src_ip = ip->ip_src;
2816 local->dst_ip = ip->ip_dst;
2817 if (args->eh != NULL) { /* layer 2 packets are as on the wire */
2818 local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
2819 local->ip_len = ntohs(ip->ip_len);
2820 } else {
2821 local->offset = ip->ip_off & IP_OFFMASK;
2822 local->ip_len = ip->ip_len;
2825 #define PULLUP_TO(len) \
2826 do { \
2827 if (m->m_len < (len)) { \
2828 args->m = m = m_pullup(m, (len)); \
2829 if (m == NULL) { \
2830 ip = NULL; \
2831 goto done; \
2833 ip = mtod(m, struct ip *); \
2835 } while (0)
2837 if (local->offset == 0) {
2838 switch (local->proto) {
2839 case IPPROTO_TCP:
2840 PULLUP_TO(hlen + sizeof(struct tcphdr));
2841 tcp = L3HDR(struct tcphdr, ip);
2842 local->dst_port = tcp->th_dport;
2843 local->src_port = tcp->th_sport;
2844 args->f_id.flags = tcp->th_flags;
2845 break;
2847 case IPPROTO_UDP:
2848 PULLUP_TO(hlen + sizeof(struct udphdr));
2849 udp = L3HDR(struct udphdr, ip);
2850 local->dst_port = udp->uh_dport;
2851 local->src_port = udp->uh_sport;
2852 break;
2854 case IPPROTO_ICMP:
2855 PULLUP_TO(hlen + 4); /* type, code and checksum. */
2856 args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
2857 break;
2859 default:
2860 break;
2864 #undef PULLUP_TO
2866 args->f_id.src_ip = ntohl(local->src_ip.s_addr);
2867 args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
2868 args->f_id.src_port = local->src_port = ntohs(local->src_port);
2869 args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
2870 done:
2871 *ip0 = ip;
2872 return (m);
2876 * The main check routine for the firewall.
2878 * All arguments are in args so we can modify them and return them
2879 * to the caller.
2881 * Parameters:
2883 * args->m (in/out) The packet; we set to NULL when/if we nuke it.
2884 * Starts with the IP header.
2885 * args->eh (in) Mac header if present, or NULL for layer3 packet.
2886 * args->oif Outgoing interface, or NULL if packet is incoming.
2887 * The incoming interface is in the mbuf. (in)
2889 * args->rule Pointer to the last matching rule (in/out)
2890 * args->f_id Addresses grabbed from the packet (out)
2892 * Return value:
2894 * If the packet was denied/rejected and has been dropped, *m is equal
2895 * to NULL upon return.
2897 * IP_FW_DENY the packet must be dropped.
2898 * IP_FW_PASS The packet is to be accepted and routed normally.
2899 * IP_FW_DIVERT Divert the packet to port (args->cookie)
2900 * IP_FW_TEE Tee the packet to port (args->cookie)
2901 * IP_FW_DUMMYNET Send the packet to pipe/queue (args->cookie)
2902 * IP_FW_CONTINUE Continue processing on another cpu.
2904 static int
2905 ipfw_chk(struct ip_fw_args *args)
2908 * Local variables hold state during the processing of a packet.
2910 * IMPORTANT NOTE: to speed up the processing of rules, there
2911 * are some assumption on the values of the variables, which
2912 * are documented here. Should you change them, please check
2913 * the implementation of the various instructions to make sure
2914 * that they still work.
2916 * args->eh The MAC header. It is non-NULL for a layer-2
2917 * packet and NULL for a layer-3 packet.
2919 * m | args->m Pointer to the mbuf, as received from the caller.
2920 * It may change if ipfw_chk() does an m_pullup, or if it
2921 * consumes the packet because it calls send_reject().
2922 * XXX This has to change, so that ipfw_chk() never modifies
2923 * or consumes the buffer.
2924 * ip is simply an alias of the value of m, and it is kept
2925 * in sync with it (the packet is supposed to start with
2926 * the ip header).
2928 struct mbuf *m = args->m;
2929 struct ip *ip = mtod(m, struct ip *);
2932 * oif | args->oif If NULL, ipfw_chk has been called on the
2933 * inbound path (ether_input, ip_input).
2934 * If non-NULL, ipfw_chk has been called on the outbound path
2935 * (ether_output, ip_output).
2937 struct ifnet *oif = args->oif;
2939 struct ip_fw *f = NULL; /* matching rule */
2940 int retval = IP_FW_PASS;
2941 struct m_tag *mtag;
2942 struct divert_info *divinfo;
2945 * hlen The length of the IPv4 header.
2946 * hlen >0 means we have an IPv4 packet.
2948 u_int hlen = 0; /* hlen >0 means we have an IP pkt */
2950 struct ip_fw_local lc;
2953 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
2954 * MATCH_NONE when checked and not matched (dyn_f = NULL),
2955 * MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
2957 int dyn_dir = MATCH_UNKNOWN;
2958 struct ip_fw *dyn_f = NULL;
2959 int cpuid = mycpuid;
2960 struct ipfw_context *ctx;
2962 ASSERT_NETISR_NCPUS(cpuid);
2963 ctx = ipfw_ctx[cpuid];
2965 if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
2966 return IP_FW_PASS; /* accept */
2968 if (args->eh == NULL || /* layer 3 packet */
2969 (m->m_pkthdr.len >= sizeof(struct ip) &&
2970 ntohs(args->eh->ether_type) == ETHERTYPE_IP))
2971 hlen = ip->ip_hl << 2;
2973 memset(&lc, 0, sizeof(lc));
2975 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
2976 if (m == NULL)
2977 goto pullup_failed;
2979 if (args->rule) {
2981 * Packet has already been tagged. Look for the next rule
2982 * to restart processing.
2984 * If fw_one_pass != 0 then just accept it.
2985 * XXX should not happen here, but optimized out in
2986 * the caller.
2988 if (fw_one_pass && !args->cont)
2989 return IP_FW_PASS;
2990 args->cont = 0;
2992 /* This rule is being/has been flushed */
2993 if (ipfw_flushing)
2994 return IP_FW_DENY;
2996 KASSERT(args->rule->cpuid == cpuid,
2997 ("rule used on cpu%d", cpuid));
2999 /* This rule was deleted */
3000 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3001 return IP_FW_DENY;
3003 f = args->rule->next_rule;
3004 if (f == NULL)
3005 f = lookup_next_rule(args->rule);
3006 } else {
3008 * Find the starting rule. It can be either the first
3009 * one, or the one after divert_rule if asked so.
3011 int skipto;
3013 KKASSERT(!args->cont);
3015 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3016 if (mtag != NULL) {
3017 divinfo = m_tag_data(mtag);
3018 skipto = divinfo->skipto;
3019 } else {
3020 skipto = 0;
3023 f = ctx->ipfw_layer3_chain;
3024 if (args->eh == NULL && skipto != 0) {
3025 /* No skipto during rule flushing */
3026 if (ipfw_flushing)
3027 return IP_FW_DENY;
3029 if (skipto >= IPFW_DEFAULT_RULE)
3030 return IP_FW_DENY; /* invalid */
3032 while (f && f->rulenum <= skipto)
3033 f = f->next;
3034 if (f == NULL) /* drop packet */
3035 return IP_FW_DENY;
3036 } else if (ipfw_flushing) {
3037 /* Rules are being flushed; skip to default rule */
3038 f = ctx->ipfw_default_rule;
3041 if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3042 m_tag_delete(m, mtag);
3045 * Now scan the rules, and parse microinstructions for each rule.
3047 for (; f; f = f->next) {
3048 int l, cmdlen;
3049 ipfw_insn *cmd;
3050 int skip_or; /* skip rest of OR block */
3052 again:
3053 if (ctx->ipfw_set_disable & (1 << f->set))
3054 continue;
3056 skip_or = 0;
3057 for (l = f->cmd_len, cmd = f->cmd; l > 0;
3058 l -= cmdlen, cmd += cmdlen) {
3059 int match;
3062 * check_body is a jump target used when we find a
3063 * CHECK_STATE, and need to jump to the body of
3064 * the target rule.
3067 check_body:
3068 cmdlen = F_LEN(cmd);
3070 * An OR block (insn_1 || .. || insn_n) has the
3071 * F_OR bit set in all but the last instruction.
3072 * The first match will set "skip_or", and cause
3073 * the following instructions to be skipped until
3074 * past the one with the F_OR bit clear.
3076 if (skip_or) { /* skip this instruction */
3077 if ((cmd->len & F_OR) == 0)
3078 skip_or = 0; /* next one is good */
3079 continue;
3081 match = 0; /* set to 1 if we succeed */
3083 switch (cmd->opcode) {
3085 * The first set of opcodes compares the packet's
3086 * fields with some pattern, setting 'match' if a
3087 * match is found. At the end of the loop there is
3088 * logic to deal with F_NOT and F_OR flags associated
3089 * with the opcode.
3091 case O_NOP:
3092 match = 1;
3093 break;
3095 case O_FORWARD_MAC:
3096 kprintf("ipfw: opcode %d unimplemented\n",
3097 cmd->opcode);
3098 break;
3100 case O_GID:
3101 case O_UID:
3103 * We only check offset == 0 && proto != 0,
3104 * as this ensures that we have an IPv4
3105 * packet with the ports info.
3107 if (lc.offset != 0)
3108 break;
3110 match = ipfw_match_uid(&args->f_id, oif,
3111 cmd->opcode,
3112 (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3113 break;
3115 case O_RECV:
3116 match = iface_match(m->m_pkthdr.rcvif,
3117 (ipfw_insn_if *)cmd);
3118 break;
3120 case O_XMIT:
3121 match = iface_match(oif, (ipfw_insn_if *)cmd);
3122 break;
3124 case O_VIA:
3125 match = iface_match(oif ? oif :
3126 m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3127 break;
3129 case O_MACADDR2:
3130 if (args->eh != NULL) { /* have MAC header */
3131 uint32_t *want = (uint32_t *)
3132 ((ipfw_insn_mac *)cmd)->addr;
3133 uint32_t *mask = (uint32_t *)
3134 ((ipfw_insn_mac *)cmd)->mask;
3135 uint32_t *hdr = (uint32_t *)args->eh;
3137 match =
3138 (want[0] == (hdr[0] & mask[0]) &&
3139 want[1] == (hdr[1] & mask[1]) &&
3140 want[2] == (hdr[2] & mask[2]));
3142 break;
3144 case O_MAC_TYPE:
3145 if (args->eh != NULL) {
3146 uint16_t t =
3147 ntohs(args->eh->ether_type);
3148 uint16_t *p =
3149 ((ipfw_insn_u16 *)cmd)->ports;
3150 int i;
3152 /* Special vlan handling */
3153 if (m->m_flags & M_VLANTAG)
3154 t = ETHERTYPE_VLAN;
3156 for (i = cmdlen - 1; !match && i > 0;
3157 i--, p += 2) {
3158 match =
3159 (t >= p[0] && t <= p[1]);
3162 break;
3164 case O_FRAG:
3165 match = (hlen > 0 && lc.offset != 0);
3166 break;
3168 case O_IPFRAG:
3169 if (hlen > 0) {
3170 uint16_t off;
3172 if (args->eh != NULL)
3173 off = ntohs(ip->ip_off);
3174 else
3175 off = ip->ip_off;
3176 if (off & (IP_MF | IP_OFFMASK))
3177 match = 1;
3179 break;
3181 case O_IN: /* "out" is "not in" */
3182 match = (oif == NULL);
3183 break;
3185 case O_LAYER2:
3186 match = (args->eh != NULL);
3187 break;
3189 case O_PROTO:
3191 * We do not allow an arg of 0, so checking
3192 * "proto" alone suffices.
3194 match = (lc.proto == cmd->arg1);
3195 break;
3197 case O_IP_SRC:
3198 match = (hlen > 0 &&
3199 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3200 lc.src_ip.s_addr);
3201 break;
3203 case O_IP_SRC_MASK:
3204 match = (hlen > 0 &&
3205 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3206 (lc.src_ip.s_addr &
3207 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3208 break;
3210 case O_IP_SRC_ME:
3211 if (hlen > 0) {
3212 struct ifnet *tif;
3214 tif = INADDR_TO_IFP(&lc.src_ip);
3215 match = (tif != NULL);
3217 break;
3219 case O_IP_SRC_TABLE:
3220 match = ipfw_table_lookup(ctx, cmd->arg1,
3221 &lc.src_ip);
3222 break;
3224 case O_IP_SRC_IFIP:
3225 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3226 &lc.src_ip);
3227 break;
3229 case O_IP_DST_SET:
3230 case O_IP_SRC_SET:
3231 if (hlen > 0) {
3232 uint32_t *d = (uint32_t *)(cmd + 1);
3233 uint32_t addr =
3234 cmd->opcode == O_IP_DST_SET ?
3235 args->f_id.dst_ip :
3236 args->f_id.src_ip;
3238 if (addr < d[0])
3239 break;
3240 addr -= d[0]; /* subtract base */
3241 match =
3242 (addr < cmd->arg1) &&
3243 (d[1 + (addr >> 5)] &
3244 (1 << (addr & 0x1f)));
3246 break;
3248 case O_IP_DST:
3249 match = (hlen > 0 &&
3250 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3251 lc.dst_ip.s_addr);
3252 break;
3254 case O_IP_DST_MASK:
3255 match = (hlen > 0) &&
3256 (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3257 (lc.dst_ip.s_addr &
3258 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3259 break;
3261 case O_IP_DST_ME:
3262 if (hlen > 0) {
3263 struct ifnet *tif;
3265 tif = INADDR_TO_IFP(&lc.dst_ip);
3266 match = (tif != NULL);
3268 break;
3270 case O_IP_DST_TABLE:
3271 match = ipfw_table_lookup(ctx, cmd->arg1,
3272 &lc.dst_ip);
3273 break;
3275 case O_IP_DST_IFIP:
3276 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3277 &lc.dst_ip);
3278 break;
3280 case O_IP_SRCPORT:
3281 case O_IP_DSTPORT:
3283 * offset == 0 && proto != 0 is enough
3284 * to guarantee that we have an IPv4
3285 * packet with port info.
3287 if ((lc.proto == IPPROTO_UDP ||
3288 lc.proto == IPPROTO_TCP)
3289 && lc.offset == 0) {
3290 uint16_t x =
3291 (cmd->opcode == O_IP_SRCPORT) ?
3292 lc.src_port : lc.dst_port;
3293 uint16_t *p =
3294 ((ipfw_insn_u16 *)cmd)->ports;
3295 int i;
3297 for (i = cmdlen - 1; !match && i > 0;
3298 i--, p += 2) {
3299 match =
3300 (x >= p[0] && x <= p[1]);
3303 break;
3305 case O_ICMPTYPE:
3306 match = (lc.offset == 0 &&
3307 lc.proto == IPPROTO_ICMP &&
3308 icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3309 break;
3311 case O_IPOPT:
3312 match = (hlen > 0 && ipopts_match(ip, cmd));
3313 break;
3315 case O_IPVER:
3316 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3317 break;
3319 case O_IPTTL:
3320 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3321 break;
3323 case O_IPID:
3324 match = (hlen > 0 &&
3325 cmd->arg1 == ntohs(ip->ip_id));
3326 break;
3328 case O_IPLEN:
3329 match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3330 break;
3332 case O_IPPRECEDENCE:
3333 match = (hlen > 0 &&
3334 (cmd->arg1 == (ip->ip_tos & 0xe0)));
3335 break;
3337 case O_IPTOS:
3338 match = (hlen > 0 &&
3339 flags_match(cmd, ip->ip_tos));
3340 break;
3342 case O_TCPFLAGS:
3343 match = (lc.proto == IPPROTO_TCP &&
3344 lc.offset == 0 &&
3345 flags_match(cmd,
3346 L3HDR(struct tcphdr,ip)->th_flags));
3347 break;
3349 case O_TCPOPTS:
3350 match = (lc.proto == IPPROTO_TCP &&
3351 lc.offset == 0 && tcpopts_match(ip, cmd));
3352 break;
3354 case O_TCPSEQ:
3355 match = (lc.proto == IPPROTO_TCP &&
3356 lc.offset == 0 &&
3357 ((ipfw_insn_u32 *)cmd)->d[0] ==
3358 L3HDR(struct tcphdr,ip)->th_seq);
3359 break;
3361 case O_TCPACK:
3362 match = (lc.proto == IPPROTO_TCP &&
3363 lc.offset == 0 &&
3364 ((ipfw_insn_u32 *)cmd)->d[0] ==
3365 L3HDR(struct tcphdr,ip)->th_ack);
3366 break;
3368 case O_TCPWIN:
3369 match = (lc.proto == IPPROTO_TCP &&
3370 lc.offset == 0 &&
3371 cmd->arg1 ==
3372 L3HDR(struct tcphdr,ip)->th_win);
3373 break;
3375 case O_ESTAB:
3376 /* reject packets which have SYN only */
3377 /* XXX should i also check for TH_ACK ? */
3378 match = (lc.proto == IPPROTO_TCP &&
3379 lc.offset == 0 &&
3380 (L3HDR(struct tcphdr,ip)->th_flags &
3381 (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3382 break;
3384 case O_LOG:
3385 if (fw_verbose) {
3386 ipfw_log(ctx, f, hlen, args->eh, m,
3387 oif);
3389 match = 1;
3390 break;
3392 case O_PROB:
3393 match = (krandom() <
3394 ((ipfw_insn_u32 *)cmd)->d[0]);
3395 break;
3398 * The second set of opcodes represents 'actions',
3399 * i.e. the terminal part of a rule once the packet
3400 * matches all previous patterns.
3401 * Typically there is only one action for each rule,
3402 * and the opcode is stored at the end of the rule
3403 * (but there are exceptions -- see below).
3405 * In general, here we set retval and terminate the
3406 * outer loop (would be a 'break 3' in some language,
3407 * but we need to do a 'goto done').
3409 * Exceptions:
3410 * O_COUNT and O_SKIPTO actions:
3411 * instead of terminating, we jump to the next rule
3412 * ('goto next_rule', equivalent to a 'break 2'),
3413 * or to the SKIPTO target ('goto again' after
3414 * having set f, cmd and l), respectively.
3416 * O_LIMIT and O_KEEP_STATE: these opcodes are
3417 * not real 'actions', and are stored right
3418 * before the 'action' part of the rule.
3419 * These opcodes try to install an entry in the
3420 * state tables; if successful, we continue with
3421 * the next opcode (match=1; break;), otherwise
3422 * the packet must be dropped ('goto done' after
3423 * setting retval). If static rules are changed
3424 * during the state installation, the packet will
3425 * be dropped and rule's stats will not be updated
3426 * ('return IP_FW_DENY').
3428 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
3429 * cause a lookup of the state table, and a jump
3430 * to the 'action' part of the parent rule
3431 * ('goto check_body') if an entry is found, or
3432 * (CHECK_STATE only) a jump to the next rule if
3433 * the entry is not found ('goto next_rule').
3434 * The result of the lookup is cached so that
3435 * further instances of these opcodes are
3436 * effectively NOPs. If static rules are changed
3437 * during the state lookup, the packet will
3438 * be dropped and rule's stats will not be updated
3439 * ('return IP_FW_DENY').
3441 case O_LIMIT:
3442 case O_KEEP_STATE:
3443 if (ipfw_state_install(ctx, f,
3444 (ipfw_insn_limit *)cmd, args,
3445 (lc.offset == 0 &&
3446 lc.proto == IPPROTO_TCP) ?
3447 L3HDR(struct tcphdr, ip) : NULL)) {
3448 retval = IP_FW_DENY;
3449 goto done; /* error/limit violation */
3451 match = 1;
3452 break;
3454 case O_PROBE_STATE:
3455 case O_CHECK_STATE:
3457 * States are checked at the first keep-state or
3458 * check-state occurrence, with the result
3459 * being stored in dyn_dir. The compiler
3460 * introduces a PROBE_STATE instruction for
3461 * us when we have a KEEP_STATE/LIMIT (because
3462 * PROBE_STATE needs to be run first).
3464 if (dyn_dir == MATCH_UNKNOWN) {
3465 dyn_f = ipfw_state_lookup_rule(ctx,
3466 &args->f_id, &dyn_dir,
3467 (lc.offset == 0 &&
3468 lc.proto == IPPROTO_TCP) ?
3469 L3HDR(struct tcphdr, ip) : NULL,
3470 lc.ip_len);
3471 if (dyn_f != NULL) {
3473 * Found a rule from a state;
3474 * jump to the 'action' part
3475 * of the rule.
3477 f = dyn_f;
3478 cmd = ACTION_PTR(f);
3479 l = f->cmd_len - f->act_ofs;
3480 goto check_body;
3484 * State not found. If CHECK_STATE, skip to the
3485 * next rule; if PROBE_STATE, just ignore it and
3486 * continue with the next opcode.
3488 if (cmd->opcode == O_CHECK_STATE)
3489 goto next_rule;
3490 match = 1;
3491 break;
3493 case O_ACCEPT:
3494 retval = IP_FW_PASS; /* accept */
3495 goto done;
3497 case O_DEFRAG:
3498 if (f->cross_rules == NULL) {
3500 * This rule was not completely setup;
3501 * move on to the next rule.
3503 goto next_rule;
3507 * Don't defrag for l2 packets, output packets
3508 * or non-fragments.
3510 if (oif != NULL || args->eh != NULL ||
3511 (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
3512 goto next_rule;
3514 ctx->ipfw_frags++;
3515 m = ip_reass(m);
3516 args->m = m;
3517 if (m == NULL) {
3518 retval = IP_FW_PASS;
3519 goto done;
3521 ctx->ipfw_defraged++;
3522 KASSERT((m->m_flags & M_HASH) == 0,
3523 ("hash not cleared"));
3525 /* Update statistics */
3526 f->pcnt++;
3527 f->bcnt += lc.ip_len;
3528 f->timestamp = time_second;
3530 ip = mtod(m, struct ip *);
3531 hlen = ip->ip_hl << 2;
3532 ip->ip_len += hlen;
3534 ip->ip_len = htons(ip->ip_len);
3535 ip->ip_off = htons(ip->ip_off);
3537 ip_hashfn(&m, 0);
3538 args->m = m;
3539 if (m == NULL)
3540 goto pullup_failed;
3542 KASSERT(m->m_flags & M_HASH, ("no hash"));
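/*
 * The reassembled datagram may hash to a different CPU.  If so,
 * record this rule in args->rule and return IP_FW_CONTINUE so the
 * caller re-dispatches the packet to that CPU, where processing
 * continues with the rule following this one.
 */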
3543 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
3544 if (cpuid != mycpuid) {
3546 * NOTE:
3547 * ip_len/ip_off are in network byte
3548 * order.
3550 ctx->ipfw_defrag_remote++;
3551 args->rule = f;
3552 return (IP_FW_CONTINUE);
3555 /* 'm' might be changed by ip_hashfn(). */
3556 ip = mtod(m, struct ip *);
3557 ip->ip_len = ntohs(ip->ip_len);
3558 ip->ip_off = ntohs(ip->ip_off);
3560 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3561 if (m == NULL)
3562 goto pullup_failed;
3564 /* Move on. */
3565 goto next_rule;
3567 case O_PIPE:
3568 case O_QUEUE:
3569 args->rule = f; /* report matching rule */
3570 args->cookie = cmd->arg1;
3571 retval = IP_FW_DUMMYNET;
3572 goto done;
3574 case O_DIVERT:
3575 case O_TEE:
3576 if (args->eh) /* not on layer 2 */
3577 break;
3579 mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
3580 sizeof(*divinfo), M_INTWAIT | M_NULLOK);
3581 if (mtag == NULL) {
3582 retval = IP_FW_DENY;
3583 goto done;
3585 divinfo = m_tag_data(mtag);
3587 divinfo->skipto = f->rulenum;
3588 divinfo->port = cmd->arg1;
3589 divinfo->tee = (cmd->opcode == O_TEE);
3590 m_tag_prepend(m, mtag);
3592 args->cookie = cmd->arg1;
3593 retval = (cmd->opcode == O_DIVERT) ?
3594 IP_FW_DIVERT : IP_FW_TEE;
3595 goto done;
3597 case O_COUNT:
3598 case O_SKIPTO:
3599 f->pcnt++; /* update stats */
3600 f->bcnt += lc.ip_len;
3601 f->timestamp = time_second;
3602 if (cmd->opcode == O_COUNT)
3603 goto next_rule;
3604 /* handle skipto */
3605 if (f->next_rule == NULL)
3606 lookup_next_rule(f);
3607 f = f->next_rule;
3608 goto again;
3610 case O_REJECT:
3612 * Drop the packet and send a reject notice
3613 * if the packet is not ICMP (or is an ICMP
3614 * query), and it is not multicast/broadcast.
3616 if (hlen > 0 &&
3617 (lc.proto != IPPROTO_ICMP ||
3618 is_icmp_query(ip)) &&
3619 !(m->m_flags & (M_BCAST|M_MCAST)) &&
3620 !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
3621 send_reject(args, cmd->arg1,
3622 lc.offset, lc.ip_len);
3623 retval = IP_FW_DENY;
3624 goto done;
3626 /* FALLTHROUGH */
3627 case O_DENY:
3628 retval = IP_FW_DENY;
3629 goto done;
3631 case O_FORWARD_IP:
3632 if (args->eh) /* not valid on layer2 pkts */
3633 break;
3634 if (!dyn_f || dyn_dir == MATCH_FORWARD) {
3635 struct sockaddr_in *sin;
3637 mtag = m_tag_get(PACKET_TAG_IPFORWARD,
3638 sizeof(*sin), M_INTWAIT | M_NULLOK);
3639 if (mtag == NULL) {
3640 retval = IP_FW_DENY;
3641 goto done;
3643 sin = m_tag_data(mtag);
3645 /* Structure copy */
3646 *sin = ((ipfw_insn_sa *)cmd)->sa;
3648 m_tag_prepend(m, mtag);
3649 m->m_pkthdr.fw_flags |=
3650 IPFORWARD_MBUF_TAGGED;
3651 m->m_pkthdr.fw_flags &=
3652 ~BRIDGE_MBUF_TAGGED;
3654 retval = IP_FW_PASS;
3655 goto done;
3657 default:
3658 panic("-- unknown opcode %d", cmd->opcode);
3659 } /* end of switch() on opcodes */
3661 if (cmd->len & F_NOT)
3662 match = !match;
3664 if (match) {
3665 if (cmd->len & F_OR)
3666 skip_or = 1;
3667 } else {
3668 if (!(cmd->len & F_OR)) /* not an OR block, */
3669 break; /* try next rule */
3672 } /* end of inner for, scan opcodes */
3674 next_rule:; /* try next rule */
3676 } /* end of outer for, scan rules */
3677 kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
3678 return IP_FW_DENY;
3680 done:
3681 /* Update statistics */
3682 f->pcnt++;
3683 f->bcnt += lc.ip_len;
3684 f->timestamp = time_second;
3685 return retval;
3687 pullup_failed:
3688 if (fw_verbose)
3689 kprintf("pullup failed\n");
3690 return IP_FW_DENY;
3693 static struct mbuf *
3694 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
3696 struct m_tag *mtag;
3697 struct dn_pkt *pkt;
3698 ipfw_insn *cmd;
3699 const struct ipfw_flow_id *id;
3700 struct dn_flow_id *fid;
3702 M_ASSERTPKTHDR(m);
3704 mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
3705 M_INTWAIT | M_NULLOK);
3706 if (mtag == NULL) {
3707 m_freem(m);
3708 return (NULL);
3710 m_tag_prepend(m, mtag);
3712 pkt = m_tag_data(mtag);
3713 bzero(pkt, sizeof(*pkt));
3715 cmd = fwa->rule->cmd + fwa->rule->act_ofs;
3716 if (cmd->opcode == O_LOG)
3717 cmd += F_LEN(cmd);
3718 KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
3719 ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
3721 pkt->dn_m = m;
3722 pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
3723 pkt->ifp = fwa->oif;
3724 pkt->pipe_nr = pipe_nr;
3726 pkt->cpuid = mycpuid;
3727 pkt->msgport = netisr_curport();
3729 id = &fwa->f_id;
3730 fid = &pkt->id;
3731 fid->fid_dst_ip = id->dst_ip;
3732 fid->fid_src_ip = id->src_ip;
3733 fid->fid_dst_port = id->dst_port;
3734 fid->fid_src_port = id->src_port;
3735 fid->fid_proto = id->proto;
3736 fid->fid_flags = id->flags;
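/*
 * Hold a reference on the matching rule for as long as the packet is
 * queued in dummynet; dummynet releases it through dn_unref_priv().
 */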
3738 ipfw_ref_rule(fwa->rule);
3739 pkt->dn_priv = fwa->rule;
3740 pkt->dn_unref_priv = ipfw_unref_rule;
3742 if (cmd->opcode == O_PIPE)
3743 pkt->dn_flags |= DN_FLAGS_IS_PIPE;
3745 m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
3746 return (m);
3750 * When a rule is added/deleted, clear the next_rule pointers in all rules.
3751 * These will be reconstructed on the fly as packets are matched.
3753 static void
3754 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
3756 struct ip_fw *rule;
3758 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
3759 rule->next_rule = NULL;
3762 static __inline void
3763 ipfw_inc_static_count(struct ip_fw *rule)
3765 /* Static rule's counts are updated only on CPU0 */
3766 KKASSERT(mycpuid == 0);
3768 static_count++;
3769 static_ioc_len += IOC_RULESIZE(rule);
3772 static __inline void
3773 ipfw_dec_static_count(struct ip_fw *rule)
3775 int l = IOC_RULESIZE(rule);
3777 /* Static rule's counts are updated only on CPU0 */
3778 KKASSERT(mycpuid == 0);
3780 KASSERT(static_count > 0, ("invalid static count %u", static_count));
3781 static_count--;
3783 KASSERT(static_ioc_len >= l,
3784 ("invalid static len %u", static_ioc_len));
3785 static_ioc_len -= l;
3788 static void
3789 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
3791 if (fwmsg->sibling != NULL) {
3792 KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
3793 fwmsg->sibling->sibling = rule;
3795 fwmsg->sibling = rule;
3798 static struct ip_fw *
3799 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
3801 struct ip_fw *rule;
3803 rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
3805 rule->act_ofs = ioc_rule->act_ofs;
3806 rule->cmd_len = ioc_rule->cmd_len;
3807 rule->rulenum = ioc_rule->rulenum;
3808 rule->set = ioc_rule->set;
3809 rule->usr_flags = ioc_rule->usr_flags;
3811 bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
3813 rule->refcnt = 1;
3814 rule->cpuid = mycpuid;
3815 rule->rule_flags = rule_flags;
3817 return rule;
3820 static void
3821 ipfw_add_rule_dispatch(netmsg_t nmsg)
3823 struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
3824 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3825 struct ip_fw *rule;
3827 ASSERT_NETISR_NCPUS(mycpuid);
3829 rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);
3832 * Insert rule into the pre-determined position
3834 if (fwmsg->prev_rule != NULL) {
3835 struct ip_fw *prev, *next;
3837 prev = fwmsg->prev_rule;
3838 KKASSERT(prev->cpuid == mycpuid);
3840 next = fwmsg->next_rule;
3841 KKASSERT(next->cpuid == mycpuid);
3843 rule->next = next;
3844 prev->next = rule;
3847 * Move to the position on the next CPU
3848 * before the msg is forwarded.
3850 fwmsg->prev_rule = prev->sibling;
3851 fwmsg->next_rule = next->sibling;
3852 } else {
3853 KKASSERT(fwmsg->next_rule == NULL);
3854 rule->next = ctx->ipfw_layer3_chain;
3855 ctx->ipfw_layer3_chain = rule;
3858 /* Link rule CPU sibling */
3859 ipfw_link_sibling(fwmsg, rule);
3861 ipfw_flush_rule_ptrs(ctx);
3863 if (mycpuid == 0) {
3864 /* Statistics only need to be updated once */
3865 ipfw_inc_static_count(rule);
3867 /* Return the rule on CPU0 */
3868 nmsg->lmsg.u.ms_resultp = rule;
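/*
 * All per-CPU copies of this rule use the CPU0 rule's address as
 * their track ruleid, so they share a single entry in the global
 * trkcnt tree.
 */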
3871 if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
3872 rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;
3874 if (fwmsg->cross_rules != NULL) {
3875 /* Save rules for later use. */
3876 fwmsg->cross_rules[mycpuid] = rule;
3879 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
3882 static void
3883 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
3885 struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
3886 struct ip_fw *rule = fwmsg->sibling;
3887 int sz = sizeof(struct ip_fw *) * netisr_ncpus;
3889 ASSERT_NETISR_NCPUS(mycpuid);
3890 KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
3891 ("not crossref rule"));
3893 rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
3894 memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
3896 fwmsg->sibling = rule->sibling;
3897 netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
3901 * Add a new rule to the list. Copy the rule into a malloc'ed area,
3902 * then possibly create a rule number and add the rule to the list.
3903 * Update the rule_number in the input struct so the caller knows
3904 * it as well.
3906 static void
3907 ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
3909 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3910 struct netmsg_ipfw fwmsg;
3911 struct ip_fw *f, *prev, *rule;
3913 ASSERT_NETISR0;
3916 * If rulenum is 0, find the highest numbered rule before the
3917 * default rule, and add the auto-increment step to it.
3919 if (ioc_rule->rulenum == 0) {
3920 int step = autoinc_step;
3922 KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
3923 step <= IPFW_AUTOINC_STEP_MAX);
3926 * Locate the highest numbered rule before default
3928 for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
3929 if (f->rulenum == IPFW_DEFAULT_RULE)
3930 break;
3931 ioc_rule->rulenum = f->rulenum;
3933 if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
3934 ioc_rule->rulenum += step;
3936 KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
3937 ioc_rule->rulenum != 0,
3938 ("invalid rule num %d", ioc_rule->rulenum));
3941 * Now find the right place for the new rule in the sorted list.
3943 for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
3944 prev = f, f = f->next) {
3945 if (f->rulenum > ioc_rule->rulenum) {
3946 /* Found the location */
3947 break;
3950 KASSERT(f != NULL, ("no default rule?!"));
3953 * Duplicate the rule onto each CPU.
3954 * The rule duplicated on CPU0 will be returned.
3956 bzero(&fwmsg, sizeof(fwmsg));
3957 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
3958 ipfw_add_rule_dispatch);
3959 fwmsg.ioc_rule = ioc_rule;
3960 fwmsg.prev_rule = prev;
3961 fwmsg.next_rule = prev == NULL ? NULL : f;
3962 fwmsg.rule_flags = rule_flags;
3963 if (rule_flags & IPFW_RULE_F_CROSSREF) {
3964 fwmsg.cross_rules = kmalloc(
3965 sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
3966 M_WAITOK | M_ZERO);
3969 netisr_domsg_global(&fwmsg.base);
3970 KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);
3972 rule = fwmsg.base.lmsg.u.ms_resultp;
3973 KKASSERT(rule != NULL && rule->cpuid == mycpuid);
3975 if (fwmsg.cross_rules != NULL) {
3976 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
3977 MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
3978 fwmsg.sibling = rule;
3979 netisr_domsg_global(&fwmsg.base);
3980 KKASSERT(fwmsg.sibling == NULL);
3982 kfree(fwmsg.cross_rules, M_TEMP);
3984 #ifdef KLD_MODULE
3985 atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
3986 #endif
3989 DPRINTF("++ installed rule %d, static count now %d\n",
3990 rule->rulenum, static_count);
3994 * Free storage associated with a static rule (including derived
3995 * states/tracks).
3996 * The caller is in charge of clearing rule pointers to avoid
3997 * dangling pointers.
3998 * @return a pointer to the next entry.
3999 * Arguments are not checked, so they better be correct.
4001 static struct ip_fw *
4002 ipfw_delete_rule(struct ipfw_context *ctx,
4003 struct ip_fw *prev, struct ip_fw *rule)
4005 struct ip_fw *n;
4007 n = rule->next;
4008 if (prev == NULL)
4009 ctx->ipfw_layer3_chain = n;
4010 else
4011 prev->next = n;
4013 /* Mark the rule as invalid */
4014 rule->rule_flags |= IPFW_RULE_F_INVALID;
4015 rule->next_rule = NULL;
4016 rule->sibling = NULL;
4017 #ifdef foo
4018 /* Don't reset cpuid here; keep various assertion working */
4019 rule->cpuid = -1;
4020 #endif
4022 /* Statistics only need to be updated once */
4023 if (mycpuid == 0)
4024 ipfw_dec_static_count(rule);
4026 if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
4027 /* Try to free this rule */
4028 ipfw_free_rule(rule);
4029 } else {
4030 /* TODO: check staging area. */
4031 if (mycpuid == 0) {
4032 rule->next = ipfw_gd.ipfw_crossref_free;
4033 ipfw_gd.ipfw_crossref_free = rule;
4037 /* Return the next rule */
4038 return n;
4041 static void
4042 ipfw_flush_dispatch(netmsg_t nmsg)
4044 int kill_default = nmsg->lmsg.u.ms_result;
4045 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4046 struct ip_fw *rule;
4048 ASSERT_NETISR_NCPUS(mycpuid);
4051 * Flush states.
4053 ipfw_state_flush(ctx, NULL);
4054 KASSERT(ctx->ipfw_state_cnt == 0,
4055 ("%d pcpu states remain", ctx->ipfw_state_cnt));
4056 ctx->ipfw_state_loosecnt = 0;
4057 ctx->ipfw_state_lastexp = 0;
4060 * Flush tracks.
4062 ipfw_track_flush(ctx, NULL);
4063 ctx->ipfw_track_lastexp = 0;
4064 if (ctx->ipfw_trkcnt_spare != NULL) {
4065 kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
4066 ctx->ipfw_trkcnt_spare = NULL;
4069 ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */
4071 while ((rule = ctx->ipfw_layer3_chain) != NULL &&
4072 (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
4073 ipfw_delete_rule(ctx, NULL, rule);
4075 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4079 * Deletes all rules from a chain (including the default rule
4080 * if the second argument is set).
4082 static void
4083 ipfw_flush(int kill_default)
4085 struct netmsg_base nmsg;
4086 #ifdef INVARIANTS
4087 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4088 int state_cnt;
4089 #endif
4091 ASSERT_NETISR0;
4094 * If 'kill_default' then caller has done the necessary
4095 * msgport syncing; unnecessary to do it again.
4097 if (!kill_default) {
4099 * Let ipfw_chk() know the rules are going to
4100 * be flushed, so it could jump directly to
4101 * the default rule.
4103 ipfw_flushing = 1;
4104 /* XXX use priority sync */
4105 netmsg_service_sync();
4109 * Press the 'flush' button
4111 bzero(&nmsg, sizeof(nmsg));
4112 netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4113 ipfw_flush_dispatch);
4114 nmsg.lmsg.u.ms_result = kill_default;
4115 netisr_domsg_global(&nmsg);
4116 ipfw_gd.ipfw_state_loosecnt = 0;
4117 ipfw_gd.ipfw_state_globexp = 0;
4118 ipfw_gd.ipfw_track_globexp = 0;
4120 #ifdef INVARIANTS
4121 state_cnt = ipfw_state_cntcoll();
4122 KASSERT(state_cnt == 0, ("%d states remain", state_cnt));
4124 KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
4125 ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));
4127 if (kill_default) {
4128 KASSERT(static_count == 0,
4129 ("%u static rules remain", static_count));
4130 KASSERT(static_ioc_len == 0,
4131 ("%u bytes of static rules remain", static_ioc_len));
4132 } else {
4133 KASSERT(static_count == 1,
4134 ("%u static rules remain", static_count));
4135 KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
4136 ("%u bytes of static rules remain, should be %lu",
4137 static_ioc_len,
4138 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
4140 #endif
4142 /* Flush is done */
4143 ipfw_flushing = 0;
4146 static void
4147 ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
4149 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4150 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4151 struct ip_fw *rule, *prev;
4153 ASSERT_NETISR_NCPUS(mycpuid);
4155 rule = dmsg->start_rule;
4156 KKASSERT(rule->cpuid == mycpuid);
4157 dmsg->start_rule = rule->sibling;
4159 prev = dmsg->prev_rule;
4160 if (prev != NULL) {
4161 KKASSERT(prev->cpuid == mycpuid);
4164 * Move to the position on the next CPU
4165 * before the msg is forwarded.
4167 dmsg->prev_rule = prev->sibling;
4171 * flush pointers outside the loop, then delete all matching
4172 * rules. 'prev' remains the same throughout the cycle.
4174 ipfw_flush_rule_ptrs(ctx);
4175 while (rule && rule->rulenum == dmsg->rulenum) {
4176 if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4177 /* Flush states generated by this rule. */
4178 ipfw_state_flush(ctx, rule);
4180 if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4181 /* Flush tracks generated by this rule. */
4182 ipfw_track_flush(ctx, rule);
4184 rule = ipfw_delete_rule(ctx, prev, rule);
4187 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4190 static int
4191 ipfw_alt_delete_rule(uint16_t rulenum)
4193 struct ip_fw *prev, *rule;
4194 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4195 struct netmsg_del dmsg;
4197 ASSERT_NETISR0;
4200 * Locate first rule to delete
4202 for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4203 rule && rule->rulenum < rulenum;
4204 prev = rule, rule = rule->next)
4205 ; /* EMPTY */
4206 if (rule->rulenum != rulenum)
4207 return EINVAL;
4210 * Get rid of the rule duplications on all CPUs
4212 bzero(&dmsg, sizeof(dmsg));
4213 netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4214 ipfw_alt_delete_rule_dispatch);
4215 dmsg.prev_rule = prev;
4216 dmsg.start_rule = rule;
4217 dmsg.rulenum = rulenum;
4219 netisr_domsg_global(&dmsg.base);
4220 KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4221 return 0;
4224 static void
4225 ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
4227 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4228 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4229 struct ip_fw *prev, *rule;
4230 #ifdef INVARIANTS
4231 int del = 0;
4232 #endif
4234 ASSERT_NETISR_NCPUS(mycpuid);
4236 ipfw_flush_rule_ptrs(ctx);
4238 prev = NULL;
4239 rule = ctx->ipfw_layer3_chain;
4240 while (rule != NULL) {
4241 if (rule->set == dmsg->from_set) {
4242 if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4243 /* Flush states generated by this rule. */
4244 ipfw_state_flush(ctx, rule);
4246 if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4247 /* Flush tracks generated by this rule. */
4248 ipfw_track_flush(ctx, rule);
4250 rule = ipfw_delete_rule(ctx, prev, rule);
4251 #ifdef INVARIANTS
4252 del = 1;
4253 #endif
4254 } else {
4255 prev = rule;
4256 rule = rule->next;
4259 KASSERT(del, ("no match set?!"));
4261 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4264 static int
4265 ipfw_alt_delete_ruleset(uint8_t set)
4267 struct netmsg_del dmsg;
4268 int del;
4269 struct ip_fw *rule;
4270 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4272 ASSERT_NETISR0;
4275 * Check whether the 'set' exists; if no rule belongs
4276 * to it, there is nothing to delete and we bail out
4277 * early.
4279 del = 0;
4280 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4281 if (rule->set == set)
4282 del = 1;
4284 if (!del)
4285 return 0; /* XXX EINVAL? */
4288 * Delete this set
4290 bzero(&dmsg, sizeof(dmsg));
4291 netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4292 ipfw_alt_delete_ruleset_dispatch);
4293 dmsg.from_set = set;
4294 netisr_domsg_global(&dmsg.base);
4296 return 0;
4299 static void
4300 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
4302 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4303 struct ip_fw *rule;
4305 ASSERT_NETISR_NCPUS(mycpuid);
4307 rule = dmsg->start_rule;
4308 KKASSERT(rule->cpuid == mycpuid);
4311 * Move to the position on the next CPU
4312 * before the msg is forwarded.
4314 dmsg->start_rule = rule->sibling;
4316 while (rule && rule->rulenum <= dmsg->rulenum) {
4317 if (rule->rulenum == dmsg->rulenum)
4318 rule->set = dmsg->to_set;
4319 rule = rule->next;
4321 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4324 static int
4325 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
4327 struct netmsg_del dmsg;
4328 struct netmsg_base *nmsg;
4329 struct ip_fw *rule;
4330 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4332 ASSERT_NETISR0;
4335 * Locate first rule to move
4337 for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
4338 rule = rule->next) {
4339 if (rule->rulenum == rulenum && rule->set != set)
4340 break;
4342 if (rule == NULL || rule->rulenum > rulenum)
4343 return 0; /* XXX error? */
4345 bzero(&dmsg, sizeof(dmsg));
4346 nmsg = &dmsg.base;
4347 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4348 ipfw_alt_move_rule_dispatch);
4349 dmsg.start_rule = rule;
4350 dmsg.rulenum = rulenum;
4351 dmsg.to_set = set;
4353 netisr_domsg_global(nmsg);
4354 KKASSERT(dmsg.start_rule == NULL);
4355 return 0;
4358 static void
4359 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
4361 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4362 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4363 struct ip_fw *rule;
4365 ASSERT_NETISR_NCPUS(mycpuid);
4367 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4368 if (rule->set == dmsg->from_set)
4369 rule->set = dmsg->to_set;
4371 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4374 static int
4375 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
4377 struct netmsg_del dmsg;
4378 struct netmsg_base *nmsg;
4380 ASSERT_NETISR0;
4382 bzero(&dmsg, sizeof(dmsg));
4383 nmsg = &dmsg.base;
4384 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4385 ipfw_alt_move_ruleset_dispatch);
4386 dmsg.from_set = from_set;
4387 dmsg.to_set = to_set;
4389 netisr_domsg_global(nmsg);
4390 return 0;
4393 static void
4394 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
4396 struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4397 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4398 struct ip_fw *rule;
4400 ASSERT_NETISR_NCPUS(mycpuid);
4402 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4403 if (rule->set == dmsg->from_set)
4404 rule->set = dmsg->to_set;
4405 else if (rule->set == dmsg->to_set)
4406 rule->set = dmsg->from_set;
4408 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4411 static int
4412 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
4414 struct netmsg_del dmsg;
4415 struct netmsg_base *nmsg;
4417 ASSERT_NETISR0;
4419 bzero(&dmsg, sizeof(dmsg));
4420 nmsg = &dmsg.base;
4421 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4422 ipfw_alt_swap_ruleset_dispatch);
4423 dmsg.from_set = set1;
4424 dmsg.to_set = set2;
4426 netisr_domsg_global(nmsg);
4427 return 0;
4431 * Remove all rules with given number, and also do set manipulation.
4433 * The argument is a uint32_t. The low 16 bits are the rule or set number,
4434 * the next 8 bits are the new set, and the top 8 bits are the command:
4436 * 0 delete rules with given number
4437 * 1 delete rules with given set number
4438 * 2 move rules with given number to new set
4439 * 3 move rules with given set number to new set
4440 * 4 swap sets with given numbers
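 *
 * As an illustration of the layout above (sketch only, not part of any
 * ABI header), a caller could compose the argument as
 *
 *	arg = ((uint32_t)cmd << 24) | ((uint32_t)new_set << 16) | rulenum;
 *
 * so "move rule 100 to set 3" (command 2) is (2 << 24) | (3 << 16) | 100.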
4442 static int
4443 ipfw_ctl_alter(uint32_t arg)
4445 uint16_t rulenum;
4446 uint8_t cmd, new_set;
4447 int error = 0;
4449 ASSERT_NETISR0;
4451 rulenum = arg & 0xffff;
4452 cmd = (arg >> 24) & 0xff;
4453 new_set = (arg >> 16) & 0xff;
4455 if (cmd > 4)
4456 return EINVAL;
4457 if (new_set >= IPFW_DEFAULT_SET)
4458 return EINVAL;
4459 if (cmd == 0 || cmd == 2) {
4460 if (rulenum == IPFW_DEFAULT_RULE)
4461 return EINVAL;
4462 } else {
4463 if (rulenum >= IPFW_DEFAULT_SET)
4464 return EINVAL;
4467 switch (cmd) {
4468 case 0: /* delete rules with given number */
4469 error = ipfw_alt_delete_rule(rulenum);
4470 break;
4472 case 1: /* delete all rules with given set number */
4473 error = ipfw_alt_delete_ruleset(rulenum);
4474 break;
4476 case 2: /* move rules with given number to new set */
4477 error = ipfw_alt_move_rule(rulenum, new_set);
4478 break;
4480 case 3: /* move rules with given set number to new set */
4481 error = ipfw_alt_move_ruleset(rulenum, new_set);
4482 break;
4484 case 4: /* swap two sets */
4485 error = ipfw_alt_swap_ruleset(rulenum, new_set);
4486 break;
4488 return error;
4492 * Clear counters for a specific rule.
4494 static void
4495 clear_counters(struct ip_fw *rule, int log_only)
4497 ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
4499 if (log_only == 0) {
4500 rule->bcnt = rule->pcnt = 0;
4501 rule->timestamp = 0;
4503 if (l->o.opcode == O_LOG)
4504 l->log_left = l->max_log;
4507 static void
4508 ipfw_zero_entry_dispatch(netmsg_t nmsg)
4510 struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
4511 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4512 struct ip_fw *rule;
4514 ASSERT_NETISR_NCPUS(mycpuid);
4516 if (zmsg->rulenum == 0) {
4517 KKASSERT(zmsg->start_rule == NULL);
4519 ctx->ipfw_norule_counter = 0;
4520 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4521 clear_counters(rule, zmsg->log_only);
4522 } else {
4523 struct ip_fw *start = zmsg->start_rule;
4525 KKASSERT(start->cpuid == mycpuid);
4526 KKASSERT(start->rulenum == zmsg->rulenum);
4529 * We can have multiple rules with the same number, so we
4530 * need to clear them all.
4532 for (rule = start; rule && rule->rulenum == zmsg->rulenum;
4533 rule = rule->next)
4534 clear_counters(rule, zmsg->log_only);
4537 * Move to the position on the next CPU
4538 * before the msg is forwarded.
4540 zmsg->start_rule = start->sibling;
4542 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4546 * Reset some or all counters on firewall rules.
4547 * @arg rulenum is 0 to clear all entries, or contains a specific
4548 * rule number.
4549 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
4551 static int
4552 ipfw_ctl_zero_entry(int rulenum, int log_only)
4554 struct netmsg_zent zmsg;
4555 struct netmsg_base *nmsg;
4556 const char *msg;
4557 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4559 ASSERT_NETISR0;
4561 bzero(&zmsg, sizeof(zmsg));
4562 nmsg = &zmsg.base;
4563 netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4564 ipfw_zero_entry_dispatch);
4565 zmsg.log_only = log_only;
4567 if (rulenum == 0) {
4568 msg = log_only ? "ipfw: All logging counts reset.\n"
4569 : "ipfw: Accounting cleared.\n";
4570 } else {
4571 struct ip_fw *rule;
4574 * Locate the first rule with 'rulenum'
4576 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4577 if (rule->rulenum == rulenum)
4578 break;
4580 if (rule == NULL) /* we did not find any matching rules */
4581 return (EINVAL);
4582 zmsg.start_rule = rule;
4583 zmsg.rulenum = rulenum;
4585 msg = log_only ? "ipfw: Entry %d logging count reset.\n"
4586 : "ipfw: Entry %d cleared.\n";
4588 netisr_domsg_global(nmsg);
4589 KKASSERT(zmsg.start_rule == NULL);
4591 if (fw_verbose)
4592 log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
4593 return (0);
4597 * Check validity of the structure before insert.
4598 * Fortunately rules are simple, so this mostly needs to check rule sizes.
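 *
 * A minimal example of what passes these checks (illustrative sketch,
 * mirroring how the per-CPU default rule is built in
 * ipfw_ctx_init_dispatch()) is a rule whose cmd[] holds one single-word
 * action and nothing else:
 *
 *	rule->act_ofs = 0;
 *	rule->cmd_len = 1;
 *	rule->cmd[0].len = 1;		(F_LEN() == 1 word)
 *	rule->cmd[0].opcode = O_ACCEPT;	(or O_DENY)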
4600 static int
4601 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
4603 int l, cmdlen = 0;
4604 int have_action = 0;
4605 ipfw_insn *cmd;
4607 *rule_flags = 0;
4609 /* Check for valid size */
4610 if (size < sizeof(*rule)) {
4611 kprintf("ipfw: rule too short\n");
4612 return EINVAL;
4614 l = IOC_RULESIZE(rule);
4615 if (l != size) {
4616 kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
4617 return EINVAL;
4620 /* Check rule number */
4621 if (rule->rulenum == IPFW_DEFAULT_RULE) {
4622 kprintf("ipfw: invalid rule number\n");
4623 return EINVAL;
4627 * Now go for the individual checks. Very simple ones, basically only
4628 * instruction sizes.
4630 for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
4631 l -= cmdlen, cmd += cmdlen) {
4632 cmdlen = F_LEN(cmd);
4633 if (cmdlen > l) {
4634 kprintf("ipfw: opcode %d size truncated\n",
4635 cmd->opcode);
4636 return EINVAL;
4639 DPRINTF("ipfw: opcode %d\n", cmd->opcode);
4641 if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT) {
4642 /* This rule will generate states. */
4643 *rule_flags |= IPFW_RULE_F_GENSTATE;
4644 if (cmd->opcode == O_LIMIT)
4645 *rule_flags |= IPFW_RULE_F_GENTRACK;
4647 if (cmd->opcode == O_DEFRAG)
4648 *rule_flags |= IPFW_RULE_F_CROSSREF;
4649 if (cmd->opcode == O_IP_SRC_IFIP ||
4650 cmd->opcode == O_IP_DST_IFIP) {
4651 *rule_flags |= IPFW_RULE_F_DYNIFADDR;
4652 cmd->arg1 &= IPFW_IFIP_SETTINGS;
4655 switch (cmd->opcode) {
4656 case O_NOP:
4657 case O_PROBE_STATE:
4658 case O_KEEP_STATE:
4659 case O_PROTO:
4660 case O_IP_SRC_ME:
4661 case O_IP_DST_ME:
4662 case O_LAYER2:
4663 case O_IN:
4664 case O_FRAG:
4665 case O_IPFRAG:
4666 case O_IPOPT:
4667 case O_IPLEN:
4668 case O_IPID:
4669 case O_IPTOS:
4670 case O_IPPRECEDENCE:
4671 case O_IPTTL:
4672 case O_IPVER:
4673 case O_TCPWIN:
4674 case O_TCPFLAGS:
4675 case O_TCPOPTS:
4676 case O_ESTAB:
4677 if (cmdlen != F_INSN_SIZE(ipfw_insn))
4678 goto bad_size;
4679 break;
4681 case O_IP_SRC_TABLE:
4682 case O_IP_DST_TABLE:
4683 if (cmdlen != F_INSN_SIZE(ipfw_insn))
4684 goto bad_size;
4685 if (cmd->arg1 >= ipfw_table_max) {
4686 kprintf("ipfw: invalid table id %u, max %d\n",
4687 cmd->arg1, ipfw_table_max);
4688 return EINVAL;
4690 break;
4692 case O_IP_SRC_IFIP:
4693 case O_IP_DST_IFIP:
4694 if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
4695 goto bad_size;
4696 break;
4698 case O_UID:
4699 case O_GID:
4700 case O_IP_SRC:
4701 case O_IP_DST:
4702 case O_TCPSEQ:
4703 case O_TCPACK:
4704 case O_PROB:
4705 case O_ICMPTYPE:
4706 if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
4707 goto bad_size;
4708 break;
4710 case O_LIMIT:
4711 if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
4712 goto bad_size;
4713 break;
4715 case O_LOG:
4716 if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
4717 goto bad_size;
4719 ((ipfw_insn_log *)cmd)->log_left =
4720 ((ipfw_insn_log *)cmd)->max_log;
4722 break;
4724 case O_IP_SRC_MASK:
4725 case O_IP_DST_MASK:
4726 if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
4727 goto bad_size;
4728 if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
4729 kprintf("ipfw: opcode %d, useless rule\n",
4730 cmd->opcode);
4731 return EINVAL;
4733 break;
4735 case O_IP_SRC_SET:
4736 case O_IP_DST_SET:
4737 if (cmd->arg1 == 0 || cmd->arg1 > 256) {
4738 kprintf("ipfw: invalid set size %d\n",
4739 cmd->arg1);
4740 return EINVAL;
4742 if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
4743 (cmd->arg1+31)/32 )
4744 goto bad_size;
4745 break;
4747 case O_MACADDR2:
4748 if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
4749 goto bad_size;
4750 break;
4752 case O_MAC_TYPE:
4753 case O_IP_SRCPORT:
4754 case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
4755 if (cmdlen < 2 || cmdlen > 31)
4756 goto bad_size;
4757 break;
4759 case O_RECV:
4760 case O_XMIT:
4761 case O_VIA:
4762 if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
4763 goto bad_size;
4764 break;
4766 case O_PIPE:
4767 case O_QUEUE:
4768 if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
4769 goto bad_size;
4770 goto check_action;
4772 case O_FORWARD_IP:
4773 if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
4774 goto bad_size;
4775 } else {
4776 in_addr_t fwd_addr;
4778 fwd_addr = ((ipfw_insn_sa *)cmd)->
4779 sa.sin_addr.s_addr;
4780 if (IN_MULTICAST(ntohl(fwd_addr))) {
4781 kprintf("ipfw: try forwarding to "
4782 "multicast address\n");
4783 return EINVAL;
4786 goto check_action;
4788 case O_FORWARD_MAC: /* XXX not implemented yet */
4789 case O_CHECK_STATE:
4790 case O_COUNT:
4791 case O_ACCEPT:
4792 case O_DENY:
4793 case O_REJECT:
4794 case O_SKIPTO:
4795 case O_DIVERT:
4796 case O_TEE:
4797 case O_DEFRAG:
4798 if (cmdlen != F_INSN_SIZE(ipfw_insn))
4799 goto bad_size;
4800 check_action:
4801 if (have_action) {
4802 kprintf("ipfw: opcode %d, multiple actions"
4803 " not allowed\n",
4804 cmd->opcode);
4805 return EINVAL;
4807 have_action = 1;
4808 if (l != cmdlen) {
4809 kprintf("ipfw: opcode %d, action must be"
4810 " last opcode\n",
4811 cmd->opcode);
4812 return EINVAL;
4814 break;
4815 default:
4816 kprintf("ipfw: opcode %d, unknown opcode\n",
4817 cmd->opcode);
4818 return EINVAL;
4821 if (have_action == 0) {
4822 kprintf("ipfw: missing action\n");
4823 return EINVAL;
4825 return 0;
4827 bad_size:
4828 kprintf("ipfw: opcode %d size %d wrong\n",
4829 cmd->opcode, cmdlen);
4830 return EINVAL;
4833 static int
4834 ipfw_ctl_add_rule(struct sockopt *sopt)
4836 struct ipfw_ioc_rule *ioc_rule;
4837 size_t size;
4838 uint32_t rule_flags;
4839 int error;
4841 ASSERT_NETISR0;
4843 size = sopt->sopt_valsize;
4844 if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
4845 size < sizeof(*ioc_rule)) {
4846 return EINVAL;
4848 if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
4849 sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
4850 IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
4852 ioc_rule = sopt->sopt_val;
4854 error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
4855 if (error)
4856 return error;
4858 ipfw_add_rule(ioc_rule, rule_flags);
4860 if (sopt->sopt_dir == SOPT_GET)
4861 sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
4862 return 0;
4865 static void *
4866 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
4867 struct ipfw_ioc_rule *ioc_rule)
4869 const struct ip_fw *sibling;
4870 #ifdef INVARIANTS
4871 int i;
4872 #endif
4874 ASSERT_NETISR0;
4875 KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
4877 ioc_rule->act_ofs = rule->act_ofs;
4878 ioc_rule->cmd_len = rule->cmd_len;
4879 ioc_rule->rulenum = rule->rulenum;
4880 ioc_rule->set = rule->set;
4881 ioc_rule->usr_flags = rule->usr_flags;
4883 ioc_rule->set_disable = ctx->ipfw_set_disable;
4884 ioc_rule->static_count = static_count;
4885 ioc_rule->static_len = static_ioc_len;
4888 * Visit (read-only) all of the rule's duplications to get
4889 * the necessary statistics
4891 #ifdef INVARIANTS
4892 i = 0;
4893 #endif
4894 ioc_rule->pcnt = 0;
4895 ioc_rule->bcnt = 0;
4896 ioc_rule->timestamp = 0;
4897 for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
4898 ioc_rule->pcnt += sibling->pcnt;
4899 ioc_rule->bcnt += sibling->bcnt;
4900 if (sibling->timestamp > ioc_rule->timestamp)
4901 ioc_rule->timestamp = sibling->timestamp;
4902 #ifdef INVARIANTS
4903 ++i;
4904 #endif
4906 KASSERT(i == netisr_ncpus,
4907 ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
4909 bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);
4911 return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
4914 static boolean_t
4915 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
4917 struct ipfw_ioc_flowid *ioc_id;
4919 if (trk->tc_expire == 0) {
4920 /* Not a scanned one. */
4921 return (FALSE);
4924 ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
4925 0 : trk->tc_expire - time_uptime;
4926 ioc_state->pcnt = 0;
4927 ioc_state->bcnt = 0;
4929 ioc_state->dyn_type = O_LIMIT_PARENT;
4930 ioc_state->count = trk->tc_count;
4932 ioc_state->rulenum = trk->tc_rulenum;
4934 ioc_id = &ioc_state->id;
4935 ioc_id->type = ETHERTYPE_IP;
4936 ioc_id->u.ip.proto = trk->tc_proto;
4937 ioc_id->u.ip.src_ip = trk->tc_saddr;
4938 ioc_id->u.ip.dst_ip = trk->tc_daddr;
4939 ioc_id->u.ip.src_port = trk->tc_sport;
4940 ioc_id->u.ip.dst_port = trk->tc_dport;
4942 return (TRUE);
4945 static boolean_t
4946 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
4948 struct ipfw_ioc_flowid *ioc_id;
4950 if (s->st_type == O_ANCHOR)
4951 return (FALSE);
4953 ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
4954 0 : s->st_expire - time_uptime;
4955 ioc_state->pcnt = s->st_pcnt;
4956 ioc_state->bcnt = s->st_bcnt;
4958 ioc_state->dyn_type = s->st_type;
4959 ioc_state->count = 0;
4961 ioc_state->rulenum = s->st_rule->rulenum;
4963 ioc_id = &ioc_state->id;
4964 ioc_id->type = ETHERTYPE_IP;
4965 ioc_id->u.ip.proto = s->st_proto;
4966 ipfw_key_4tuple(&s->st_key,
4967 &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
4968 &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
4970 return (TRUE);
4973 static void
4974 ipfw_state_copy_dispatch(netmsg_t nmsg)
4976 struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
4977 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4978 const struct ipfw_state *s;
4979 const struct ipfw_track *t;
4981 ASSERT_NETISR_NCPUS(mycpuid);
4982 KASSERT(nm->state_cnt < nm->state_cntmax,
4983 ("invalid state count %d, max %d",
4984 nm->state_cnt, nm->state_cntmax));
4986 TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
4987 if (ipfw_state_copy(s, nm->ioc_state)) {
4988 nm->ioc_state++;
4989 nm->state_cnt++;
4990 if (nm->state_cnt == nm->state_cntmax)
4991 goto done;
4996 * Prepare tracks in the global track tree for userland.
4998 TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
4999 struct ipfw_trkcnt *trk;
5001 if (t->t_count == NULL) /* anchor */
5002 continue;
5003 trk = t->t_trkcnt;
5006 * Only one netisr can run this function at
5007 * any time, and only this function accesses
5008 * trkcnt's tc_expire, so this is safe w/o
5009 * ipfw_gd.ipfw_trkcnt_token.
5011 if (trk->tc_expire > t->t_expire)
5012 continue;
5013 trk->tc_expire = t->t_expire;
5017 * Copy tracks in the global track tree to userland in
5018 * the last netisr.
5020 if (mycpuid == netisr_ncpus - 1) {
5021 struct ipfw_trkcnt *trk;
5023 KASSERT(nm->state_cnt < nm->state_cntmax,
5024 ("invalid state count %d, max %d",
5025 nm->state_cnt, nm->state_cntmax));
5027 IPFW_TRKCNT_TOKGET;
5028 RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
5029 if (ipfw_track_copy(trk, nm->ioc_state)) {
5030 nm->ioc_state++;
5031 nm->state_cnt++;
5032 if (nm->state_cnt == nm->state_cntmax) {
5033 IPFW_TRKCNT_TOKREL;
5034 goto done;
5038 IPFW_TRKCNT_TOKREL;
5040 done:
5041 if (nm->state_cnt == nm->state_cntmax) {
5042 /* No more space; done. */
5043 netisr_replymsg(&nm->base, 0);
5044 } else {
5045 netisr_forwardmsg(&nm->base, mycpuid + 1);
5049 static int
5050 ipfw_ctl_get_rules(struct sockopt *sopt)
5052 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5053 struct ip_fw *rule;
5054 void *bp;
5055 size_t size;
5056 int state_cnt;
5058 ASSERT_NETISR0;
5061 * pass up a copy of the current rules. Static rules
5062 * come first (the last of which has number IPFW_DEFAULT_RULE),
5063 * followed by a possibly empty list of states.
5066 size = static_ioc_len; /* size of static rules */
5069 * Size of the states.
5070 * XXX take tracks as state for userland compat.
5072 state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
5073 state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
5074 size += state_cnt * sizeof(struct ipfw_ioc_state);
5076 if (sopt->sopt_valsize < size) {
5077 /* short length, no need to return incomplete rules */
5078 /* XXX: if superuser, no need to zero buffer */
5079 bzero(sopt->sopt_val, sopt->sopt_valsize);
5080 return 0;
5082 bp = sopt->sopt_val;
5084 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5085 bp = ipfw_copy_rule(ctx, rule, bp);
5087 if (state_cnt) {
5088 struct netmsg_cpstate nm;
5089 #ifdef INVARIANTS
5090 size_t old_size = size;
5091 #endif
5093 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5094 MSGF_PRIORITY, ipfw_state_copy_dispatch);
5095 nm.ioc_state = bp;
5096 nm.state_cntmax = state_cnt;
5097 nm.state_cnt = 0;
5098 netisr_domsg_global(&nm.base);
5101 * The # of states may have shrunk after the snapshot
5102 * of the state count was taken. To give the user a correct
5103 * state count, nm.state_cnt is used to recalculate
5104 * the actual size.
5106 size = static_ioc_len +
5107 (nm.state_cnt * sizeof(struct ipfw_ioc_state));
5108 KKASSERT(size <= old_size);
5111 sopt->sopt_valsize = size;
5112 return 0;
5115 static void
5116 ipfw_set_disable_dispatch(netmsg_t nmsg)
5118 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5120 ASSERT_NETISR_NCPUS(mycpuid);
5122 ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5123 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5126 static void
5127 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5129 struct netmsg_base nmsg;
5130 uint32_t set_disable;
5132 ASSERT_NETISR0;
5134 /* IPFW_DEFAULT_SET is always enabled */
5135 enable |= (1 << IPFW_DEFAULT_SET);
5136 set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5138 bzero(&nmsg, sizeof(nmsg));
5139 netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5140 ipfw_set_disable_dispatch);
5141 nmsg.lmsg.u.ms_result32 = set_disable;
5143 netisr_domsg_global(&nmsg);
5146 static void
5147 ipfw_table_create_dispatch(netmsg_t nm)
5149 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5150 int tblid = nm->lmsg.u.ms_result;
5152 ASSERT_NETISR_NCPUS(mycpuid);
5154 if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5155 rn_cpumaskhead(mycpuid), 32))
5156 panic("ipfw: create table%d failed", tblid);
5158 netisr_forwardmsg(&nm->base, mycpuid + 1);
5161 static int
5162 ipfw_table_create(struct sockopt *sopt)
5164 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5165 struct ipfw_ioc_table *tbl;
5166 struct netmsg_base nm;
5168 ASSERT_NETISR0;
5170 if (sopt->sopt_valsize != sizeof(*tbl))
5171 return (EINVAL);
5173 tbl = sopt->sopt_val;
5174 if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5175 return (EINVAL);
5177 if (ctx->ipfw_tables[tbl->tableid] != NULL)
5178 return (EEXIST);
5180 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5181 ipfw_table_create_dispatch);
5182 nm.lmsg.u.ms_result = tbl->tableid;
5183 netisr_domsg_global(&nm);
5185 return (0);
5188 static void
5189 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5191 struct radix_node *ret;
5193 ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5194 if (ret != rn)
5195 panic("deleted other table entry");
5196 kfree(ret, M_IPFW);
5199 static int
5200 ipfw_table_killent(struct radix_node *rn, void *xrnh)
5203 ipfw_table_killrn(xrnh, rn);
5204 return (0);
5207 static void
5208 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5209 int destroy)
5211 struct radix_node_head *rnh;
5213 ASSERT_NETISR_NCPUS(mycpuid);
5215 rnh = ctx->ipfw_tables[tableid];
5216 rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5217 if (destroy) {
5218 Free(rnh);
5219 ctx->ipfw_tables[tableid] = NULL;
5223 static void
5224 ipfw_table_flush_dispatch(netmsg_t nmsg)
5226 struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
5227 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5229 ASSERT_NETISR_NCPUS(mycpuid);
5231 ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
5232 netisr_forwardmsg(&nm->base, mycpuid + 1);
5235 static void
5236 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
5238 int i;
5240 ASSERT_NETISR_NCPUS(mycpuid);
5242 for (i = 0; i < ipfw_table_max; ++i) {
5243 if (ctx->ipfw_tables[i] != NULL)
5244 ipfw_table_flush_oncpu(ctx, i, destroy);
5248 static void
5249 ipfw_table_flushall_dispatch(netmsg_t nmsg)
5251 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5253 ASSERT_NETISR_NCPUS(mycpuid);
5255 ipfw_table_flushall_oncpu(ctx, 0);
5256 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5259 static int
5260 ipfw_table_flush(struct sockopt *sopt)
5262 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5263 struct ipfw_ioc_table *tbl;
5264 struct netmsg_tblflush nm;
5266 ASSERT_NETISR0;
5268 if (sopt->sopt_valsize != sizeof(*tbl))
5269 return (EINVAL);
5271 tbl = sopt->sopt_val;
5272 if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
5273 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5274 MSGF_PRIORITY, ipfw_table_flushall_dispatch);
5275 netisr_domsg_global(&nm.base);
5276 return (0);
5279 if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5280 return (EINVAL);
5282 if (ctx->ipfw_tables[tbl->tableid] == NULL)
5283 return (ENOENT);
5285 netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5286 ipfw_table_flush_dispatch);
5287 nm.tableid = tbl->tableid;
5288 nm.destroy = 0;
5289 if (sopt->sopt_name == IP_FW_TBL_DESTROY)
5290 nm.destroy = 1;
5291 netisr_domsg_global(&nm.base);
5293 return (0);
5296 static int
5297 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
5299 int *cnt = xcnt;
5301 (*cnt)++;
5302 return (0);
5305 static int
5306 ipfw_table_cpent(struct radix_node *rn, void *xcp)
5308 struct ipfw_table_cp *cp = xcp;
5309 struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5310 struct ipfw_ioc_tblent *ioc_te;
5311 #ifdef INVARIANTS
5312 int cnt;
5313 #endif
5315 KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
5316 cp->te_idx, cp->te_cnt));
5317 ioc_te = &cp->te[cp->te_idx];
5319 if (te->te_nodes->rn_mask != NULL) {
5320 memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
5321 *te->te_nodes->rn_mask);
5322 } else {
5323 ioc_te->netmask.sin_len = 0;
5325 memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));
5327 ioc_te->use = te->te_use;
5328 ioc_te->last_used = te->te_lastuse;
5329 #ifdef INVARIANTS
5330 cnt = 1;
5331 #endif
5333 while ((te = te->te_sibling) != NULL) {
5334 #ifdef INVARIANTS
5335 ++cnt;
5336 #endif
5337 ioc_te->use += te->te_use;
5338 if (te->te_lastuse > ioc_te->last_used)
5339 ioc_te->last_used = te->te_lastuse;
5341 KASSERT(cnt == netisr_ncpus,
5342 ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));
5344 cp->te_idx++;
5346 return (0);
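/*
 * IP_FW_TBL_GET handler.  A negative tableid requests the list of
 * existing table ids (struct ipfw_ioc_tbllist); otherwise the reply is
 * a struct ipfw_ioc_tblcont holding every entry of the given table,
 * with the per-CPU duplicates already merged by ipfw_table_cpent().
 */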
5349 static int
5350 ipfw_table_get(struct sockopt *sopt)
5352 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5353 struct radix_node_head *rnh;
5354 struct ipfw_ioc_table *tbl;
5355 struct ipfw_ioc_tblcont *cont;
5356 struct ipfw_table_cp cp;
5357 int cnt = 0, sz;
5359 ASSERT_NETISR0;
5361 if (sopt->sopt_valsize < sizeof(*tbl))
5362 return (EINVAL);
5364 tbl = sopt->sopt_val;
5365 if (tbl->tableid < 0) {
5366 struct ipfw_ioc_tbllist *list;
5367 int i;
5370 * List available table ids.
5372 for (i = 0; i < ipfw_table_max; ++i) {
5373 if (ctx->ipfw_tables[i] != NULL)
5374 ++cnt;
5377 sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
5378 if (sopt->sopt_valsize < sz) {
5379 bzero(sopt->sopt_val, sopt->sopt_valsize);
5380 return (E2BIG);
5382 list = sopt->sopt_val;
5383 list->tablecnt = cnt;
5385 cnt = 0;
5386 for (i = 0; i < ipfw_table_max; ++i) {
5387 if (ctx->ipfw_tables[i] != NULL) {
5388 KASSERT(cnt < list->tablecnt,
5389 ("invalid idx %d, cnt %d",
5390 cnt, list->tablecnt));
5391 list->tables[cnt++] = i;
5394 sopt->sopt_valsize = sz;
5395 return (0);
5396 } else if (tbl->tableid >= ipfw_table_max) {
5397 return (EINVAL);
5400 rnh = ctx->ipfw_tables[tbl->tableid];
5401 if (rnh == NULL)
5402 return (ENOENT);
5403 rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);
5405 sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
5406 if (sopt->sopt_valsize < sz) {
5407 bzero(sopt->sopt_val, sopt->sopt_valsize);
5408 return (E2BIG);
5410 cont = sopt->sopt_val;
5411 cont->entcnt = cnt;
5413 cp.te = cont->ent;
5414 cp.te_idx = 0;
5415 cp.te_cnt = cnt;
5416 rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);
5418 sopt->sopt_valsize = sz;
5419 return (0);
5422 static void
5423 ipfw_table_add_dispatch(netmsg_t nmsg)
5425 struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
5426 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5427 struct radix_node_head *rnh;
5428 struct ipfw_tblent *te;
5430 ASSERT_NETISR_NCPUS(mycpuid);
5432 rnh = ctx->ipfw_tables[nm->tableid];
5434 te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
5435 te->te_nodes->rn_key = (char *)&te->te_key;
5436 memcpy(&te->te_key, nm->key, sizeof(te->te_key));
5438 if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
5439 te->te_nodes) == NULL) {
5440 if (mycpuid == 0) {
5441 kfree(te, M_IPFW);
5442 netisr_replymsg(&nm->base, EEXIST);
5443 return;
5445 panic("rnh_addaddr failed");
5448 /* Link siblings. */
5449 if (nm->sibling != NULL)
5450 nm->sibling->te_sibling = te;
5451 nm->sibling = te;
5453 netisr_forwardmsg(&nm->base, mycpuid + 1);
5456 static void
5457 ipfw_table_del_dispatch(netmsg_t nmsg)
5459 struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
5460 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5461 struct radix_node_head *rnh;
5462 struct radix_node *rn;
5464 ASSERT_NETISR_NCPUS(mycpuid);
5466 rnh = ctx->ipfw_tables[nm->tableid];
5467 rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
5468 if (rn == NULL) {
5469 if (mycpuid == 0) {
5470 netisr_replymsg(&nm->base, ESRCH);
5471 return;
5473 panic("rnh_deladdr failed");
5475 kfree(rn, M_IPFW);
5477 netisr_forwardmsg(&nm->base, mycpuid + 1);
5480 static int
5481 ipfw_table_alt(struct sockopt *sopt)
5483 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5484 struct ipfw_ioc_tblcont *tbl;
5485 struct ipfw_ioc_tblent *te;
5486 struct sockaddr_in key0;
5487 struct sockaddr *netmask = NULL, *key;
5488 struct netmsg_tblent nm;
5490 ASSERT_NETISR0;
5492 if (sopt->sopt_valsize != sizeof(*tbl))
5493 return (EINVAL);
5494 tbl = sopt->sopt_val;
5496 if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5497 return (EINVAL);
5498 if (tbl->entcnt != 1)
5499 return (EINVAL);
5501 if (ctx->ipfw_tables[tbl->tableid] == NULL)
5502 return (ENOENT);
5503 te = &tbl->ent[0];
5505 if (te->key.sin_family != AF_INET ||
5506 te->key.sin_port != 0 ||
5507 te->key.sin_len != sizeof(struct sockaddr_in))
5508 return (EINVAL);
5509 key = (struct sockaddr *)&te->key;
5511 if (te->netmask.sin_len != 0) {
5512 if (te->netmask.sin_port != 0 ||
5513 te->netmask.sin_len > sizeof(struct sockaddr_in))
5514 return (EINVAL);
5515 netmask = (struct sockaddr *)&te->netmask;
5516 sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
5517 key = (struct sockaddr *)&key0;
5520 if (sopt->sopt_name == IP_FW_TBL_ADD) {
5521 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5522 MSGF_PRIORITY, ipfw_table_add_dispatch);
5523 } else {
5524 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5525 MSGF_PRIORITY, ipfw_table_del_dispatch);
5527 nm.key = key;
5528 nm.netmask = netmask;
5529 nm.tableid = tbl->tableid;
5530 nm.sibling = NULL;
5531 return (netisr_domsg_global(&nm.base));
5534 static int
5535 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
5537 struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5539 te->te_use = 0;
5540 te->te_lastuse = 0;
5541 return (0);
5544 static void
5545 ipfw_table_zero_dispatch(netmsg_t nmsg)
5547 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5548 struct radix_node_head *rnh;
5550 ASSERT_NETISR_NCPUS(mycpuid);
5552 rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
5553 rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
5555 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5558 static void
5559 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
5561 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5562 int i;
5564 ASSERT_NETISR_NCPUS(mycpuid);
5566 for (i = 0; i < ipfw_table_max; ++i) {
5567 struct radix_node_head *rnh = ctx->ipfw_tables[i];
5569 if (rnh != NULL)
5570 rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
5572 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5575 static int
5576 ipfw_table_zero(struct sockopt *sopt)
5578 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5579 struct netmsg_base nm;
5580 struct ipfw_ioc_table *tbl;
5582 ASSERT_NETISR0;
5584 if (sopt->sopt_valsize != sizeof(*tbl))
5585 return (EINVAL);
5586 tbl = sopt->sopt_val;
5588 if (tbl->tableid < 0) {
5589 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5590 ipfw_table_zeroall_dispatch);
5591 netisr_domsg_global(&nm);
5592 return (0);
5593 } else if (tbl->tableid >= ipfw_table_max) {
5594 return (EINVAL);
5595 } else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
5596 return (ENOENT);
5599 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5600 ipfw_table_zero_dispatch);
5601 nm.lmsg.u.ms_result = tbl->tableid;
5602 netisr_domsg_global(&nm);
5604 return (0);
5607 static int
5608 ipfw_table_killexp(struct radix_node *rn, void *xnm)
5610 struct netmsg_tblexp *nm = xnm;
5611 struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5613 if (te->te_expired) {
5614 ipfw_table_killrn(nm->rnh, rn);
5615 nm->expcnt++;
5617 return (0);
5620 static void
5621 ipfw_table_expire_dispatch(netmsg_t nmsg)
5623 struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
5624 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5625 struct radix_node_head *rnh;
5627 ASSERT_NETISR_NCPUS(mycpuid);
5629 rnh = ctx->ipfw_tables[nm->tableid];
5630 nm->rnh = rnh;
5631 rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
5633 KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
5634 ("not all expired addresses (%d) were deleted (%d)",
5635 nm->cnt * (mycpuid + 1), nm->expcnt));
5637 netisr_forwardmsg(&nm->base, mycpuid + 1);
5640 static void
5641 ipfw_table_expireall_dispatch(netmsg_t nmsg)
5643 struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
5644 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5645 int i;
5647 ASSERT_NETISR_NCPUS(mycpuid);
5649 for (i = 0; i < ipfw_table_max; ++i) {
5650 struct radix_node_head *rnh = ctx->ipfw_tables[i];
5652 if (rnh == NULL)
5653 continue;
5654 nm->rnh = rnh;
5655 rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
5658 KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
5659 ("not all expired addresses (%d) were deleted (%d)",
5660 nm->cnt * (mycpuid + 1), nm->expcnt));
5662 netisr_forwardmsg(&nm->base, mycpuid + 1);
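/*
 * rnh_walktree() callback for IP_FW_TBL_EXPIRE: an entry is considered
 * expired only if none of its per-CPU siblings has been used within
 * the last nm->expire seconds.  In that case all siblings are flagged
 * te_expired and the entry is counted once, so the subsequent kill
 * pass can assert that every CPU removed the same set of entries.
 */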
5665 static int
5666 ipfw_table_markexp(struct radix_node *rn, void *xnm)
5668 struct netmsg_tblexp *nm = xnm;
5669 struct ipfw_tblent *te;
5670 time_t lastuse;
5672 te = (struct ipfw_tblent *)rn;
5673 lastuse = te->te_lastuse;
5675 while ((te = te->te_sibling) != NULL) {
5676 if (te->te_lastuse > lastuse)
5677 lastuse = te->te_lastuse;
5679 if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
5680 /* Not expired */
5681 return (0);
5684 te = (struct ipfw_tblent *)rn;
5685 te->te_expired = 1;
5686 while ((te = te->te_sibling) != NULL)
5687 te->te_expired = 1;
5688 nm->cnt++;
5690 return (0);
5693 static int
5694 ipfw_table_expire(struct sockopt *sopt)
5696 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5697 struct netmsg_tblexp nm;
5698 struct ipfw_ioc_tblexp *tbl;
5699 struct radix_node_head *rnh;
5701 ASSERT_NETISR0;
5703 if (sopt->sopt_valsize != sizeof(*tbl))
5704 return (EINVAL);
5705 tbl = sopt->sopt_val;
5706 tbl->expcnt = 0;
5708 nm.expcnt = 0;
5709 nm.cnt = 0;
5710 nm.expire = tbl->expire;
5712 if (tbl->tableid < 0) {
5713 int i;
5715 for (i = 0; i < ipfw_table_max; ++i) {
5716 rnh = ctx->ipfw_tables[i];
5717 if (rnh == NULL)
5718 continue;
5719 rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
5721 if (nm.cnt == 0) {
5722 /* No addresses can be expired. */
5723 return (0);
5725 tbl->expcnt = nm.cnt;
5727 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5728 MSGF_PRIORITY, ipfw_table_expireall_dispatch);
5729 nm.tableid = -1;
5730 netisr_domsg_global(&nm.base);
5731 KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
5732 ("not all expired addresses (%d) were deleted (%d)",
5733 nm.cnt * netisr_ncpus, nm.expcnt));
5735 return (0);
5736 } else if (tbl->tableid >= ipfw_table_max) {
5737 return (EINVAL);
5740 rnh = ctx->ipfw_tables[tbl->tableid];
5741 if (rnh == NULL)
5742 return (ENOENT);
5743 rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
5744 if (nm.cnt == 0) {
5745 /* No addresses can be expired. */
5746 return (0);
5748 tbl->expcnt = nm.cnt;
5750 netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5751 ipfw_table_expire_dispatch);
5752 nm.tableid = tbl->tableid;
5753 netisr_domsg_global(&nm.base);
5754 KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
5755 ("not all expired addresses (%d) were deleted (%d)",
5756 nm.cnt * netisr_ncpus, nm.expcnt));
5757 return (0);
5760 static void
5761 ipfw_crossref_free_dispatch(netmsg_t nmsg)
5763 struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
5765 KKASSERT((rule->rule_flags &
5766 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
5767 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
5768 ipfw_free_rule(rule);
5770 netisr_replymsg(&nmsg->base, 0);
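/*
 * Garbage collect rules parked on the crossref free list.  A rule (and
 * its per-CPU cross_rules[] copies) may only be freed once no mbuf
 * forwarded by the IP_FW_CONTINUE path still holds a cross_refs
 * reference on any CPU; rules still in flight stay on the list and are
 * retried from the crossref callout.
 */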
5773 static void
5774 ipfw_crossref_reap(void)
5776 struct ip_fw *rule, *prev = NULL;
5778 ASSERT_NETISR0;
5780 rule = ipfw_gd.ipfw_crossref_free;
5781 while (rule != NULL) {
5782 uint64_t inflight = 0;
5783 int i;
5785 for (i = 0; i < netisr_ncpus; ++i)
5786 inflight += rule->cross_rules[i]->cross_refs;
5787 if (inflight == 0) {
5788 struct ip_fw *f = rule;
5791 * Unlink.
5793 rule = rule->next;
5794 if (prev != NULL)
5795 prev->next = rule;
5796 else
5797 ipfw_gd.ipfw_crossref_free = rule;
5800 * Free.
5802 for (i = 1; i < netisr_ncpus; ++i) {
5803 struct netmsg_base nm;
5805 netmsg_init(&nm, NULL, &curthread->td_msgport,
5806 MSGF_PRIORITY, ipfw_crossref_free_dispatch);
5807 nm.lmsg.u.ms_resultp = f->cross_rules[i];
5808 netisr_domsg(&nm, i);
5810 KKASSERT((f->rule_flags &
5811 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
5812 (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
5813 ipfw_unref_rule(f);
5814 } else {
5815 prev = rule;
5816 rule = rule->next;
5820 if (ipfw_gd.ipfw_crossref_free != NULL) {
5821 callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
5822 ipfw_crossref_timeo, NULL);
5827 * {set|get}sockopt parser.
5829 static int
5830 ipfw_ctl(struct sockopt *sopt)
5832 int error, rulenum;
5833 uint32_t *masks;
5834 size_t size;
5836 ASSERT_NETISR0;
5838 error = 0;
5840 switch (sopt->sopt_name) {
5841 case IP_FW_GET:
5842 error = ipfw_ctl_get_rules(sopt);
5843 break;
5845 case IP_FW_FLUSH:
5846 ipfw_flush(0 /* keep default rule */);
5847 break;
5849 case IP_FW_ADD:
5850 error = ipfw_ctl_add_rule(sopt);
5851 break;
5853 case IP_FW_DEL:
5855 * IP_FW_DEL is used for deleting single rules or sets,
5856 * and (ab)used to atomically manipulate sets.
5857 * Argument size is used to distinguish between the two:
5858 * sizeof(uint32_t)
5859 * delete single rule or set of rules,
5860 * or reassign rules (or sets) to a different set.
5861 * 2 * sizeof(uint32_t)
5862 * atomic disable/enable sets.
5863 * first uint32_t contains sets to be disabled,
5864 * second uint32_t contains sets to be enabled.
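 *
 * For example (userland sketch, assuming the usual raw IP socket
 * used by ipfw(8)):
 *
 *	uint32_t v = 100;
 *	setsockopt(s, IPPROTO_IP, IP_FW_DEL, &v, sizeof(v));
 *
 * deletes rule 100, while
 *
 *	uint32_t v2[2] = { 1 << 5, 1 << 3 };
 *	setsockopt(s, IPPROTO_IP, IP_FW_DEL, v2, sizeof(v2));
 *
 * atomically disables set 5 and enables set 3.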
5866 masks = sopt->sopt_val;
5867 size = sopt->sopt_valsize;
5868 if (size == sizeof(*masks)) {
5870 * Delete or reassign static rule
5872 error = ipfw_ctl_alter(masks[0]);
5873 } else if (size == (2 * sizeof(*masks))) {
5875 * Set enable/disable
5877 ipfw_ctl_set_disable(masks[0], masks[1]);
5878 } else {
5879 error = EINVAL;
5881 break;
5883 case IP_FW_ZERO:
5884 case IP_FW_RESETLOG: /* argument is an int, the rule number */
5885 rulenum = 0;
5887 if (sopt->sopt_val != 0) {
5888 error = soopt_to_kbuf(sopt, &rulenum,
5889 sizeof(int), sizeof(int));
5890 if (error)
5891 break;
5893 error = ipfw_ctl_zero_entry(rulenum,
5894 sopt->sopt_name == IP_FW_RESETLOG);
5895 break;
5897 case IP_FW_TBL_CREATE:
5898 error = ipfw_table_create(sopt);
5899 break;
5901 case IP_FW_TBL_ADD:
5902 case IP_FW_TBL_DEL:
5903 error = ipfw_table_alt(sopt);
5904 break;
5906 case IP_FW_TBL_FLUSH:
5907 case IP_FW_TBL_DESTROY:
5908 error = ipfw_table_flush(sopt);
5909 break;
5911 case IP_FW_TBL_GET:
5912 error = ipfw_table_get(sopt);
5913 break;
5915 case IP_FW_TBL_ZERO:
5916 error = ipfw_table_zero(sopt);
5917 break;
5919 case IP_FW_TBL_EXPIRE:
5920 error = ipfw_table_expire(sopt);
5921 break;
5923 default:
5924 kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
5925 error = EINVAL;
5928 ipfw_crossref_reap();
5929 return error;
5932 static void
5933 ipfw_keepalive_done(struct ipfw_context *ctx)
5936 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5937 ("keepalive is not in progress"));
5938 ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
5939 callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
5940 ipfw_keepalive, NULL);
5943 static void
5944 ipfw_keepalive_more(struct ipfw_context *ctx)
5946 struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
5948 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5949 ("keepalive is not in progress"));
5950 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
5951 ("keepalive more did not finish"));
5952 netisr_sendmsg_oncpu(nm);
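/*
 * Walk the state list in bounded batches.  The caller-supplied 'anchor'
 * marker records our position in the list; whenever one of the
 * scan/expire/keepalive limits is hit, the walk reschedules itself via
 * ipfw_keepalive_more() and resumes right after the anchor on the next
 * pass, so a large state table cannot monopolize the netisr thread.
 */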
5955 static void
5956 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
5958 struct ipfw_state *s;
5959 int scanned = 0, expired = 0, kept = 0;
5961 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5962 ("keepalive is not in progress"));
5964 while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
5965 uint32_t ack_rev, ack_fwd;
5966 struct ipfw_flow_id id;
5968 if (scanned++ >= ipfw_state_scan_max) {
5969 ipfw_keepalive_more(ctx);
5970 return;
5973 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
5974 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
5976 if (s->st_type == O_ANCHOR)
5977 continue;
5979 if (TIME_LEQ(s->st_expire, time_uptime)) {
5980 /* State expired. */
5981 ipfw_state_del(ctx, s);
5982 if (++expired >= ipfw_state_expire_max) {
5983 ipfw_keepalive_more(ctx);
5984 return;
5986 continue;
5990 * Keep alive processing
5993 if (s->st_proto != IPPROTO_TCP)
5994 continue;
5995 if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
5996 continue;
5997 if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
5998 s->st_expire))
5999 continue; /* too early */
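/*
 * Send one keepalive segment in each direction.  The intent (as
 * with RFC 1122 style keepalives) is that an already acknowledged
 * sequence number (ack - 1) makes each endpoint answer with a bare
 * ACK; those ACKs pass back through ipfw_chk() and refresh this
 * state entry.
 */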
6001 ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6002 &id.dst_ip, &id.dst_port);
6003 ack_rev = s->st_ack_rev;
6004 ack_fwd = s->st_ack_fwd;
6006 send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6007 send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6009 if (++kept >= ipfw_keepalive_max) {
6010 ipfw_keepalive_more(ctx);
6011 return;
6014 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6015 ipfw_keepalive_done(ctx);
6018 static void
6019 ipfw_keepalive_more_dispatch(netmsg_t nm)
6021 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6022 struct ipfw_state *anchor;
6024 ASSERT_NETISR_NCPUS(mycpuid);
6025 KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6026 ("keepalive is not in progress"));
6028 /* Reply ASAP */
6029 netisr_replymsg(&nm->base, 0);
6031 anchor = &ctx->ipfw_keepalive_anch;
6032 if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6033 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6034 ipfw_keepalive_done(ctx);
6035 return;
6037 ipfw_keepalive_loop(ctx, anchor);
6041 * Netmsg handler that starts one keepalive scan pass on this CPU;
6042 * it is triggered from the ipfw_keepalive() callout below.
6044 static void
6045 ipfw_keepalive_dispatch(netmsg_t nm)
6047 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6048 struct ipfw_state *anchor;
6050 ASSERT_NETISR_NCPUS(mycpuid);
6051 KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6052 ("keepalive is in progress"));
6053 ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6055 /* Reply ASAP */
6056 crit_enter();
6057 netisr_replymsg(&nm->base, 0);
6058 crit_exit();
6060 if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6061 ipfw_keepalive_done(ctx);
6062 return;
6065 anchor = &ctx->ipfw_keepalive_anch;
6066 TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6067 ipfw_keepalive_loop(ctx, anchor);
6071 * This procedure is only used to handle keepalives. It is invoked
6072 * every dyn_keepalive_period
6074 static void
6075 ipfw_keepalive(void *dummy __unused)
6077 struct netmsg_base *msg;
6079 KKASSERT(mycpuid < netisr_ncpus);
6080 msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6082 crit_enter();
6083 if (msg->lmsg.ms_flags & MSGF_DONE)
6084 netisr_sendmsg_oncpu(msg);
6085 crit_exit();
6088 static void
6089 ipfw_ip_input_dispatch(netmsg_t nmsg)
6091 struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6092 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6093 struct mbuf *m = nm->m;
6094 struct ip_fw *rule = nm->arg1;
6096 ASSERT_NETISR_NCPUS(mycpuid);
6097 KASSERT(rule->cpuid == mycpuid,
6098 ("rule does not belong to cpu%d", mycpuid));
6099 KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6100 ("mbuf does not have ipfw continue rule"));
6102 KASSERT(ctx->ipfw_cont_rule == NULL,
6103 ("pending ipfw continue rule"));
6104 ctx->ipfw_cont_rule = rule;
6105 ip_input(m);
6108 * This rule is no longer used; decrement its cross_refs,
6109 * so this rule can be deleted.
6111 rule->cross_refs--;
6113 /* May not have been cleared if ipfw was unloaded/disabled. */
6114 ctx->ipfw_cont_rule = NULL;
6117 static int
6118 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6120 struct ip_fw_args args;
6121 struct mbuf *m = *m0;
6122 struct m_tag *mtag;
6123 int tee = 0, error = 0, ret, cpuid;
6124 struct netmsg_genpkt *nm;
6126 args.cont = 0;
6127 if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6128 /* Extract info from dummynet tag */
6129 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6130 KKASSERT(mtag != NULL);
6131 args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6132 KKASSERT(args.rule != NULL);
6134 m_tag_delete(m, mtag);
6135 m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6136 } else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6137 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6139 KKASSERT(ctx->ipfw_cont_rule != NULL);
6140 args.rule = ctx->ipfw_cont_rule;
6141 ctx->ipfw_cont_rule = NULL;
6143 args.cont = 1;
6144 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6145 } else {
6146 args.rule = NULL;
6149 args.eh = NULL;
6150 args.oif = NULL;
6151 args.m = m;
6152 ret = ipfw_chk(&args);
6153 m = args.m;
6155 if (m == NULL) {
6156 error = EACCES;
6157 goto back;
6160 switch (ret) {
6161 case IP_FW_PASS:
6162 break;
6164 case IP_FW_DENY:
6165 m_freem(m);
6166 m = NULL;
6167 error = EACCES;
6168 break;
6170 case IP_FW_DUMMYNET:
6171 /* Send packet to the appropriate pipe */
6172 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
6173 break;
6175 case IP_FW_TEE:
6176 tee = 1;
6177 /* FALL THROUGH */
6179 case IP_FW_DIVERT:
6181 * Must clear the bridge tag when the packet changes its path (divert/tee).
6183 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
6184 if (ip_divert_p != NULL) {
6185 m = ip_divert_p(m, tee, 1);
6186 } else {
6187 m_freem(m);
6188 m = NULL;
6189 /* not sure this is the right error msg */
6190 error = EACCES;
6192 break;
6194 case IP_FW_CONTINUE:
6195 KASSERT(m->m_flags & M_HASH, ("no hash"));
6196 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
6197 KASSERT(cpuid != mycpuid,
6198 ("continue on the same cpu%d", cpuid));
6201 * NOTE:
6202 * Bump cross_refs to prevent this rule and its siblings
6203 * from being deleted, while this mbuf is inflight. The
6204 * cross_refs of the sibling rule on the target cpu will
6205 * be decremented, once this mbuf is going to be filtered
6206 * on the target cpu.
6208 args.rule->cross_refs++;
6209 m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6211 nm = &m->m_hdr.mh_genmsg;
6212 netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6213 ipfw_ip_input_dispatch);
6214 nm->m = m;
6215 nm->arg1 = args.rule->cross_rules[cpuid];
6216 netisr_sendmsg(&nm->base, cpuid);
6218 /* This mbuf is dispatched; no longer valid. */
6219 m = NULL;
6220 break;
6222 default:
6223 panic("unknown ipfw return value: %d", ret);
6225 back:
6226 *m0 = m;
6227 return error;
6230 static int
6231 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6233 struct ip_fw_args args;
6234 struct mbuf *m = *m0;
6235 struct m_tag *mtag;
6236 int tee = 0, error = 0, ret;
6238 args.cont = 0;
6239 if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6240 /* Extract info from dummynet tag */
6241 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6242 KKASSERT(mtag != NULL);
6243 args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6244 KKASSERT(args.rule != NULL);
6246 m_tag_delete(m, mtag);
6247 m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6248 } else {
6249 args.rule = NULL;
6252 args.eh = NULL;
6253 args.m = m;
6254 args.oif = ifp;
6255 ret = ipfw_chk(&args);
6256 m = args.m;
6258 if (m == NULL) {
6259 error = EACCES;
6260 goto back;
6263 switch (ret) {
6264 case IP_FW_PASS:
6265 break;
6267 case IP_FW_DENY:
6268 m_freem(m);
6269 m = NULL;
6270 error = EACCES;
6271 break;
6273 case IP_FW_DUMMYNET:
6274 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
6275 break;
6277 case IP_FW_TEE:
6278 tee = 1;
6279 /* FALL THROUGH */
6281 case IP_FW_DIVERT:
6282 if (ip_divert_p != NULL) {
6283 m = ip_divert_p(m, tee, 0);
6284 } else {
6285 m_freem(m);
6286 m = NULL;
6287 /* not sure this is the right error msg */
6288 error = EACCES;
6290 break;
6292 default:
6293 panic("unknown ipfw return value: %d", ret);
6295 back:
6296 *m0 = m;
6297 return error;
6300 static void
6301 ipfw_hook(void)
6303 struct pfil_head *pfh;
6305 ASSERT_NETISR0;
6307 pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
6308 if (pfh == NULL)
6309 return;
6311 pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
6312 pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
6315 static void
6316 ipfw_dehook(void)
6318 struct pfil_head *pfh;
6320 ASSERT_NETISR0;
6322 pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
6323 if (pfh == NULL)
6324 return;
6326 pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
6327 pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
6330 static int
6331 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
6333 int dyn_cnt;
6335 dyn_cnt = ipfw_state_cntcoll();
6336 dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
6338 return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
6341 static int
6342 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
6344 int state_cnt;
6346 state_cnt = ipfw_state_cntcoll();
6347 return (sysctl_handle_int(oidp, &state_cnt, 0, req));
6350 static int
6351 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
6353 int state_max, error;
6355 state_max = ipfw_state_max;
6356 error = sysctl_handle_int(oidp, &state_max, 0, req);
6357 if (error || req->newptr == NULL)
6358 return (error);
6360 if (state_max < 1)
6361 return (EINVAL);
6363 ipfw_state_max_set(state_max);
6364 return (0);
6367 static int
6368 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
6370 int dyn_max, error;
6372 dyn_max = ipfw_state_max + ipfw_track_max;
6374 error = sysctl_handle_int(oidp, &dyn_max, 0, req);
6375 if (error || req->newptr == NULL)
6376 return (error);
6378 if (dyn_max < 2)
6379 return (EINVAL);
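/*
 * The combined dynamic limit historically covers both states and
 * tracks; split the new value evenly between the two.
 */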
6381 ipfw_state_max_set(dyn_max / 2);
6382 ipfw_track_max = dyn_max / 2;
6383 return (0);
6386 static void
6387 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
6389 int enable = nmsg->lmsg.u.ms_result;
6391 ASSERT_NETISR0;
6393 if (fw_enable == enable)
6394 goto reply;
6396 fw_enable = enable;
6397 if (fw_enable)
6398 ipfw_hook();
6399 else
6400 ipfw_dehook();
6401 reply:
6402 netisr_replymsg(&nmsg->base, 0);
6405 static int
6406 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
6408 struct netmsg_base nmsg;
6409 int enable, error;
6411 enable = fw_enable;
6412 error = sysctl_handle_int(oidp, &enable, 0, req);
6413 if (error || req->newptr == NULL)
6414 return error;
6416 netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6417 ipfw_sysctl_enable_dispatch);
6418 nmsg.lmsg.u.ms_result = enable;
6420 return netisr_domsg(&nmsg, 0);
6423 static int
6424 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
6426 return sysctl_int_range(oidp, arg1, arg2, req,
6427 IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
6430 static int
6431 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
6434 return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
6437 static int
6438 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
6440 u_long stat = 0;
6441 int cpu, error;
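/*
 * arg2 is the byte offset of a per-CPU u_long statistic within
 * struct ipfw_context; sum it across all netisr CPUs and, if a new
 * value was written, zero each per-CPU counter.
 */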
6443 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6444 stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
6446 error = sysctl_handle_long(oidp, &stat, 0, req);
6447 if (error || req->newptr == NULL)
6448 return (error);
6450 /* Zero out this stat. */
6451 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6452 *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
6453 return (0);
6456 static void
6457 ipfw_ctx_init_dispatch(netmsg_t nmsg)
6459 struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
6460 struct ipfw_context *ctx;
6461 struct ip_fw *def_rule;
6463 ASSERT_NETISR_NCPUS(mycpuid);
6465 ctx = kmalloc(__offsetof(struct ipfw_context,
6466 ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
6468 RB_INIT(&ctx->ipfw_state_tree);
6469 TAILQ_INIT(&ctx->ipfw_state_list);
6471 RB_INIT(&ctx->ipfw_track_tree);
6472 TAILQ_INIT(&ctx->ipfw_track_list);
6474 callout_init_mp(&ctx->ipfw_stateto_ch);
6475 netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
6476 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
6477 ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
6478 netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
6479 MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
6481 callout_init_mp(&ctx->ipfw_trackto_ch);
6482 netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
6483 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
6484 netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
6485 MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
6487 callout_init_mp(&ctx->ipfw_keepalive_ch);
6488 netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
6489 MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
6490 ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
6491 netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
6492 MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
6494 ipfw_ctx[mycpuid] = ctx;
6496 def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
6498 def_rule->act_ofs = 0;
6499 def_rule->rulenum = IPFW_DEFAULT_RULE;
6500 def_rule->cmd_len = 1;
6501 def_rule->set = IPFW_DEFAULT_SET;
6503 def_rule->cmd[0].len = 1;
6504 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
6505 def_rule->cmd[0].opcode = O_ACCEPT;
6506 #else
6507 if (filters_default_to_accept)
6508 def_rule->cmd[0].opcode = O_ACCEPT;
6509 else
6510 def_rule->cmd[0].opcode = O_DENY;
6511 #endif
6513 def_rule->refcnt = 1;
6514 def_rule->cpuid = mycpuid;
6516 /* Install the default rule */
6517 ctx->ipfw_default_rule = def_rule;
6518 ctx->ipfw_layer3_chain = def_rule;
6520 /* Link rule CPU sibling */
6521 ipfw_link_sibling(fwmsg, def_rule);
6523 /* Statistics only need to be updated once */
6524 if (mycpuid == 0)
6525 ipfw_inc_static_count(def_rule);
6527 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6530 static void
6531 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
6534 crit_enter();
6535 /* Reply ASAP */
6536 netisr_replymsg(&nmsg->base, 0);
6537 crit_exit();
6538 ipfw_crossref_reap();
6541 static void
6542 ipfw_crossref_timeo(void *dummy __unused)
6544 struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
6546 KKASSERT(mycpuid == 0);
6548 crit_enter();
6549 if (msg->lmsg.ms_flags & MSGF_DONE)
6550 netisr_sendmsg_oncpu(msg);
6551 crit_exit();
6554 static void
6555 ipfw_ifaddr_dispatch(netmsg_t nmsg)
6557 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6558 struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
6559 struct ip_fw *f;
6561 ASSERT_NETISR_NCPUS(mycpuid);
6563 for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
6564 int l, cmdlen;
6565 ipfw_insn *cmd;
6567 if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
6568 continue;
6570 for (l = f->cmd_len, cmd = f->cmd; l > 0;
6571 l -= cmdlen, cmd += cmdlen) {
6572 cmdlen = F_LEN(cmd);
6573 if (cmd->opcode == O_IP_SRC_IFIP ||
6574 cmd->opcode == O_IP_DST_IFIP) {
6575 if (strncmp(ifp->if_xname,
6576 ((ipfw_insn_ifip *)cmd)->ifname,
6577 IFNAMSIZ) == 0)
6578 cmd->arg1 &= ~IPFW_IFIP_VALID;
6582 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6585 static void
6586 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
6587 enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
6589 struct netmsg_base nm;
6591 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6592 ipfw_ifaddr_dispatch);
6593 nm.lmsg.u.ms_resultp = ifp;
6594 netisr_domsg_global(&nm);
6595 }
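/*
 * Module initialization, serialized by netisr0: clamp ipfw_table_max,
 * set up the global track tree and the crossref GC, replicate the
 * per-CPU contexts and default rule to all netisr CPUs, install the
 * ip_fw function pointers, start the per-CPU expire and keepalive
 * callouts, hook the firewall if fw_enable is set, and register the
 * ifaddr event handler.
 */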
6597 static void
6598 ipfw_init_dispatch(netmsg_t nmsg)
6599 {
6600 struct netmsg_ipfw fwmsg;
6601 int error = 0, cpu;
6603 ASSERT_NETISR0;
6605 if (IPFW_LOADED) {
6606 kprintf("IP firewall already loaded\n");
6607 error = EEXIST;
6608 goto reply;
6609 }
6611 if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
6612 ipfw_table_max = UINT16_MAX;
6614 /* Initialize global track tree. */
6615 RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
6616 IPFW_TRKCNT_TOKINIT;
6618 /* GC for freed crossref rules. */
6619 callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
6620 netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
6621 MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);
6623 ipfw_state_max_set(ipfw_state_max);
6624 ipfw_state_headroom = 8 * netisr_ncpus;
6626 bzero(&fwmsg, sizeof(fwmsg));
6627 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6628 ipfw_ctx_init_dispatch);
6629 netisr_domsg_global(&fwmsg.base);
6631 ip_fw_chk_ptr = ipfw_chk;
6632 ip_fw_ctl_ptr = ipfw_ctl;
6633 ip_fw_dn_io_ptr = ipfw_dummynet_io;
6635 kprintf("ipfw2 initialized, default to %s, logging ",
6636 ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
6637 O_ACCEPT ? "accept" : "deny");
6639 #ifdef IPFIREWALL_VERBOSE
6640 fw_verbose = 1;
6641 #endif
6642 #ifdef IPFIREWALL_VERBOSE_LIMIT
6643 verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
6644 #endif
6645 if (fw_verbose == 0) {
6646 kprintf("disabled\n");
6647 } else if (verbose_limit == 0) {
6648 kprintf("unlimited\n");
6649 } else {
6650 kprintf("limited to %d packets/entry by default\n",
6651 verbose_limit);
6652 }
6654 ip_fw_loaded = 1;
6655 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
6656 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
6657 ipfw_state_expire_ipifunc, NULL, cpu);
6658 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
6659 ipfw_track_expire_ipifunc, NULL, cpu);
6660 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
6661 ipfw_keepalive, NULL, cpu);
6662 }
6664 if (fw_enable)
6665 ipfw_hook();
6667 ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
6668 NULL, EVENTHANDLER_PRI_ANY);
6669 if (ipfw_ifaddr_event == NULL)
6670 kprintf("ipfw: ifaddr_event register failed\n");
6672 reply:
6673 netisr_replymsg(&nmsg->base, error);
6674 }
6676 static int
6677 ipfw_init(void)
6678 {
6679 struct netmsg_base smsg;
6681 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6682 ipfw_init_dispatch);
6683 return netisr_domsg(&smsg, 0);
6684 }
6686 #ifdef KLD_MODULE
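/*
 * Per-CPU teardown: stop the expire and keepalive callouts, drop the
 * pending dropable netmsgs and flush this CPU's lookup tables.
 */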
6688 static void
6689 ipfw_ctx_fini_dispatch(netmsg_t nmsg)
6690 {
6691 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6693 ASSERT_NETISR_NCPUS(mycpuid);
6695 callout_stop_sync(&ctx->ipfw_stateto_ch);
6696 callout_stop_sync(&ctx->ipfw_trackto_ch);
6697 callout_stop_sync(&ctx->ipfw_keepalive_ch);
6699 crit_enter();
6700 netisr_dropmsg(&ctx->ipfw_stateexp_more);
6701 netisr_dropmsg(&ctx->ipfw_stateexp_nm);
6702 netisr_dropmsg(&ctx->ipfw_trackexp_more);
6703 netisr_dropmsg(&ctx->ipfw_trackexp_nm);
6704 netisr_dropmsg(&ctx->ipfw_keepalive_more);
6705 netisr_dropmsg(&ctx->ipfw_keepalive_nm);
6706 crit_exit();
6708 ipfw_table_flushall_oncpu(ctx, 1);
6710 netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6711 }
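/*
 * Module unload, serialized by netisr0: bail out with EBUSY while
 * ipfw_gd.ipfw_refcnt is still non-zero; otherwise unhook the firewall,
 * wait for inflight IPIs, tear down the per-CPU contexts, deregister
 * the ifaddr event handler, clear the ip_fw function pointers, flush
 * all rules and free the per-CPU contexts.
 */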
6713 static void
6714 ipfw_fini_dispatch(netmsg_t nmsg)
6715 {
6716 struct netmsg_base nm;
6717 int error = 0, cpu;
6719 ASSERT_NETISR0;
6721 ipfw_crossref_reap();
6723 if (ipfw_gd.ipfw_refcnt != 0) {
6724 error = EBUSY;
6725 goto reply;
6726 }
6728 ip_fw_loaded = 0;
6729 ipfw_dehook();
6731 /* Synchronize any inflight state/track expire IPIs. */
6732 lwkt_synchronize_ipiqs("ipfwfini");
6734 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6735 ipfw_ctx_fini_dispatch);
6736 netisr_domsg_global(&nm);
6738 callout_stop_sync(&ipfw_gd.ipfw_crossref_ch);
6739 crit_enter();
6740 netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
6741 crit_exit();
6743 if (ipfw_ifaddr_event != NULL)
6744 EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);
6746 ip_fw_chk_ptr = NULL;
6747 ip_fw_ctl_ptr = NULL;
6748 ip_fw_dn_io_ptr = NULL;
6749 ipfw_flush(1 /* kill default rule */);
6751 /* Free the per-CPU contexts */
6752 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6753 kfree(ipfw_ctx[cpu], M_IPFW);
6755 kprintf("IP firewall unloaded\n");
6756 reply:
6757 netisr_replymsg(&nmsg->base, error);
6758 }
6760 static int
6761 ipfw_fini(void)
6762 {
6763 struct netmsg_base smsg;
6765 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6766 ipfw_fini_dispatch);
6767 return netisr_domsg(&smsg, 0);
6768 }
6770 #endif /* KLD_MODULE */
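/*
 * Module event handler: initialize on MOD_LOAD; MOD_UNLOAD is only
 * honored when ipfw is built as a KLD.
 */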
6772 static int
6773 ipfw_modevent(module_t mod, int type, void *unused)
6774 {
6775 int err = 0;
6777 switch (type) {
6778 case MOD_LOAD:
6779 err = ipfw_init();
6780 break;
6782 case MOD_UNLOAD:
6783 #ifndef KLD_MODULE
6784 kprintf("ipfw statically compiled, cannot unload\n");
6785 err = EBUSY;
6786 #else
6787 err = ipfw_fini();
6788 #endif
6789 break;
6790 default:
6791 break;
6792 }
6793 return err;
6794 }
6796 static moduledata_t ipfwmod = {
6797 "ipfw",
6798 ipfw_modevent,
6800 };
6801 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
6802 MODULE_VERSION(ipfw, 1);